In [3]:
import numpy as np
import pandas as pd
import sklearn as sk
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

from sklearn.grid_search import GridSearchCV

def find_index(value, li):
    """Return the position of the first element of ``li`` equal to ``value``.

    Elements are read by integer position (``li[0]`` up to ``li[len(li) - 1]``)
    and compared with ``==``. Returns -1 when no element matches, including
    when ``li`` is empty.
    """
    hits = (pos for pos in range(len(li)) if li[pos] == value)
    return next(hits, -1)


def find_best_value_for_parameter(X, y, other_parameter_values,
                                  parameter_name,
                                  first_level_values,
                                  second_level_values):
    """Pick a value for one RandomForestClassifier parameter in two passes.

    A first grid search runs over ``first_level_values``. The position of the
    winning value in that list is then used to look up a second list of
    candidates in ``second_level_values``, and a second grid search runs over
    those. Both searches use 5-fold cross-validation scored with 'roc_auc'.

    Parameters
    ----------
    X, y : training features and targets, passed unchanged to ``GridSearchCV.fit``.
    other_parameter_values : dict of estimator parameters that stay fixed
        during the search (applied with ``set_params``).
    parameter_name : name of the estimator parameter being tuned.
    first_level_values : sequence of coarse candidates.
    second_level_values : mapping (or sequence) indexed by the position of a
        coarse candidate, giving the refined candidates for that position.

    Returns
    -------
    The best value found by the second search. The coarse winner is returned
    instead when it cannot be located in ``first_level_values``, when
    ``second_level_values`` has no entry for its position, or when that entry
    is empty.
    """
    clf = RandomForestClassifier()
    clf.set_params(**other_parameter_values)

    def _search(candidates):
        # One 5-fold, ROC-AUC-scored grid search over ``parameter_name`` only.
        grid_search = GridSearchCV(estimator=clf,
                                   param_grid={parameter_name: candidates},
                                   scoring='roc_auc', cv=5, verbose=100)
        grid_search.fit(X, y)
        return grid_search.best_params_[parameter_name]

    coarse_best = _search(first_level_values)
    ind = find_index(coarse_best, first_level_values)
    if ind == -1:
        return coarse_best

    # Callers may supply fewer refinement entries than coarse candidates;
    # previously that raised KeyError/IndexError after the first search had
    # already completed. Keep the coarse winner in that case.
    try:
        refined_candidates = second_level_values[ind]
    except (KeyError, IndexError):
        return coarse_best

    if len(refined_candidates) == 0:
        return coarse_best
    if len(refined_candidates) == 1:
        # A one-candidate search can only return that candidate, so skip the
        # extra round of cross-validation fits.
        return refined_candidates[0]
    return _search(refined_candidates)
    

if __name__ == "__main__":
    # Load labeled data: column 0 is taken as the target, columns 1-8 as features.
    train_df = pd.read_csv('train.csv')
    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # .values gives the same ndarray on both old and new versions.
    train_data = train_df.values
    y = train_data[:, 0]
    X = train_data[:, 1:9]
    

In [5]:
    # No parameters are fixed yet, so the estimator runs with its defaults
    # while max_features is tuned.
    other_parameter_values = {}
    parameter_name = 'max_features'
    # Every coarse candidate maps to itself: the second pass adds no new values.
    other_parameter_values[parameter_name] = find_best_value_for_parameter(
        X, y, other_parameter_values, parameter_name,
        list(range(1, 9)),
        {index: [candidate] for index, candidate in enumerate(range(1, 9))})
    

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=0.787366 -   0.2s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    0.2s
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=0.827487 -   0.2s
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:    0.4s
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=0.801067 -   0.2s
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed:    0.6s
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=0.789378 -   0.2s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    0.8s
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=

In [6]:
    # Show the value stored for the current parameter_name before moving on.
    # The call form of print behaves the same on Python 2 and Python 3.
    print(other_parameter_values[parameter_name])
    parameter_name = 'n_estimators'
    # Coarse pass over 50/100/200, then a finer sweep starting at the winner.
    other_parameter_values[parameter_name] = find_best_value_for_parameter(
        X, y, other_parameter_values, parameter_name,
        [50, 100, 200],
        {0: [50, 60, 70, 80, 90],
         1: [100, 120, 140, 160, 180],
         2: [200, 240, 280, 320, 360]})

2
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=50, score=0.833586 -   1.4s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    1.4s
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=50, score=0.855886 -   1.4s
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:    2.8s
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=50, score=0.839638 -   1.3s
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed:    4.0s
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=50, score=0.832788 -   1.3s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:    5.3s
[CV] n_estimators=50 .................................................
[CV] ........................ n_estimators=50, scor

In [9]:
    # Show the value stored for the current parameter_name before moving on.
    # The call form of print behaves the same on Python 2 and Python 3.
    print(other_parameter_values[parameter_name])
    parameter_name = 'min_samples_leaf'
    # Coarse pass over powers of two, then a finer sweep starting at the winner.
    # Each refinement list begins with its own coarse value (the 64 row
    # previously started at 65), and every coarse index has an entry (index 8
    # for 256 was missing, which raised KeyError if 256 won the first pass).
    other_parameter_values[parameter_name] = find_best_value_for_parameter(
        X, y, other_parameter_values, parameter_name,
        [1, 2, 4, 8, 16, 32, 64, 128, 256],
        {0: [1],
         1: [2, 3],
         2: [4, 5, 6, 7],
         3: [8, 10, 12, 14],
         4: [16, 20, 24, 28],
         5: [32, 40, 48, 56],
         6: [64, 80, 96, 112],
         7: [128, 160, 192, 224],
         8: [256]})
  

240
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] min_samples_leaf=1 ..............................................
[CV] ..................... min_samples_leaf=1, score=0.847022 -   6.4s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    6.4s
[CV] min_samples_leaf=1 ..............................................
[CV] ..................... min_samples_leaf=1, score=0.870919 -   6.3s
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:   12.7s
[CV] min_samples_leaf=1 ..............................................
[CV] ..................... min_samples_leaf=1, score=0.848872 -   6.3s
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed:   19.0s
[CV] min_samples_leaf=1 ..............................................
[CV] ..................... min_samples_leaf=1, score=0.851803 -   6.7s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   25.8s
[CV] min_samples_leaf=1 ..............................................
[CV] ..................... min_samples_leaf=1, sc

KeyboardInterrupt: 

In [20]:
# Restart from hard-coded n_estimators / min_samples_leaf and re-tune max_features.
other_parameter_values = {'n_estimators': 240, 'min_samples_leaf': 2}
parameter_name = 'max_features'
# Every coarse candidate maps to itself: the second pass adds no new values.
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    list(range(1, 9)),
    {index: [candidate] for index, candidate in enumerate(range(1, 9))})

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=0.854603 -   4.1s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    4.2s
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=0.875140 -   4.2s
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:    8.3s
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=0.866091 -   4.2s
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed:   12.5s
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=0.860304 -   4.0s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   16.5s
[CV] max_features=1 ..................................................
[CV] ......................... max_features=1, score=

In [None]:
# Tune max_depth with the parameters already stored in other_parameter_values
# held fixed. Each coarse candidate maps to itself in the second pass.
parameter_name = 'max_depth'
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    [9, 10, 11, 12, 13, 14, 15, 23, 24, 25],
    {0: [9], 1: [10], 2: [11], 3: [12], 4: [13],
     5: [14], 6: [15], 7: [23], 8: [24], 9: [25]})

In [None]:
# Re-tune max_features with the parameters already stored in
# other_parameter_values held fixed.
parameter_name = 'max_features'
# Every coarse candidate maps to itself: the second pass adds no new values.
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    list(range(1, 9)),
    {index: [candidate] for index, candidate in enumerate(range(1, 9))})

In [38]:
    # Re-tune n_estimators over a wider range; each coarse candidate maps to
    # itself in the second pass.
    parameter_name = 'n_estimators'
    other_parameter_values[parameter_name] = find_best_value_for_parameter(
        X, y, other_parameter_values, parameter_name,
        [200, 400, 600, 800, 1000],
        {0: [200], 1: [400], 2: [600], 3: [800], 4: [1000]})

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=200, score=0.855717 -   8.4s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:    8.4s
[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=200, score=0.875750 -   8.8s
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:   17.2s
[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=200, score=0.858989 -   8.2s
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed:   25.4s
[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=200, score=0.852297 -   9.3s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   34.7s
[CV] n_estimators=200 ................................................
[CV] ....................... n_estimators=200, score=

In [None]:
# Fix the previously chosen parameters and tune min_samples_split.
other_parameter_values = {'n_estimators': 270, 'max_features': 4,
                          'max_depth': 23, 'min_samples_leaf': 2}
parameter_name = 'min_samples_split'
# Coarse pass over powers of two, then a finer sweep starting at the winner.
# Each refinement list begins with its own coarse value (the 64 row previously
# started at 65), and every coarse index has an entry (index 8 for 256 was
# missing, which raised KeyError if 256 won the first pass).
# NOTE(review): the candidate 1 is kept as in the original run; newer
# scikit-learn releases may reject min_samples_split=1 -- confirm before upgrading.
other_parameter_values[parameter_name] = find_best_value_for_parameter(
    X, y, other_parameter_values, parameter_name,
    [1, 2, 4, 8, 16, 32, 64, 128, 256],
    {0: [1],
     1: [2, 3],
     2: [4, 5, 6, 7],
     3: [8, 10, 12, 14],
     4: [16, 20, 24, 28],
     5: [32, 40, 48, 56],
     6: [64, 80, 96, 112],
     7: [128, 160, 192, 224],
     8: [256]})
  

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] min_samples_split=1 .............................................
[CV] .................... min_samples_split=1, score=0.856079 -  11.2s
[Parallel(n_jobs=1)]: Done   1 tasks       | elapsed:   11.2s
[CV] min_samples_split=1 .............................................
[CV] .................... min_samples_split=1, score=0.873649 -  12.0s
[Parallel(n_jobs=1)]: Done   2 tasks       | elapsed:   23.2s
[CV] min_samples_split=1 .............................................
[CV] .................... min_samples_split=1, score=0.863304 -  10.9s
[Parallel(n_jobs=1)]: Done   3 tasks       | elapsed:   34.1s
[CV] min_samples_split=1 .............................................
[CV] .................... min_samples_split=1, score=0.857005 -  12.1s
[Parallel(n_jobs=1)]: Done   4 tasks       | elapsed:   46.2s
[CV] min_samples_split=1 .............................................
[CV] .................... min_samples_split=1, score=

In [4]:
# Final hyper-parameters, written out literally in this cell.
other_parameter_values = {
    'n_estimators': 270,
    'max_features': 4,
    'max_depth': 23,
    'min_samples_leaf': 2,
    'min_samples_split': 8,
}
# set_params returns the estimator itself, so construction and configuration chain.
forest = RandomForestClassifier().set_params(**other_parameter_values)
# Fit on the full labeled set; as the last expression, the fitted estimator is
# displayed as the cell output.
forest.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=23, max_features=4, max_leaf_nodes=None,
            min_samples_leaf=2, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=270, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [5]:
    # Load unlabeled data: column 0 is taken as the row id, columns 1-8 as features.
    test_df = pd.read_csv('test.csv')
    # DataFrame.as_matrix() was deprecated and later removed from pandas;
    # .values gives the same ndarray on both old and new versions.
    test_data = test_df.values
    id_test = test_data[:, 0]
    X_test = test_data[:, 1:9]

In [6]:
    
    # Score the unlabeled rows and write the second probability column,
    # keyed by id, to the submission file.
    y_test_pred = forest.predict_proba(X_test)
    ans = pd.DataFrame({
        'Id': id_test,
        'Action': y_test_pred[:, 1],
    })
    ans.to_csv('hw3rf.csv', columns=['Id', 'Action'], index=False)