In [1]:
# Standard library
import os
import warnings

# Data manipulation
import numpy as np
import pandas as pd

# Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# Evaluation of the model
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
warnings.filterwarnings('ignore')

In [2]:
# Load the prepared training table and split it into features and label.
# NOTE(review): the path is absolute and machine-specific — confirm it before re-running elsewhere.
df = pd.read_csv('/home/Ethan_google/final_dataset_train.csv')
# Features: every column except the label and SK_ID_CURR (presumably a row identifier — verify).
x_train = df.drop(columns=['TARGET', 'SK_ID_CURR'])
# Label column
y_train = df['TARGET']

In [4]:
# Report the dimensions of the feature matrix and of the label vector, one per line.
print('x_train.shape: {}'.format(x_train.shape),
      'y_train.shape: {}'.format(y_train.shape),
      sep='\n')
# The raw frame is no longer referenced once features and label are split out.
del df

x_train.shape: (307511, 364)
y_train.shape: (307511,)


In [6]:
%%time
# Baseline: an untuned 500-tree random forest scored with 5-fold
# cross-validated ROC AUC.
# random_state is fixed so that re-running the cell reproduces the same score;
# without it every run grows a different forest.
clr_rf = RandomForestClassifier(n_estimators=500, n_jobs=-1, verbose=1, random_state=42)
roc_auc_scores = cross_val_score(clr_rf, x_train, y_train, cv=5, scoring='roc_auc')
# The spread is reported as two standard deviations across the folds.
print('5 fold CV ROC_AUC is %0.2f (+/- %0.2f)' % (roc_auc_scores.mean(), roc_auc_scores.std() * 2))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   36.0s
[Parallel(n_jobs=-1)]: Done 370 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.2min finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 120 tasks      | elapsed:    0.3s
[Parallel(n_jobs=40)]: Done 370 tasks      | elapsed:    0.7s
[Parallel(n_jobs=40)]: Done 500 out of 500 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 40 concurrent workers.
[Parallel(n_jobs=-1)]: Done 120 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done 370 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.1min finished
[Parallel(n_jobs=40)]: Using backend ThreadingBackend with 40 concurrent workers.
[Parallel(n_jobs=40)]: Done 120 tasks      | elapsed:    0.3s
[Parallel(n_jobs=40)]: Done 370 tas

5 fold CV ROC_AUC is 0.73 (+/- 0.01)
CPU times: user 2min 40s, sys: 13.2 s, total: 2min 54s
Wall time: 11min


### Objective function

In [7]:
import csv
from hyperopt import STATUS_OK
from timeit import default_timer as timer

In [17]:
def objective(hyperparameters):
    """Hyperopt objective: cross-validated ROC AUC of a random forest.

    Builds a RandomForestClassifier from ``hyperparameters``, scores it with
    N_FOLDS-fold cross-validation on the module-level ``x_train`` / ``y_train``,
    appends one result row to OUT_FILE and returns the result dictionary.

    Module-level names used: ITERATION (incremented on every call), N_FOLDS,
    OUT_FILE, x_train, y_train.

    Parameters
    ----------
    hyperparameters : dict
        Keyword arguments forwarded to RandomForestClassifier.

    Returns
    -------
    dict
        'loss' (1 - mean ROC AUC), 'hyperparameters', 'iteration',
        'train_time' (seconds spent in cross-validation) and 'status'.
    """
    global ITERATION

    ITERATION += 1

    start = timer()
    clf = RandomForestClassifier(**hyperparameters, n_jobs=-1)
    # Perform n_fold cross validation
    roc_auc_scores = cross_val_score(clf, x_train, y_train, cv=N_FOLDS, scoring='roc_auc')
    run_time = timer() - start

    # Average the per-fold scores
    score = roc_auc_scores.mean()

    # Loss must be minimized
    loss = 1 - score

    # Append the result to the csv log. The with-block closes the file even if
    # writing fails, and newline='' is the mode the csv module expects.
    with open(OUT_FILE, 'a', newline='') as of_connection:
        writer = csv.writer(of_connection)
        writer.writerow([loss, hyperparameters, ITERATION, run_time, score])

    return {'loss': loss, 'hyperparameters': hyperparameters, 'iteration': ITERATION,
            'train_time': run_time, 'status': STATUS_OK}


### Specify the domain for hyperparameters

In [9]:
from hyperopt import hp
from hyperopt.pyll.stochastic import sample

In [10]:
# Search space for the random forest hyperparameters. Every dimension is a
# categorical choice; np.arange ranges exclude their upper bound.
space = {
        'bootstrap': hp.choice('bootstrap', [True, False]),
        'max_depth': hp.choice('max_depth', np.arange(1, 100, dtype=int)),
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
        # deprecated and rejected by recent scikit-learn releases.
        'max_features': hp.choice('max_features', ['sqrt', 'log2']),
        'min_samples_leaf': hp.choice('min_samples_leaf', np.arange(1, 12, dtype=int)),
        # An integer min_samples_split must be at least 2, so the range starts
        # at 2; a sampled value of 1 would make the classifier raise.
        'min_samples_split': hp.choice('min_samples_split', np.arange(2, 12, dtype=int)),
        'n_estimators': hp.choice('n_estimators', np.arange(50, 1000, dtype=int))
        }

In [11]:
# Draw one random configuration from the search space as a sanity check;
# the bare expression on the last line displays it as the cell output.
x = sample(space)
x

{'bootstrap': True,
 'max_depth': 28,
 'max_features': 'log2',
 'min_samples_leaf': 2,
 'min_samples_split': 11,
 'n_estimators': 61}

### Optimization Algorithm

In [12]:
from hyperopt import tpe

# Alias for hyperopt's TPE suggestion function (the search algorithm).
# NOTE(review): the fmin call later in the notebook passes tpe.suggest directly
# rather than this alias.
tpe_algorithm = tpe.suggest

In [13]:
from hyperopt import Trials

# Trials object handed to fmin to record the results of the search
trials = Trials()

In [14]:
# Create (or truncate) the results log and write its header row.
OUT_FILE = 'Random_Forest/Automated_Tuning.csv'

# Make sure the target directory exists; open() does not create it.
out_dir = os.path.dirname(OUT_FILE)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Column names, in the same order as the rows appended by objective()
headers = ['loss', 'hyperparameters', 'iteration', 'runtime', 'score']

# newline='' is the mode the csv module expects; the with-block closes the
# file even if writing the header fails.
with open(OUT_FILE, 'w', newline='') as of_connection:
    writer = csv.writer(of_connection)
    writer.writerow(headers)

In [15]:
from hyperopt import fmin

In [18]:
# Search settings: cross-validation folds per evaluation and the evaluation budget
N_FOLDS, MAX_EVALS = 5, 50

# Evaluation counter incremented by objective(); reset before the search starts
ITERATION = 0

# Minimise the objective over the search space with TPE, recording every
# evaluation in `trials`; `best` holds whatever fmin returns.
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    trials=trials,
    max_evals=MAX_EVALS,
)

  2%|▏         | 1/50 [23:40<19:20:14, 1420.70s/it, best loss: 0.2547769743024013]


KeyboardInterrupt: 