### Import dependencies

In [1]:
import numpy as np 
import pandas as pd

from sklearn.datasets import make_blobs
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

### Read dataset

In [2]:
data = pd.read_csv('../reduced.csv')
data.drop('Index', axis=1, inplace=True)
data.head()

x = data.iloc[:, :-1]
y = data.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=1)
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.25, random_state=1) 

### Model

In [3]:
model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']

### Define Grid

In [4]:
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(x, y)

### Summary

In [5]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.512175 using {'max_features': 'sqrt', 'n_estimators': 1000}
0.463587 (0.009350) with: {'max_features': 'sqrt', 'n_estimators': 10}
0.506595 (0.008241) with: {'max_features': 'sqrt', 'n_estimators': 100}
0.512175 (0.008441) with: {'max_features': 'sqrt', 'n_estimators': 1000}
0.459382 (0.007854) with: {'max_features': 'log2', 'n_estimators': 10}
0.506103 (0.008639) with: {'max_features': 'log2', 'n_estimators': 100}
0.511416 (0.009201) with: {'max_features': 'log2', 'n_estimators': 1000}


In [6]:
import pickle
with open('./models/randomForest.pickle', 'wb') as f:
    pickle.dump(grid_result, f)