# Random Forest with Feature selection comparison

In [1]:

from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from pickle import dump
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load training sets
X_train_scaled = pd.read_csv("test_train_data/X_train_scaled.csv")
X_train_scaled_sk = pd.read_csv("test_train_data/X_train_scaled_sk.csv")
X_train_scaled_cs = pd.read_csv("test_train_data/X_train_scaled_cs.csv")
encoded_y_train = pd.read_csv("test_train_data/encoded_y_train.csv")

In [3]:
# Load testing sets
X_test_scaled = pd.read_csv("test_train_data/X_test_scaled.csv")
X_test_scaled_sk = pd.read_csv("test_train_data/X_test_scaled_sk.csv")
X_test_scaled_cs = pd.read_csv("test_train_data/X_test_scaled_cs.csv")
encoded_y_test = pd.read_csv("test_train_data/encoded_y_test.csv")

In [4]:
# Code for Random Forest for all features


rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train_scaled, np.ravel(encoded_y_train))
rf.score(X_test_scaled, np.ravel(encoded_y_test))

0.971830985915493

In [5]:
# Code for Random Forest with SelectKBest features

rf_sk = RandomForestClassifier(n_estimators=200)
rf_sk = rf_sk.fit(X_train_scaled_sk, np.ravel(encoded_y_train))
rf_sk.score(X_test_scaled_sk, np.ravel(encoded_y_test))

0.9577464788732394

In [6]:
# Code for Random Forest with Correlation based selection

rf_cs = RandomForestClassifier(n_estimators=200)
rf_cs = rf_cs.fit(X_train_scaled_cs, np.ravel(encoded_y_train))
rf_cs.score(X_test_scaled_cs, np.ravel(encoded_y_test))

0.9577464788732394

In [7]:
# Get parameters
from pprint import pprint
pprint(rf.get_params())

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


In [8]:
# Adjust # of trees

rf = RandomForestClassifier(n_estimators=10)
rf = rf.fit(X_train_scaled, np.ravel(encoded_y_train))
rf.score(X_test_scaled, np.ravel(encoded_y_test))

0.9647887323943662

In [9]:
# Adjust # of max features for node splitting

rf = RandomForestClassifier(n_estimators=10, max_features = 'sqrt')
rf = rf.fit(X_train_scaled, np.ravel(encoded_y_train))
rf.score(X_test_scaled, np.ravel(encoded_y_test))

0.9647887323943662

### Better to go with a Grid Search Estimator on best feature selection - Very Time Consuming (~1min runtime)

In [10]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

grid = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 3)

In [11]:
grid.fit(X_train_scaled, np.ravel(encoded_y_train))

Fitting 3 folds for each of 288 candidates, totalling 864 fits


GridSearchCV(cv=3,
             estimator=RandomForestClassifier(max_features='sqrt',
                                              n_estimators=10),
             n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=3)

In [12]:
print(grid.best_params_)

{'bootstrap': True, 'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 5, 'min_samples_split': 8, 'n_estimators': 100}


In [13]:
predictions = grid.predict(X_test_scaled)

In [14]:
print('Test Acc: %.3f' % grid.score(X_test_scaled, np.ravel(encoded_y_test)))

Test Acc: 0.979


In [15]:
dump(rf, open('pickles/randomforest_trained_all.pkl', 'wb'))
dump(rf_sk, open('pickles/randomforest_trained_sk.pkl', 'wb'))
dump(rf_cs, open('pickles/randomforest_trained_cs.pkl', 'wb'))