In [1]:
%run data_getter_and_processor.ipynb

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
# Obtain Training and testing data.
data_train_x, data_test_x, train_y, test_y = get_split_train_data(random_state=10)

In [2]:
train_x, test_x = data_train_x.iloc[:,1:], data_test_x.iloc[:,1:]

In [3]:
from sklearn.model_selection import GridSearchCV
# Random Forest without fizing the the class imbalance.
# Apply gridSearch to search best Random Forest model for imbalanced data for worst
worst_stress_levels = train_y.loc[:,"worst_stress_level"]

param_grid = [
    {'n_estimators': [5, 7, 10, 15, 20, 30, 40, 50],
    'max_features' : [3 ,5 ,8],
    'criterion' : ["gini", "entropy"]}
 ]

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring="accuracy", n_jobs=-1)
clf.fit(train_x, worst_stress_levels)
print(clf.best_params_, clf.best_score_, clf.best_estimator_)

# predicting only worst stress levels.
neigh = clf.estimator
neigh.fit(train_x, worst_stress_levels)
pred_worst_stress_levels = neigh.predict(test_x)

score = accuracy_score(test_y.loc[:,"worst_stress_level"], pred_worst_stress_levels, normalize=True)
f1 = f1_score(test_y.iloc[:,0], pred_worst_stress_levels, average=None)

print("Worst stress levels accuracy is "+ str(score * 100) + " %")
print("Worst stress levels f_1 score ", f1)
print("predicted values", pred_worst_stress_levels)

{'criterion': 'gini', 'max_features': 5, 'n_estimators': 50} 0.516351118761 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Worst stress levels accuracy is 49.3975903614 %
Worst stress levels f_1 score  [ 0.4         0.46153846  0.62318841  0.09836066  0.        ]
predicted values [2 2 1 3 0 2 2 2 3 2 3 2 1 2 2 2 1 1 2 2 3 2 2 2 2 2 2 2 2 2 2 3 1 2 2 2 2
 0 2 3 0 1 1 3 2 2 2 1 2 1 0 0 2 0 1 2 2 2 2 2 2 2 3 2 2 1 0 2 0 2 2 2 2 2
 2 2 2 2 2 0 1 2 0 2 0 2 2 0 1 2 2 0 2 2 2 0 3 2 0 2 2 3 2 2 2 1 2 2 2 2 0
 2 2 2 2 2 2 2 0 2 2 0 2 2 2 1 2 2 1 2 0 2 2 0 3 2 0 0 2 2 2 1 2 2 3 2 1 2
 2 2 2 2 2 2 2 0 2 2 2 2 0 0 2 3 2 2 0 2 2 2 2 3 1

  'precision', 'predicted', average, warn_for)


In [4]:
# Same Model as above but with fixed Imbalanced classes.
from sklearn.model_selection import GridSearchCV

# Apply gridSearch to search best Random Forest model for Best Stress levels.
worst_stress_levels = train_y.loc[:,"worst_stress_level"]
balanced_train_x, worst_stress_levels = balance_data(train_x, worst_stress_levels)

param_grid = [
    {'n_estimators': [5, 7, 10, 15, 20, 30, 40, 50]},
    {'max_features' : [3 ,5 ,8]},
    {'criterion' : ["gini", "entropy"]}
 ]

clf = GridSearchCV(RandomForestClassifier(), param_grid, cv=5, scoring="accuracy", n_jobs=-1)
clf.fit(balanced_train_x, worst_stress_levels)
print(clf.best_params_, clf.best_score_, clf.best_estimator_)

# predicting only worst stress levels.
neigh = clf.estimator
neigh.fit(balanced_train_x, worst_stress_levels)
pred_worst_stress_levels = neigh.predict(test_x)

score = accuracy_score(test_y.loc[:,"worst_stress_level"], pred_worst_stress_levels, normalize=True)
f1 = f1_score(test_y.iloc[:,0], pred_worst_stress_levels, average=None)

print("Worst stress levels accuracy is "+ str(score * 100) + " %")
print("Worst stress levels f_1 score ", f1)
print("predicted values", pred_worst_stress_levels)

{'n_estimators': 50} 0.89328358209 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
Worst stress levels accuracy is 43.7751004016 %
Worst stress levels f_1 score  [ 0.45454545  0.2195122   0.60085837  0.16129032  0.        ]
predicted values [2 2 1 0 2 1 2 2 2 1 2 4 2 2 1 2 1 0 2 2 3 2 2 2 2 2 3 2 2 2 2 2 1 1 2 2 3
 0 4 2 0 0 1 3 3 1 2 0 2 1 0 0 0 0 2 2 1 2 3 2 2 2 2 1 1 2 2 0 0 2 2 2 1 0
 2 2 2 1 2 0 2 2 0 3 0 3 0 2 2 2 0 0 0 1 0 1 3 2 0 2 2 2 2 2 2 0 2 0 2 3 0
 2 2 2 3 4 1 0 0 2 0 0 2 2 2 1 2 0 2 2 0 2 2 0 2 1 0 0 2 2 2 0 2 3 2 4 0 2
 2 2 1 2 0 2 2 0 2 0 0 2 0 0 3 2 0 1 0 2 0 2 4 3 0 3 2 1 2 2 1 4 0 0 0 3 0
 2 2 1 2 2 