In [1]:
%run data_getter_and_processor.ipynb

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from IPython.display import display
from sklearn.model_selection import GridSearchCV

# Obtain Training and testing data.
train_x, test_x, train_y, test_y = get_split_train_data(random_state=10)

In [2]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import scale

# Apply gridSearch to search best Random Forest model for imbalanced data for worst
worst_stress_levels = train_y.loc[:,"worst_stress_level"]
ballanced_train_x, worst_stress_levels = balance_data(train_x, worst_stress_levels) 
# ballanced_train_x = scale(ballanced_train_x)

param_grid = [
    {'alpha': [10e-3, 10e-4, 10e-5, 10e-2, 10e-1, 1, 10],
    'normalize': [False]}
 ]

# display(worst_stress_levels.head())
# print(worst_stress_levels.columns.values)

clf = GridSearchCV(Ridge(), param_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
clf.fit(ballanced_train_x, worst_stress_levels)
print(clf.best_params_, clf.best_score_, clf.best_estimator_)

# predicting only worst stress levels.
estimator = clf.estimator
estimator.fit(ballanced_train_x, worst_stress_levels)
pred_worst_stress_levels = estimator.predict(test_x)

print(pred_worst_stress_levels)

0    2
1    2
2    2
3    2
4    2
Name: worst_stress_level, dtype: int64

{'alpha': 0.1, 'normalize': False} -3.13275942904 Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)
[ 2.9679184   0.93437035  2.09344308  2.0963043   1.6120607   3.11110459
  1.66927616  1.42934325  2.54774115  1.91250764  1.51095754  2.2074159
  1.30299789  1.48190942  0.8842465   1.35921274  3.28809704  2.06026197
  1.216133    1.30367661  1.34861986  0.95247456  1.42956462  1.19321114
  1.21022021  2.0144164   1.26137497  1.6391965   2.14453322  1.89720382
  2.84229329  1.94615537  1.98715497  2.21215096  2.11746417  1.36722261
  1.50228618  1.89629554  2.49372132  1.80713176  1.44196595  2.19456427
  1.60086652  1.19914868  1.57965597  1.40333224  1.02878783  1.280647
  1.91947418  1.60317008  2.1254349   1.29113542  2.2245023   1.68200515
  1.97724616  1.90246788  1.41601123  1.5939767   2.29176614  2.50060329
  2.6543492   2.58801066  2.27777282  1.9863463   1.74426227  1.65526794
  2.31186502  3.4418

In [3]:
# Scoring the metrics.

score = accuracy_score(test_y.loc[:,"worst_stress_level"], np.round(pred_worst_stress_levels), normalize=True)
print("Worst stress levels accuracy is "+ str(score * 100) + " %")

Worst stress levels accuracy is 35.3092783505 %


In [5]:
# Finding L2 error with the predicted data.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

l2_error = mean_squared_error(y_pred=pred_worst_stress_levels, y_true=test_y.loc[:,"worst_stress_level"])
l1_error = mean_absolute_error(y_pred=pred_worst_stress_levels, y_true=test_y.loc[:,"worst_stress_level"])

print("L2_error is : ", l2_error)
print("l1_error is : ", l1_error)

L2_error is :  1.26038273656
l1_error is :  0.906369423527
