In [1]:
%run data_getter_and_processor.ipynb

import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from IPython.display import display
from sklearn.model_selection import GridSearchCV

# Obtain Training and testing data.
train_x, test_x, train_y, test_y = get_split_train_data(random_state=10)

In [2]:
from sklearn.linear_model import Lasso
from sklearn.preprocessing import scale

# Apply gridSearch to search best Random Forest model for imbalanced data for worst
worst_stress_levels = train_y.loc[:,"worst_stress_level"]
ballanced_train_x, worst_stress_levels = balance_data(train_x, worst_stress_levels) 
# ballanced_train_x = scale(ballanced_train_x)

param_grid = [
    {'alpha': [10e-3, 10e-4, 10e-5, 10e-2, 10e-1, 1, 10],
    'normalize': [False]}
 ]

display(worst_stress_levels.head())
# print(worst_stress_levels.columns.values)

clf = GridSearchCV(Lasso(), param_grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)
clf.fit(ballanced_train_x, worst_stress_levels)
print(clf.best_params_, clf.best_score_, clf.best_estimator_)

# predicting only worst stress levels.
estimator = clf.estimator
estimator.fit(ballanced_train_x, worst_stress_levels)
pred_worst_stress_levels = estimator.predict(test_x)

print(pred_worst_stress_levels)

0    2
1    2
2    2
3    2
4    2
Name: worst_stress_level, dtype: int64

{'alpha': 0.01, 'normalize': False} -3.13613017673 Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)
[ 2.19429786  2.07773406  1.75465216  2.41284397  1.70450434  2.10148546
  1.58684509  1.7871483   2.03336594  1.78096041  1.93480843  2.43098512
  1.88909763  2.3922531   2.20561301  2.20956253  2.14685623  2.08452172
  2.87121352  1.87759919  1.47849376  2.26200543  1.72185561  1.82712034
  2.05594349  1.73351652  1.77191764  1.72006878  1.42900326  2.2943976
  1.76489825  2.11416846  2.0218629   2.10699904  2.09765375  2.15967683
  1.57559576  2.24572598  2.08079097  1.90963254  2.39675141  1.98585806
  2.55587704  1.93306581  1.83344029  1.8850932   1.39369498  2.39675141
  1.94033721  1.7664617   2.06202742  1.90459255  1.56644844  1.64604263
  1.92122103  2.44679197  2.64274465  1.58084238  1.72094912  1.49748394
  2.22167151  2.12002896  1.

In [11]:
# Scoring the metrics.
from sklearn.metrics import f1_score

score = accuracy_score(test_y.loc[:,"worst_stress_level"], np.round(pred_worst_stress_levels), normalize=True)
f1 = f1_score(test_y.loc[:,"worst_stress_level"], np.round(pred_worst_stress_levels), average=None)
print("Worst stress levels accuracy is "+ str(score * 100) + " %")
print("Worst stress levels f1 SCore is "+ str(f1))

Worst stress levels accuracy is 42.0103092784 %
Worst stress levels f1 SCore is [ 0.          0.10752688  0.5917603   0.          0.        ]


  'precision', 'predicted', average, warn_for)


In [4]:
# Finding L2 error with the predicted data.
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

l2_error = mean_squared_error(y_pred=pred_worst_stress_levels, y_true=test_y.loc[:,"worst_stress_level"])
l1_error = mean_absolute_error(y_pred=pred_worst_stress_levels, y_true=test_y.loc[:,"worst_stress_level"])

print("L2_error is : ", l2_error)
print("L1_error is : ", l1_error)

L2_error is :  1.19457122553
L1_error is :  0.846930967099


In [6]:
print(pred_worst_stress_levels[:10])
print(test_y.loc[:,"worst_stress_level"].iloc[:10])

[ 2.  2.  2.  2.  2.  2.  2.  2.  2.  2.]
DATE
2013-05-07    2
2013-04-10    4
2013-04-15    3
2013-05-09    2
2013-03-30    2
2013-05-21    2
2013-04-08    2
2013-03-28    2
2013-05-01    3
2013-04-30    0
Name: worst_stress_level, dtype: int64
