In [27]:
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV

def change_labels(x):
    label_dict = {
         0.0:2
        ,1.0:1
        ,2.0:0
        ,3.0:3
        ,4.0:4
       }
    
    return label_dict[x]

data = pd.read_csv("data/variable_interval_aggregate_train.csv", skip_blank_lines=False, index_col=0, infer_datetime_format=True)
data.loc[:,"stress_level"] = data.loc[:,"stress_level"].apply(change_labels)

In [28]:
# Stratified and shuffled train split.
from sklearn.model_selection import train_test_split

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

train_X , test_X, train_y, test_y = train_test_split(X, y, shuffle=False, stratify=None, test_size=0.25, random_state=10)

In [29]:
# scaling the values.

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

X = StandardScaler().fit_transform(X)
X = normalize(X)

In [30]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

In [25]:
# Logistic regression on personalized aggregates.

param_grid = [
    {'penalty': [ "l2"],
    'C' : [1e-5, 1e-6, 1e-7],
    'max_iter' : [60 ,70 ,80 ,100, 120]}
 ]

clf = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="accuracy")
clf.fit(train_X, train_y)

print(clf.best_params_, clf.best_score_, clf.best_estimator_)

best_clf = clf.estimator
best_clf.fit(train_X, train_y)
pred_y = best_clf.predict(test_X)
score = accuracy_score(test_y, pred_y, normalize=True)

f1 = f1_score(test_y, pred_y, average=None)
precission = precision_score(test_y, pred_y, average=None)
recall = recall_score(test_y, pred_y, average=None)

print("\n\n")
print("Accuracy is "+ str(score * 100) + " %")
print("\n\n")
print("predicted values ", pred_y)
print("\n\n")
print("f_1 score ", f1)
print("Recall ", recall)
print("precission ", precission)
print("\n\n")
print("confusion_matrics \n", confusion_matrix(test_y, pred_y, labels=[0,1,2,3,4]))

{'C': 1e-06, 'max_iter': 60, 'penalty': 'l2'} 0.383147853736 LogisticRegression(C=1e-06, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=60, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



Accuracy is 43.5714285714 %



predicted values  [2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 4 2 2 2
 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2
 2 3 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 4 2 2 3 2 2 2 2 2 3 2 2 2 3 3 2 3 2 3 3
 2 3 3 2 2 2 2 2 2 3 2 2 2 2 3 2 2 3 2 2 2 2 2 2 2 2 3 2 0 2 3 3 3 3 3 3 3
 3 3 3 3 3 4 2 2 2 2 2 2 3 3 2 3 3 3 2 2 2 3 2 3 3 2 3 3 3 2 3 2 2 3 3 2 2
 3 2 3 1 2 1 3 2 3 1 3 2 2 3 2 3 2 2 3 2 4 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 1 2 2 2 2 

In [26]:
# Logistic regression on generalized aggregates.

param_grid = [
    {'penalty': [ "l1", "l2"],
    'C' : [ 1e-4, 1e-5, 1e-6, 1e-7, 1e-8],
    'max_iter' : [60 ,70 ,80 ,100, 120]}
 ]

clf = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="accuracy")

# SLicing removes student_id as a feature in the table and makes the model generalized.
clf.fit(train_X.iloc[:,1:], train_y)

print(clf.best_params_, clf.best_score_, clf.best_estimator_)

best_clf = clf.estimator
best_clf.fit(train_X.iloc[:,1:], train_y)
pred_y = best_clf.predict(test_X.iloc[:,1:])
score = accuracy_score(test_y, pred_y, normalize=True)

f1 = f1_score(test_y, pred_y, average=None)
precission = precision_score(test_y, pred_y, average=None)
recall = recall_score(test_y, pred_y, average=None)

print("\n\n")
print("Accuracy is "+ str(score * 100) + " %")
print("\n\n")
print("predicted values", pred_y)
print("\n\n")
print("f_1 score ", f1)
print("Recall ", recall)
print("precission ", precission)
print("confusion_matrics \n", confusion_matrix(test_y, pred_y, labels=[0,1,2,3,4]))

{'C': 1e-07, 'max_iter': 60, 'penalty': 'l1'} 0.39586645469 LogisticRegression(C=1e-07, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=60, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



Accuracy is 42.380952381 %



predicted values [2 2 2 2 2 2 2 2 2 1 2 4 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2
 2 3 2 3 3 2 2 2 2 2 3 2 2 2 3 3 2 1 2 2 3 2 2 2 2 2 3 2 2 3 3 3 3 3 1 3 3
 2 3 3 2 2 2 2 2 2 3 2 2 2 2 3 2 2 3 2 2 2 2 2 2 2 2 3 2 0 2 3 3 3 3 3 3 3
 3 3 3 3 3 2 3 2 1 2 3 2 3 3 2 3 3 3 2 2 2 3 2 3 3 2 3 3 3 2 3 2 2 3 3 3 3
 3 3 3 3 3 4 1 1 4 1 3 2 2 3 3 3 2 3 3 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 4 2 2 2 2 2 2

In [18]:
print (best_clf.coef_ )

[[  3.16394538e-04  -1.40447725e-03  -4.31310374e-06   2.49270399e-04
    6.59901824e-05  -3.53121744e-04  -1.05003189e-04   5.72802324e-04
   -2.25614736e-03  -2.97095152e-04   4.31618903e-04  -5.86473556e-06
    3.63736795e-04   7.59400096e-06  -3.86886516e-05  -3.86886516e-05
   -3.86886516e-05  -3.86886516e-05   7.70480859e-03  -1.69615469e-04
    7.70480859e-03   1.33889135e-02  -5.45551795e-04  -1.04444670e-02
   -3.54867249e-03   6.76581487e-04  -2.55691190e-06  -2.61140595e-03
    5.64197602e-04  -4.94050031e-04  -6.70240097e-04   7.42850808e-04
    9.11837836e-05   6.21419152e-06   4.79165411e-04  -7.83650791e-04
   -8.75285610e-04  -3.60220829e-04  -1.71513025e-03  -6.23445905e-05
   -1.73115414e-03   6.68024693e-04  -1.92822245e-03   7.83891971e-03
   -5.14381719e-04  -1.83083836e-06   1.24011501e-03  -7.80940627e-04
    3.71543341e-03  -1.01735813e-03   3.29509126e-03  -1.03899064e-03
    3.13209197e-03  -1.05512059e-03   3.11254745e-03   1.18497658e-02
    7.23629106e-03  

In [33]:
print(train_y.value_counts())

2    508
3    272
1    221
0    210
4     47
Name: stress_level, dtype: int64
