In [1]:
import pandas as pd
import numpy as np
import torch
from torch.autograd import Variable
from IPython.display import display
import math
from sklearn.preprocessing import normalize
import warnings
warnings.filterwarnings("ignore")


def generate_baseline_variables(feature_list=[
    "activity_details",
    # "dinning_details" ,
    "sms_details",
    "audio_details",
    "conversation_details",
    "dark_details",
    "phonecharge_details",
    "phonelock_details",
    "gps_details"],
    val_set_size = .3,
    restrict_seqlen=10,
    is_cuda_available=False):
    
    # Dict Initialization.
    train_feature_dict = {}
    val_feature_dict = {}
    train_x_list = []
    val_x_list = []
    for feature in feature_list:

        # Read CSV and skip the time columns.
        raw_feature_train_x = pd.read_csv("Data/"+feature+"_train_x.csv", skip_blank_lines=False).iloc[:,1:]
        raw_feature_train_y = pd.read_csv("Data/"+feature+"_train_y.csv", skip_blank_lines=False)
        # to bring the values from 0-4.
        raw_feature_train_y["stress_level"] += -1
        raw_feature_train_y_indices = pd.read_csv("Data/"+feature+"_train_y_indices.csv", skip_blank_lines=False)

        # Finding Last Valid index.
        last_valid_index = raw_feature_train_y_indices.iloc[-1,:].values
        last_valid_index = int(last_valid_index)

        #Truncating the raw_x features for which y values do not exist.
        raw_feature_train_x = raw_feature_train_x.iloc[:last_valid_index+1,:]

        # Hardcore indexing, to convert single index to multi so that max, min and avg can be taken easily.
        # Train set.
        list_a = []
        list_b = raw_feature_train_x.index.values    
        feature_indices_list = raw_feature_train_y_indices['indices'].values

        for idx in feature_indices_list:
            if len(list_a) == 0:
                list_a += [idx for k in range(0,idx+1)]
            else:
                list_a += [idx for k in range(len(list_a)-1,idx)]


        index_keys = [
            np.array(list_a),
            np.array(list_b)        
        ]


        raw_feature_train_x.set_index(keys=index_keys, inplace=True)

        # Colapsing Multindex to fin min, max and mean of the seq. 
        raw_feature_train_x_min = raw_feature_train_x.min(level=0)
        raw_feature_train_x_max = raw_feature_train_x.max(level=0)
        raw_feature_train_x_mean = raw_feature_train_x.mean(level=0)

        raw_feature_train_x = pd.concat([raw_feature_train_x_min, 
                                         raw_feature_train_x_max.iloc[:,1:], 
                                         raw_feature_train_x_mean.iloc[:,1:]],
                                         axis=1,
                                         ignore_index=True)


        # splitting data into test and train splits. Keeping 30% of labels for Val.
        total_y_labels = len(raw_feature_train_y_indices)
        val_samples = total_y_labels * val_set_size
        val_samples = math.floor(val_samples)


        # Selecting new subset of data.
        feature_train_x = raw_feature_train_x.iloc[:total_y_labels-val_samples+1]
        feature_train_y = raw_feature_train_y.dropna().iloc[:total_y_labels-val_samples+1]
#         print("train_x_len", len(feature_train_x))
#         print("train_y_len", len(feature_train_y))

        feature_val_x = raw_feature_train_x.iloc[total_y_labels-val_samples+1:]
        feature_val_y = raw_feature_train_y.dropna().iloc[total_y_labels-val_samples+1:]
        
#         print("val_x_len", len(feature_val_x))
#         print("val_y_len", len(feature_val_y))
        
        train_x_list.append(feature_train_x)
        val_x_list.append(feature_val_x)
    
    
    # removing extra student_id columns.
    # train
    first_feature = train_x_list[0]
    student_id_col = first_feature.iloc[:,0]
    final_list = [feature.iloc[:,1:]  for feature in train_x_list]
    final_list.insert(0, student_id_col)
    
    #resetting indices for each feature.
    for i in range(len(final_list)):
        final_list[i].reset_index(drop=True, inplace=True)
        
    train_x = pd.concat(final_list, axis=1, ignore_index=True)
    
    # val
    first_feature = val_x_list[0]
    student_id_col = first_feature.iloc[:,0]
    final_list =  [ feature.iloc[:,1:]  for feature in val_x_list]
    final_list.insert(0, student_id_col)
    
    #resetting indices for each feature.
    for i in range(len(final_list)):
        final_list[i].reset_index(drop=True, inplace=True)
        
    val_x = pd.concat(final_list, axis=1, ignore_index=True)
    
    train_target = feature_train_y
    val_target = feature_val_y
    
    np_train_x = train_x.as_matrix()
    np_val_x = val_x.as_matrix()
    np_train_target = train_target.as_matrix()
    np_val_target = val_target.as_matrix()
    
    return np_train_x,  np_train_target, np_val_x, np_val_target
    
#     tensor_train_x = torch.from_numpy(np_train_x)
#     tensor_val_x = torch.from_numpy(np_val_x) 
#     tensor_train_target = torch.from_numpy(np_train_target)
#     tensor_val_target = torch.from_numpy(np_val_target)
    
#     if is_cuda_available:
#         tensor_train_x = tensor_train_x.cuda()
#         tensor_val_x = tensor_val_x.cuda()
#         tensor_train_target = tensor_train_target.cuda()
#         tensor_val_target = tensor_val_target.cuda()

#     train_input_seq = Variable(tensor_train_x ,requires_grad=False).float()
#     train_target = Variable(tensor_train_target ,requires_grad=False).long()
#     val_input_seq = Variable(tensor_val_x ,requires_grad=False).float()
#     val_target = Variable(tensor_val_target ,requires_grad=False).long()
    
#     return train_input_seq, train_target, val_input_seq, val_target
    

In [14]:
# feature_list=["activity_details",
#              "sms_details",
#     "conversation_details",
#              ]
train_input_seq, train_target, val_input_seq, val_target = generate_baseline_variables()


In [15]:
# Normalizing 
import pandas as pd


display(pd.concat([pd.DataFrame(train_input_seq), pd.DataFrame(train_target)], axis=1))


# print("training set", train_input_seq)
# print("val set", val_input_seq)

train_input_seq = normalize(train_input_seq, axis=1)
val_input_seq = normalize(val_input_seq, axis=1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,0.1
0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,...,465.0,465.0,465.0,43.706667,-72.289097,43.706667,-72.289097,43.706667,-72.289097,0.0
1,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,...,465.0,465.0,465.0,43.706667,-72.289097,43.706667,-72.289097,43.706667,-72.289097,2.0
2,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,...,465.0,465.0,465.0,43.706667,-72.289097,43.706667,-72.289097,43.706667,-72.289097,3.0
3,1.0,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0,2.0,...,465.0,465.0,465.0,43.706667,-72.289097,43.706667,-72.289097,43.706667,-72.289097,2.0
4,1.0,0.0,3.0,0.011952,1.0,1.0,1.0,0.0,2.0,0.861498,...,465.0,465.0,465.0,43.706637,-72.289097,43.706678,-72.289018,43.706665,-72.28905,3.0
5,1.0,0.0,3.0,0.128988,1.0,1.0,1.0,0.0,2.0,0.785464,...,68.0,132.0,99.285714,43.70161,-72.294811,43.708786,-72.283628,43.706175,-72.288902,3.0
6,1.0,0.0,3.0,0.217645,1.0,1.0,1.0,0.0,2.0,0.889954,...,72.0,485.0,209.4,43.701603,-72.294845,43.708742,-72.283706,43.705945,-72.288889,0.0
7,1.0,0.0,3.0,0.240741,1.0,1.0,1.0,0.0,2.0,1.078723,...,116.0,148.0,126.666667,43.701764,-72.289242,43.706736,-72.288215,43.706335,-72.289048,4.0
8,1.0,0.0,3.0,0.111966,1.0,1.0,1.0,0.0,2.0,0.711462,...,85.0,581.0,236.0,43.704066,-72.300492,43.706944,-72.288925,43.706451,-72.290155,0.0
9,1.0,0.0,3.0,0.030612,1.0,1.0,1.0,0.0,2.0,0.547547,...,71.0,594.0,353.2,43.70659,-72.289481,43.706902,-72.288963,43.70669,-72.289105,0.0


In [16]:
# Implementing Logistic Regression.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

param_grid = [
    {'C': [0.1, 0.5, 1, 2, 5],
    'penalty' : ["l2", "l1"]}
 ]

clf = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring="accuracy", n_jobs=-1)


clf.fit(train_input_seq, train_target)
best_estimator = clf.estimator
best_estimator.fit(train_input_seq, train_target)
y_pred = best_estimator.predict(val_input_seq)

score = accuracy_score(y_pred, val_target, normalize=True)
f1 = f1_score(y_pred, val_target, average=None)

print("Worst stress levels accuracy is "+ str(score * 100) + " %")
print("Worst stress levels f_1 score ", f1)
print("predicted values", y_pred)

Worst stress levels accuracy is 22.22222222222222 %
Worst stress levels f_1 score  [0.36363636 0.         0.         0.        ]
predicted values [0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [30]:
train_input_seq, train_target, val_input_seq, val_target = generate_baseline_variables()

# display(train_input_seq)
# train_input_seq = normalize(train_input_seq, axis=1)
# val_input_seq = normalize(val_input_seq, axis=1)

In [32]:
# Implementing Linear Regression. 

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler


scalar = StandardScaler()
train_input_seq = scalar.fit_transform(train_input_seq)
val_input_seq = scalar.fit_transform(val_input_seq)

clf = LinearRegression(fit_intercept=False)
clf.fit(X=train_input_seq, y=train_target)
y_pred = clf.predict(val_input_seq)

display(pd.concat([pd.DataFrame(y_pred), pd.DataFrame(val_target)], axis=1, ignore_index=True))
l1_error = mean_absolute_error(y_pred, val_target)
l2_error = mean_squared_error(y_pred, val_target)

print("L1 error is {} and L2 error is {}".format(l1_error, l2_error))

Unnamed: 0,0,1
0,-6.886412,3.0
1,-10.717297,3.0
2,-0.955997,0.0
3,6.578642,1.0
4,-11.091046,1.0
5,0.564446,1.0
6,-4.214649,2.0
7,11.300965,1.0
8,15.421348,0.0


L1 error is 8.28910107918298 and L2 error is 94.09704005672371
