Importing the libraries

In [11]:
import sklearn
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import RFECV
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn import preprocessing
import numpy as np

Read data file

In [12]:
df = pd.read_csv("out_without_nan_mood_normalised_extra_temp_train.csv")
clf = LinearRegression()

Add dummies, remove double features

In [13]:
df['mood_mean_TARGET'] = df['mood_mean_TARGET'].astype(float)
df = pd.concat([df,pd.get_dummies(df['id'])],axis=1)
df = pd.concat([df,pd.get_dummies(df['weekDay'],prefix="day_")],axis=1)
df = df.drop(['weekDay_time_5','day_time_5','weekDay_time_4','day_time_4','weekDay_time_3','day_time_3','weekDay_time_2','day_time_2','weekDay_time_1','day_time_1','weekDay_time_5','mood_mean_time_1','appCat.office_sum_time_1'],axis=1)
df = df.fillna(0)
df = df.loc[:, (df != 0).any(axis=0)]



In [14]:
notRelevantAll=df.drop(['id','date'],axis=1).dropna()
goldY= notRelevantAll['mood_mean_TARGET']
relevant = notRelevantAll.drop(['mood_mean_TARGET'],axis=1)

In [15]:
selector = RFECV(clf, step=1,cv=10,scoring='r2')
selector.fit(relevant,goldY)


RFECV(cv=10,
   estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
   n_jobs=1, scoring='r2', step=1, verbose=0)

In [16]:
new_features = []
for bool, feature in zip(selector.get_support(), relevant.columns.values):
    if bool:
        new_features.append(feature)
print(new_features)

['appCat.entertainment_sum', 'screen_sum', 'appCat.entertainment_sum_time_1', 'screen_sum_time_1', 'mood_mean_time_2', 'appCat.travel_sum_time_2', 'circumplex.arousal_mean_time_3', 'mood_mean_time_3', 'appCat.travel_sum_time_3', 'appCat.finance_sum_time_4', 'mood_mean_time_5', 'appCat.finance_sum_time_5', 'lastTimeSeenMood_5']


In [17]:
scoring = {'R2': 'r2',
           'Mean squared Error': 'neg_mean_squared_error',
           'Explained variance': 'explained_variance'}

scores = cross_validate(clf, relevant[new_features], goldY, cv=7,scoring=scoring)
print("R2: %0.2f (+/- %0.2f)" % (scores['test_R2'].mean(), scores['test_R2'].std() * 2))
print("MSE: %0.2f (+/- %0.2f)" % (scores['test_Mean squared Error'].mean(), scores['test_Mean squared Error'].std() * 2))
print("EXPLAINED VARIANCE: %0.2f (+/- %0.2f)" % (scores['test_Explained variance'].mean(), scores['test_Explained variance'].std() * 2))


R2: -0.14 (+/- 1.34)
MSE: -0.48 (+/- 0.26)
EXPLAINED VARIANCE: -0.12 (+/- 1.35)


Evaluate on real test set

In [18]:
clf.fit(relevant[new_features], goldY)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
dfTest = pd.read_csv("out_without_nan_mood_normalised_extra_temp_test.csv")
dfTest['mood_mean_TARGET'] = dfTest['mood_mean_TARGET'].astype(float)
dfTest = pd.concat([dfTest,pd.get_dummies(dfTest['id'])],axis=1)
dfTest = pd.concat([dfTest,pd.get_dummies(dfTest['weekDay'],prefix="day_")],axis=1)
dfTest = dfTest.fillna(0)


In [10]:
y_pred =clf.predict(dfTest[new_features])
print("MAE")
print(metrics.mean_absolute_error(dfTest['mood_mean_TARGET'],y_pred))
print("MSE")
print(metrics.mean_squared_error(dfTest['mood_mean_TARGET'],y_pred))
print("RMSE")
print(np.sqrt(metrics.mean_squared_error(dfTest['mood_mean_TARGET'],y_pred)))
print("R2")
print(metrics.r2_score(dfTest['mood_mean_TARGET'],y_pred))

import matplotlib.pyplot as plt
plt.plot(y_pred-dfTest['mood_mean_TARGET'])
plt.show()

print(list(zip(new_features,clf.coef_)))
allTogether = pd.concat([dfTest['mood_mean_TARGET'],dfTest[new_features]],axis=1)
allTogether['y_pred'] = y_pred
allTogether.to_csv("pred.csv")

MAE
0.4467369710569621
MSE
0.36312386494165394
RMSE
0.602597597855861
R2
0.34693420448140067


<Figure size 640x480 with 1 Axes>

[('appCat.entertainment_sum', -243087215321.06747), ('screen_sum', 4719834453851.17), ('appCat.entertainment_sum_time_1', 243087215321.84702), ('screen_sum_time_1', -4719834453851.267), ('mood_mean_time_2', 2.318983614161153), ('appCat.travel_sum_time_2', 2.0546544445007515), ('circumplex.arousal_mean_time_3', 0.14068864763785371), ('mood_mean_time_3', -1.8520765936652714), ('appCat.travel_sum_time_3', -1.518536831056085), ('appCat.finance_sum_time_4', -2.760916583066527), ('mood_mean_time_5', 2.477836484436355), ('appCat.finance_sum_time_5', 2.808706758778546), ('lastTimeSeenMood_5', 1.9080305667147457)]
