Importing the libraries

In [1]:
import sklearn
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_selection import RFECV
from sklearn import tree
from sklearn import metrics
from sklearn import preprocessing
import numpy as np

Read data file

In [2]:
df = pd.read_csv("out_without_nan_mood_normalised_extra_temp_train.csv")

Add dummies, remove weekDay mean, remove day mean

In [3]:
df['mood_mean_TARGET'] = df['mood_mean_TARGET'].astype(float)
df = pd.concat([df,pd.get_dummies(df['id'])],axis=1)
df = pd.concat([df,pd.get_dummies(df['weekDay'],prefix="day_")],axis=1)
df = df.drop(['weekDay_time_5','day_time_5','weekDay_time_4','day_time_4','weekDay_time_3','day_time_3','weekDay_time_2','day_time_2','weekDay_time_1','day_time_1','weekDay_time_5'],axis=1)
df = df.fillna(0)
df = df.loc[:, (df != 0).any(axis=0)]

Split in train and test set

In [4]:
corr=df.corr()
goodColumns= corr[abs(corr['mood_mean_TARGET'])>0.01]['mood_mean_TARGET'].sort_values()
goodColumns = goodColumns.drop(['mood_mean_TARGET','mood_mean_time_1'])
print(goodColumns)


numberOfTimesSeenMood5Days_5      -0.268224
numberOfTimesSeenMood5Days_6      -0.266362
numberOfTimesSeenMood5Days_4      -0.264096
AS14.07                           -0.238132
lastTimeSeenMood_7                -0.217298
AS14.12                           -0.164646
numberOfTimesSeenMood5Days_3      -0.154276
lastTimeSeenMood_8                -0.143156
lastTimeSeenMood_4                -0.126671
AS14.16                           -0.116964
AS14.05                           -0.113905
sms_sum_time_5                    -0.105072
sms_sum_time_4                    -0.097740
lastTimeSeenMood_6                -0.092742
AS14.33                           -0.090548
sms_sum_time_3                    -0.088051
sms_sum_time_2                    -0.069055
AS14.08                           -0.056615
day__1.0                          -0.052260
lastTimeSeenMood_5                -0.051633
appCat.social_sum_time_2          -0.048294
appCat.social_sum_time_3          -0.046565
AS14.14                         

In [5]:
notRelevantAll=df.drop(['id','date'],axis=1).dropna()
goldY= notRelevantAll['mood_mean_TARGET']
relevant = notRelevantAll.drop(['mood_mean_TARGET'],axis=1)
X_train, X_test, y_train, y_test = train_test_split(relevant[goodColumns.keys()], goldY, test_size=0.33, random_state=42)

In [13]:
clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=25)
clf = clf.fit(X_train, y_train)


In [14]:
selector = RFECV(clf, step=1, cv=10)
selector = selector.fit(relevant, goldY)

In [15]:
new_features = []
for bool, feature in zip(selector.get_support(), relevant.columns.values):
    if bool:
        new_features.append(feature)
print(new_features)

['circumplex.arousal_mean_time_1', 'mood_mean_time_1', 'mood_mean_time_2', 'circumplex.arousal_mean_time_3', 'circumplex.valence_mean_time_3', 'mood_mean_time_3', 'mood_mean_time_4', 'appCat.other_sum_time_4', 'mood_mean_time_5', 'appCat.travel_sum_time_5', 'numberOfTimesSeenMood5Days_6', 'lastTimeSeenMood_9', 'day__0.6666666666666666']


In [16]:
clf = sklearn.ensemble.GradientBoostingRegressor(n_estimators=100)
clf.fit(relevant[new_features],goldY)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

Define scoring

In [17]:
scoring = {'R2': 'r2',
           'Mean squared Error': 'neg_mean_squared_error',
           'Explained variance': 'explained_variance'}

In [18]:

scores = cross_validate(clf, relevant[new_features], goldY, cv=7,scoring=scoring)

print("R2: %0.2f (+/- %0.2f)" % (scores['test_R2'].mean(), scores['test_R2'].std() * 2))
print("MSE: %0.2f (+/- %0.2f)" % (scores['test_Mean squared Error'].mean(), scores['test_Mean squared Error'].std() * 2))
print("EXPLAINED VARIANCE: %0.2f (+/- %0.2f)" % (scores['test_Explained variance'].mean(), scores['test_Explained variance'].std() * 2))


R2: 0.10 (+/- 0.22)
MSE: -0.43 (+/- 0.30)
EXPLAINED VARIANCE: 0.12 (+/- 0.23)


In [19]:
dfTest = pd.read_csv("out_without_nan_mood_normalised_extra_temp_test.csv")
dfTest['mood_mean_TARGET'] = dfTest['mood_mean_TARGET'].astype(float)
dfTest = pd.concat([dfTest,pd.get_dummies(dfTest['id'])],axis=1)
dfTest = pd.concat([dfTest,pd.get_dummies(dfTest['weekDay'],prefix="day_")],axis=1)
dfTest = dfTest.fillna(0)

y_pred =clf.predict(dfTest[new_features])
print("MAE")
print(metrics.mean_absolute_error(dfTest['mood_mean_TARGET'],y_pred))
print("MSE")
print(metrics.mean_squared_error(dfTest['mood_mean_TARGET'],y_pred))
print("RMSE")
print(np.sqrt(metrics.mean_squared_error(dfTest['mood_mean_TARGET'],y_pred)))
print("R2")
print(metrics.r2_score(dfTest['mood_mean_TARGET'],y_pred))



MAE
0.4598149329800977
MSE
0.3932117214075758
RMSE
0.6270659625650047
R2
0.2928222283337467
