In [12]:
import numpy
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [13]:
df = pd.read_csv("out_without_nan_mood_normalised.csv")
df['mood_mean_TARGET'] = df['mood_mean_TARGET'].astype(float)
df = pd.concat([df,pd.get_dummies(df['id'])],axis=1)
df = pd.concat([df,pd.get_dummies(df['weekDay'],prefix="day_")],axis=1)
df = df.drop(['weekDay_time_5','day_time_5','weekDay_time_4','day_time_4','weekDay_time_3','day_time_3','weekDay_time_2','day_time_2','weekDay_time_1','day_time_1','weekDay_time_5','circumplex.valence_mean_time_1'],axis=1)

Lets remove highly correlated features

In [3]:
def correlation(dataset, threshold):
    col_corr = set() # Set of all the names of deleted columns
    corr_matrix = dataset.corr()
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (abs(corr_matrix.iloc[i, j]) >= threshold) and (corr_matrix.columns[j] not in col_corr) and corr_matrix.columns[i] != "mood_mean_TARGET" and corr_matrix.columns[j] != "mood_mean_TARGET":
                
                colnameI = corr_matrix.columns[i]
                colnameJ = corr_matrix.columns[j]
                if abs(corr_matrix.mood_mean_TARGET[colnameI]) > abs(corr_matrix.mood_mean_TARGET[colnameJ]):
                    colname = colnameJ
                else:
                    colname = colnameI
                col_corr.add(colname)
                if colname in dataset.columns:
                    del dataset[colname] # deleting the column from the dataset
    
    return(dataset)
newDF =df
correlation(newDF,0.7)
print(newDF.columns.values)

['id' 'date' 'weekDay' 'circumplex.arousal_mean_time_1'
 'appCat.builtin_sum_time_1' 'activity_mean_time_2'
 'circumplex.valence_mean_time_2' 'appCat.travel_sum_time_2'
 'appCat.weather_sum_time_2' 'appCat.social_sum_time_3'
 'appCat.utilities_sum_time_3' 'appCat.builtin_sum_time_4'
 'circumplex.arousal_mean_time_5' 'mood_mean_time_5'
 'appCat.communication_sum_time_5' 'appCat.entertainment_sum_time_5'
 'appCat.finance_sum_time_5' 'appCat.game_sum_time_5'
 'appCat.office_sum_time_5' 'appCat.other_sum_time_5'
 'appCat.unknown_sum_time_5' 'call_sum_time_5' 'sms_sum_time_5'
 'lastTimeSeenMood_1' 'lastTimeSeenMood_2' 'lastTimeSeenMood_3'
 'lastTimeSeenMood_4' 'lastTimeSeenMood_6' 'lastTimeSeenMood_7'
 'lastTimeSeenMood_8' 'lastTimeSeenMood_9' 'mood_mean_TARGET'
 'numberOfTimesSeenMood5Days_7' 'numberOfTimesSeenMood5Days_8' 'AS14.01'
 'AS14.02' 'AS14.03' 'AS14.05' 'AS14.06' 'AS14.07' 'AS14.08' 'AS14.09'
 'AS14.12' 'AS14.13' 'AS14.14' 'AS14.15' 'AS14.16' 'AS14.17' 'AS14.19'
 'AS14.20' 'AS14.

In [14]:
corr=df.corr()
goodColumns= corr[abs(corr['mood_mean_TARGET'])>0.25]['mood_mean_TARGET'].sort_values()
goodColumns = goodColumns.drop(['mood_mean_TARGET'])

usedColumns = goodColumns.keys()



In [15]:
usedColumns


Index(['AS14.07', 'lastTimeSeenMood_7', 'circumplex.valence_mean',
       'circumplex.valence_mean_time_2', 'mood_mean', 'mood_mean_time_1',
       'mood_mean_time_3', 'mood_mean_time_4', 'mood_mean_time_2',
       'mood_mean_time_5'],
      dtype='object')

In [16]:
notRelevantAll=df.drop(['id','date'],axis=1).dropna()
goldY= notRelevantAll['mood_mean_TARGET']
relevant = notRelevantAll.drop(['mood_mean_TARGET'],axis=1)


In [17]:
# define base model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(len(usedColumns), input_dim=len(usedColumns), kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

In [18]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)
# evaluate model with standardized dataset
estimator = KerasRegressor(build_fn=baseline_model, epochs=100, batch_size=5, verbose=0)

In [19]:
scoring = {'R2': 'r2',
           'Mean squared Error': 'neg_mean_squared_error',
           'Explained variance': 'explained_variance'}

scores = cross_validate(estimator, relevant[usedColumns], goldY, cv=7,scoring=scoring)
print("R2: %0.2f (+/- %0.2f)" % (scores['test_R2'].mean(), scores['test_R2'].std() * 2))
print("MSE: %0.2f (+/- %0.2f)" % (scores['test_Mean squared Error'].mean(), scores['test_Mean squared Error'].std() * 2))
print("EXPLAINED VARIANCE: %0.2f (+/- %0.2f)" % (scores['test_Explained variance'].mean(), scores['test_Explained variance'].std() * 2))


R2: 0.16 (+/- 0.26)
MSE: -0.41 (+/- 0.26)
EXPLAINED VARIANCE: 0.18 (+/- 0.29)
