# Data cleaning

In [1]:
import pandas as pd
import numpy as np
import random
import json
from tensorflow.keras.models import Model, load_model
# import pycaret

# Replace double quotes with single quotes for moods JSON arrays
f = open('song_data.csv','r+')
text = f.read()
text = text.replace('"["', '"[?').replace('", "', '?,?').replace('"]"', '?]"') # Changed to "[$Happy$,$Sad$]" for easy replacing later
f.seek(0)
f.write(text)
f.close()

# Importing data
df = pd.read_csv('song_data.csv', index_col=0)
df.drop('uuid', axis=1, inplace=True)
df.dropna(inplace=True) # drop rows with nan values
for col in df.columns:
    if col not in ['id','isSkipped']:
        if col == 'moods':
            df[col] = df[col].apply(lambda x:x.replace('?','"')) # Replaces mood values back to the form ["Happy","Sad"] so it can be loaded by json
        df[col] = df[col].apply(json.loads)
df['activity'] = df.apply(lambda _: '', axis=1) # empty activity column
print('Number of samples: ', df.shape[0])
df.head()

Number of samples:  525


Unnamed: 0_level_0,gyroX,gyroY,gyroZ,accelX,accelY,accelZ,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,"[499.30572509765625, 499.53460693359375, 0.198...","[1.861572265625, 2.49481201171875, 1.022338867...","[1.24359130859375, 1.2359619140625, 1.06048583...","[1.1943359375, 1.201171875, 1.1845703125, 1.18...","[1.1455078125, 1.1591796875, 1.1630859375, 1.1...","[3.68359375, 3.654296875, 3.6748046875, 3.6650...","[139.64, 138.36, 139.64, 140.28]","[30.50567626953125, 30.50567626953125, 30.5056...","[71.3134765625, 71.3134765625, 71.3134765625]","[Depressive, Atmospheric]",0,
2,"[498.1765747070313, 0.98419189453125, 1.579284...","[32.27996826171875, 14.7247314453125, 9.864807...","[497.9248046875, 496.368408203125, 494.9645996...","[2.9326171875, 2.9345703125, 2.728515625, 2.64...","[0.8466796875, 0.74609375, 0.865234375, 15.524...","[2.7548828125, 2.8037109375, 2.806640625, 3.34...","[123.24, 123.24, 139.32, 228.64]","[31.69403076171875, 31.69403076171875, 31.6940...","[67.05322265625, 67.05322265625, 67.0532226562...",[Depressive],0,
3,"[15.76995849609375, 10.65826416015625, 6.87408...","[488.36517333984375, 486.5798950195313, 496.92...","[3.86810302734375, 5.0811767578125, 498.947143...","[14.0107421875, 14.3212890625, 14.232421875, 1...","[14.96484375, 15.212890625, 15.275390625, 15.1...","[3.2021484375, 3.3291015625, 3.375, 3.35644531...","[256.08, 307.84000000000003, 315.2, 301.36, 30...","[32.21771240234375, 32.21771240234375, 32.2177...","[65.850830078125, 65.850830078125, 65.85083007...","[Passionate, Depressive]",0,
4,"[499.93896484375, 499.45068359375, 499.7482299...","[1.82342529296875, 2.74658203125, 1.8844604492...","[1.57928466796875, 1.434326171875, 1.365661621...","[1.7353515625, 1.708984375, 1.7333984375, 1.71...","[13.7841796875, 13.80078125, 13.7744140625, 13...","[2.8232421875, 2.8369140625, 2.8154296875, 2.8...","[127.08, 126.76, 125.48, 124.52]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...",[Elegant],1,
5,"[499.9465942382813, 0.03814697265625, 499.7482...","[2.01416015625, 1.77001953125, 1.7852783203125...","[1.1444091796875, 1.2359619140625, 1.129150390...","[0.4580078125, 0.4609375, 0.453125, 0.43847656...","[13.181640625, 13.1689453125, 13.1787109375, 1...","[2.6806640625, 2.693359375, 2.6875, 2.68652343...","[145.76, 144.48, 146.4, 144.8]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...","[Passionate, Depressive]",1,


In [2]:
data = df.iloc[0,:]
display(data)
# print(data['temp'])

gyroX        [499.30572509765625, 499.53460693359375, 0.198...
gyroY        [1.861572265625, 2.49481201171875, 1.022338867...
gyroZ        [1.24359130859375, 1.2359619140625, 1.06048583...
accelX       [1.1943359375, 1.201171875, 1.1845703125, 1.18...
accelY       [1.1455078125, 1.1591796875, 1.1630859375, 1.1...
accelZ       [3.68359375, 3.654296875, 3.6748046875, 3.6650...
optical                       [139.64, 138.36, 139.64, 140.28]
temp         [30.50567626953125, 30.50567626953125, 30.5056...
humidity         [71.3134765625, 71.3134765625, 71.3134765625]
moods                                [Depressive, Atmospheric]
isSkipped                                                    0
activity                                                      
Name: 1, dtype: object

In [3]:
# Filtering defective data

defective_ids = []
for idx,row in df.iterrows():
    # defective if temp array only has -40 values
    # defective if any humidity values are above 99.99
    if len([k for k in row['temp'] if k==-40]) == len(row['temp']) or \
    len([k for k in row['humidity'] if k>99]) == len(row['humidity']):# or int(row['isSkipped']) == 1:
        defective_ids.append(idx)
    
    # if only some values are defective, keep the row, but remove defective values
    # remove -40 temp values and >99.99 humidity values
    elif (-40 in row['temp']) or len([k for k in row['humidity'] if k>99]) or (0 in row['optical']):
        df.at[idx,'temp'] = [k for k in row['temp'] if k!=-40]
        df.at[idx,'humidity'] = [k for k in row['humidity'] if k<=99]
        
    # some gyro/accel data have 40 samples
    # take the last 30 samples for these rows
    for col in df.columns[:6]:
        if len(row[col]) > 30:
            df.at[idx,col] = row[col][-30:]

filtered_df = df[~df.index.isin(defective_ids)].copy() # .copy() to avoid warning
print('%d defective rows: ' % len(defective_ids),defective_ids)

23 defective rows:  [21, 22, 33, 214, 236, 238, 245, 246, 247, 248, 249, 250, 251, 252, 386, 387, 388, 389, 390, 391, 392, 393, 394]


In [4]:
motion_model_path = 'firstModel_stackedLSTM.hd5'
model = load_model(motion_model_path)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 30, 128)           69120     
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 64)            49408     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense (Dense)                (None, 100)               3300      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 134,547
Trainable params: 134,547
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Add activity from motion recognition model
x = [list(k) for k in filtered_df.iloc[:,:6].values]
x = np.array(x) # (num_samples, 6, 30)
x = np.array([k.T for k in x]) # reshape as (num_samples, 30, 6)

pred = model.predict(x)

activity_cats = np.array(['Running', 'Walking', 'Working']) # hardcoded categories from 'Physical Activity Classification.ipynb'
filtered_df['activity'] = activity_cats[np.argmax(pred, axis=1)]
#filtered_df['activity'] = np.argmax(pred, axis=1)
filtered_df = filtered_df.iloc[:,6:].copy() # drop gyro and accel columns

filtered_df.sample(5)

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
522,"[312.48, 311.84000000000003, 304.48, 309.6]","[31.13006591796875, 31.1199951171875, 31.11999...","[79.33349609375, 79.33349609375, 79.33349609375]",[Elegant],1,Working
208,"[4858.88, 4828.16, 4599.04, 5698.56]","[34.67498779296875, 34.67498779296875, 34.6749...","[70.166015625, 70.166015625, 70.166015625, 70....",[Warm],1,Walking
120,"[201.12, 202.4, 201.76, 201.44]","[33.758544921875, 33.758544921875, 33.75854492...","[58.19091796875, 58.19091796875, 58.19091796875]","[Depressive, Passionate]",0,Working
90,"[95.12, 94.8, 95.64, 96.28, 95.48]","[34.01031494140625, 34.01031494140625, 34.0103...","[76.28173828125, 76.28173828125, 76.28173828125]",[Celebratory],0,Working
42,"[57.36, 57.38, 55.78, 57.52]","[31.3616943359375, 31.3616943359375, 31.361694...","[79.62646484375, 79.62646484375, 79.62646484375]","[Warm, Passionate, Elegant]",1,Working


In [6]:
# Obtain mean optical, temp and humidity values

for col in filtered_df.columns:
    if col not in ['moods','isSkipped','activity']:
        filtered_df[col] = filtered_df[col].apply(np.mean)
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,139.48,30.505676,71.313477,"[Depressive, Atmospheric]",0,Working
2,153.61,31.694031,67.053223,[Depressive],0,Working
3,297.792,32.217712,65.85083,"[Passionate, Depressive]",0,Working
4,125.96,32.429199,64.672852,[Elegant],1,Working
5,145.36,32.429199,64.672852,"[Passionate, Depressive]",1,Working


In [7]:
# One-hot encoding for moods

moods = []
for k in filtered_df['moods'].values:
    moods += list(k)
moods = np.unique(np.array(moods))
for mood in moods:
    mood_values = filtered_df['moods'].astype(str).str.contains(mood)
    filtered_df[mood] = mood_values
filtered_df.drop('moods', axis=1, inplace=True)
print('Added one-hot encoded columns for moods:')
filtered_df.head()

Added one-hot encoded columns for moods:


Unnamed: 0_level_0,optical,temp,humidity,isSkipped,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,139.48,30.505676,71.313477,0,Working,False,False,True,False,True,False,False,False
2,153.61,31.694031,67.053223,0,Working,False,False,False,False,True,False,False,False
3,297.792,32.217712,65.85083,0,Working,False,False,False,False,True,False,True,False
4,125.96,32.429199,64.672852,1,Working,False,False,False,False,False,True,False,False
5,145.36,32.429199,64.672852,1,Working,False,False,False,False,True,False,True,False


In [8]:
# Invert mood boolean values based on "isSkipped"

for mood in moods:
    filtered_df[mood] = np.abs(filtered_df[mood] - filtered_df['isSkipped'])
    #filtered_df[mood] = np.where(filtered_df['isSkipped'] == 1, filtered_df[mood] / 4.0, filtered_df[mood]) #
filtered_df.drop('isSkipped', axis=1, inplace=True)
print('Invert mood values based on "isSkipped" boolean:')
filtered_df.head()

Invert mood values based on "isSkipped" boolean:


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,139.48,30.505676,71.313477,Working,0,0,1,0,1,0,0,0
2,153.61,31.694031,67.053223,Working,0,0,0,0,1,0,0,0
3,297.792,32.217712,65.85083,Working,0,0,0,0,1,0,1,0
4,125.96,32.429199,64.672852,Working,1,1,1,1,1,0,1,1
5,145.36,32.429199,64.672852,Working,1,1,1,1,0,1,0,1


In [9]:
# One-hot encoding for activity
activities = []
for k in filtered_df['activity']:
    activities.append(k)
activities = np.unique(np.array(activities))
for activity in activities:
    activity_values = filtered_df['activity'].astype(str).str.contains(activity)
    activity_values = activity_values.astype(int)
    filtered_df[activity] = activity_values
filtered_df.drop('activity', axis=1, inplace=True)

filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm,Running,Walking,Working
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,139.48,30.505676,71.313477,0,0,1,0,1,0,0,0,0,0,1
2,153.61,31.694031,67.053223,0,0,0,0,1,0,0,0,0,0,1
3,297.792,32.217712,65.85083,0,0,0,0,1,0,1,0,0,0,1
4,125.96,32.429199,64.672852,1,1,1,1,1,0,1,1,0,0,1
5,145.36,32.429199,64.672852,1,1,1,1,0,1,0,1,0,0,1


# Split into train/test datasets

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# label encoding for activity
# le = LabelEncoder()
# filtered_df['activity'] = le.fit_transform(filtered_df['activity'].values)
# display(filtered_df.head())

# split into training & testing
x = filtered_df[['optical', 'temp', 'humidity','Working', 'Running', 'Walking']]
y = filtered_df[['Aggressive', 'Athletic', 'Atmospheric', 'Celebratory', \
                 'Melancholic', 'Elegant', 'Passionate', 'Warm']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=2)

print('Training (x, y): ', x_train.shape, y_train.shape)
print('Testing (x, y): ', x_test.shape, y_test.shape)

# create dfs for training and test data for easy prediction later
train_df = x_train.join(y_train)
test_df = x_test.join(y_test)

print('\nTrain:')
display(train_df.head())
print('Test:')
display(test_df.head())

Training (x, y):  (401, 6) (401, 8)
Testing (x, y):  (101, 6) (101, 8)

Train:


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
3,297.792,32.217712,65.85083,1,0,0,0,0,0,0,1,0,1,0
36,59.925,31.361694,79.626465,1,0,0,1,0,1,1,1,1,1,1
547,222.533333,33.345642,62.168376,1,0,0,0,1,0,0,0,0,0,0
151,1887.84,31.321411,81.719971,0,0,1,1,0,1,1,1,1,1,1
323,235.12,32.731323,80.114746,1,0,0,1,1,1,0,1,1,1,1


Test:


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
199,5458.24,35.541077,65.740967,0,0,1,1,1,0,1,0,1,1,1
551,240.933333,33.360748,61.566162,1,0,0,1,1,1,1,1,1,1,0
262,43.215,29.266968,43.444824,1,0,0,0,1,0,0,0,0,0,0
296,0.86,31.039429,38.787842,1,0,0,1,1,0,1,1,1,1,1
558,267.588571,33.242057,60.681152,1,0,0,1,0,0,0,0,0,0,0


# SVM

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm

# model training
# since predict_proba only works with two classes at a time,
# eight svm models are trained for each mood and stored in the dictionary "svm_moods"
svms_std = {} # key:mood, value:svm model trained on that mood
params = {'C':[0.001, 0.01, 0.1, 1, 10], 'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'decision_function_shape':('ovr', 'ovo')}
for mood in y_train.columns:
    svm_pipe_std = Pipeline([('scaler', StandardScaler()), 
                       ('svm', GridSearchCV(svm.SVC(max_iter=100000, probability=True), params)), ])
    svm_pipe_std.fit(x_train, y_train.loc[:,mood].values)
    svms_std[mood] = svm_pipe_std



In [18]:
svms_mm = {} # key:mood, value:svm model trained on that mood
params = {'C':[0.001, 0.01, 0.1, 1, 10], 'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'decision_function_shape':('ovr', 'ovo')}
for mood in y_train.columns:
    svm_pipe_mm = Pipeline([('scaler', MinMaxScaler()), 
                       ('svm', GridSearchCV(svm.SVC(max_iter=100000, probability=True), params)), ])
    svm_pipe_mm.fit(x_train, y_train.loc[:,mood].values)
    svms_mm[mood] = svm_pipe_mm

In [31]:
# Define functions for prediction and evaluation of SVM model

from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score

from sklearn.preprocessing import normalize

# Predict binary values or confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
def svm_predict(svms, input_data, prob=True):
    pred_df = input_data.copy()
    for mood,svm in svms.items():
        if prob:
            pred = svm.predict_proba(pred_df.iloc[:,:-8].values)
            pred_df[mood] = pred[:,1]
        else:
            pred = svm.predict(pred_df.iloc[:,:-8].values)
            pred_df[mood] = pred.reshape(-1)
    return pred_df
    
# Get loss (MSE) of predicted confidence scores
# Input: DataFrames of actual and predicted moods
def evaluate(df_actual, df_pred, df_pred_proba, clf):
    evaluation_df_dict = {
        'mood': [],
        'accuracy (%s)'%clf: [],
        'precision (%s)'%clf: [],
        'recall (%s)'%clf: [],
        'mse (%s)'%clf: [],
    }
    df_moods_actual = df_actual.iloc[:,-8:]
    df_moods_pred =df_pred.iloc[:,-8:]
    
    #mse
    df_moods_actual_normed = normalize(df_actual.iloc[:,-8:], axis=1)
    df_moods_pred_proba =  normalize(df_pred_proba.iloc[:,-8:], axis=1)
    currCol = 0
    
    for mood in df_moods_actual.columns:
        y_actual, y_pred = df_moods_actual[mood].values, df_moods_pred[mood].values
        evaluation_df_dict['mood'].append(mood)
        evaluation_df_dict['accuracy (%s)'%clf].append(accuracy_score(y_actual, y_pred))
        evaluation_df_dict['precision (%s)'%clf].append(precision_score(y_actual, y_pred, zero_division=0))
        evaluation_df_dict['recall (%s)'%clf].append(recall_score(y_actual, y_pred))
        
        evaluation_df_dict['mse (%s)'%clf].append(mean_squared_error(df_moods_actual_normed[currCol], df_moods_pred_proba[currCol]))
        currCol += 1
    evaluation_df = pd.DataFrame(evaluation_df_dict)
    evaluation_df.set_index('mood', inplace=True)
    return evaluation_df

In [32]:
train_pred_svm_df = svm_predict(svms_std, train_df, prob=False)
test_pred_svm_df = svm_predict(svms_std, test_df, prob=False)
train_prob_svm_df = svm_predict(svms_std, train_df, prob=True)
test_prob_svm_df = svm_predict(svms_std, test_df, prob=True)

# Show predicted values and confidence scores for moods for random samples
print('Test data mood labels (Actual):')
display(test_df.head())

print('Test data mood labels (Predicted):')
display(test_pred_svm_df.head())

print('Test data mood labels (Confidence Scores):')
display(test_prob_svm_df.head())

# Show evaluation of predictions
train_eval_svm_df = evaluate(train_df, train_pred_svm_df, train_prob_svm_df, 'svm,std')
test_eval_svm_df = evaluate(test_df, test_pred_svm_df, test_prob_svm_df, 'svm,std')
display(test_eval_svm_df)
display(test_eval_svm_df.describe())

# # Show MSE of confidence scores
# print('MSE (train): ', evaluate_mse(train_df, train_pred_svm_df))
# print('MSE (test): ', evaluate_mse(test_df, test_pred_svm_df))

# # Show predicted confidence scores for moods for different activities
# print('\nPredicted moods for different activities:')
# for activity in le.classes_:
#     display(test_pred_svm_df[test_pred_svm_df['activity']==activity].head())

Test data mood labels (Actual):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
199,5458.24,35.541077,65.740967,0,0,1,1,1,0,1,0,1,1,1
551,240.933333,33.360748,61.566162,1,0,0,1,1,1,1,1,1,1,0
262,43.215,29.266968,43.444824,1,0,0,0,1,0,0,0,0,0,0
296,0.86,31.039429,38.787842,1,0,0,1,1,0,1,1,1,1,1
558,267.588571,33.242057,60.681152,1,0,0,1,0,0,0,0,0,0,0


Test data mood labels (Predicted):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
199,5458.24,35.541077,65.740967,0,0,1,0,0,0,1,0,0,1,0
551,240.933333,33.360748,61.566162,1,0,0,0,0,0,0,0,0,0,0
262,43.215,29.266968,43.444824,1,0,0,0,0,0,0,0,0,0,0
296,0.86,31.039429,38.787842,1,0,0,0,0,0,0,1,0,0,0
558,267.588571,33.242057,60.681152,1,0,0,0,0,0,0,0,0,0,0


Test data mood labels (Confidence Scores):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
199,5458.24,35.541077,65.740967,0,0,1,0.424773,0.438609,0.440362,0.608712,0.471591,0.422146,0.589217,0.466918
551,240.933333,33.360748,61.566162,1,0,0,0.353626,0.440614,0.464293,0.413031,0.439711,0.453879,0.411682,0.436654
262,43.215,29.266968,43.444824,1,0,0,0.353629,0.44119,0.466303,0.420765,0.462163,0.453193,0.41161,0.436665
296,0.86,31.039429,38.787842,1,0,0,0.353642,0.441022,0.461617,0.404158,0.5,0.453643,0.411625,0.439964
558,267.588571,33.242057,60.681152,1,0,0,0.353638,0.440847,0.464257,0.411829,0.440134,0.453924,0.411678,0.43689


Unnamed: 0_level_0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)"
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0.623762,0.571429,0.2,0.031719
Athletic,0.584158,0.714286,0.294118,0.017593
Atmospheric,0.49505,0.0,0.0,0.15974
Celebratory,0.544554,0.653846,0.314815,0.019367
Depressive,0.485149,0.5,0.096154,0.176948
Elegant,0.534653,0.0,0.0,0.016817
Passionate,0.455446,0.352941,0.12,0.172957
Warm,0.60396,0.75,0.195652,0.017911


Unnamed: 0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)"
count,8.0,8.0,8.0,8.0
mean,0.540842,0.442813,0.152592,0.076631
std,0.060094,0.300631,0.120276,0.077512
min,0.455446,0.0,0.0,0.016817
25%,0.492574,0.264706,0.072115,0.017831
50%,0.539604,0.535714,0.157826,0.025543
75%,0.589109,0.668956,0.223529,0.163044
max,0.623762,0.75,0.314815,0.176948


In [33]:
train_mm_pred = svm_predict(svms_mm, train_df, prob=False)
test_mm_pred = svm_predict(svms_mm, test_df, prob=False)
train_mm_prob = svm_predict(svms_mm, train_df, prob=True)
test_mm_prob = svm_predict(svms_mm, test_df, prob=True)

# Show evaluation of predictions
train_eval_mm = evaluate(train_df, train_mm_pred, train_mm_prob, 'svm,mm')
test_eval_mm = evaluate(test_df, test_mm_pred, train_mm_prob, 'svm,mm')
display(test_eval_mm)
display(test_eval_mm.describe())

Unnamed: 0_level_0,"accuracy (svm,mm)","precision (svm,mm)","recall (svm,mm)","mse (svm,mm)"
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0.643564,0.666667,0.2,0.03603
Athletic,0.643564,0.8,0.392157,0.02068
Atmospheric,0.49505,0.0,0.0,0.163045
Celebratory,0.544554,0.653846,0.314815,0.013182
Depressive,0.465347,0.416667,0.096154,0.17655
Elegant,0.524752,0.461538,0.12766,0.017397
Passionate,0.465347,0.357143,0.1,0.176011
Warm,0.514851,0.459459,0.369565,0.015459


Unnamed: 0,"accuracy (svm,mm)","precision (svm,mm)","recall (svm,mm)","mse (svm,mm)"
count,8.0,8.0,8.0,8.0
mean,0.537129,0.476915,0.200044,0.077294
std,0.071151,0.244265,0.143863,0.078721
min,0.465347,0.0,0.0,0.013182
25%,0.487624,0.401786,0.099038,0.016913
50%,0.519802,0.460499,0.16383,0.028355
75%,0.569307,0.657051,0.328502,0.166286
max,0.643564,0.8,0.392157,0.17655


# Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

# params = {'n_estimators':[10,50,100,150,200,500], 'criterion':('gini', 'entropy'), 'max_depth':[k for k in range(1,21)]}
params = {'criterion':('gini', 'entropy'), 'max_depth':[k for k in range(1,21)]}
rf = GridSearchCV(RandomForestClassifier(random_state=0), params)
rf.fit(x_train, y_train)
rf.best_params_

{'criterion': 'gini', 'max_depth': 18}

In [28]:
# Predict binary values or confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
# Returns confidence scores if prob=True else binary values
def rf_predict(rf, input_data, prob=True):
    pred_df = input_data.copy()
    data = input_data.iloc[:,:-8]
    target = input_data.iloc[:,-8:]
    if prob:
        pred = np.array(rf.predict_proba(data.values))
    else:
        pred = np.array(rf.predict(data.values))
    for i in range(len(target.columns)):
        mood = target.columns[i]
        if prob:
            pred_df[mood] = pred[i,:,1] # predict_proba returns shape (n_features, n_samples, probs)
        else:
            pred_df[mood] = pred[:,i] # predict returns shape (n_samples, n_features)
    return pred_df

In [39]:
def rf_predict(rf, input_data, prob=True):
    pred_df = input_data.copy()
    data = input_data.iloc[:,:-8]
    target = input_data.iloc[:,-8:]
    if prob:
        pred = np.array(rf.predict_proba(data.values))
    else:
        pred = np.array(rf.predict(data.values))
    for i in range(len(target.columns)):
        mood = target.columns[i]
        if prob:
            pred_df[mood] = pred[i,:,1] # predict_proba returns shape (n_features, n_samples, probs)
        else:
            pred_df[mood] = pred[:,i] # predict returns shape (n_samples, n_features)
    return pred_df

train_pred_rf_df = rf_predict(rf, train_df, prob=False)
test_pred_rf_df = rf_predict(rf, test_df, prob=False)
train_prob_rf_df = rf_predict(rf, train_df, prob=True)
test_prob_rf_df = rf_predict(rf, test_df, prob=True)

# Show predicted values and confidence scores for moods for random samples
print('Test data mood labels (Actual):')
test_df_copy = test_df.copy()
#test_df_copy['activity'] = [activity_cats[k] for k in test_df_copy['activity'].values]
# test_df_copy['activity'] = le.inverse_transform(test_df_copy['activity'].values) # convert activity values back to strings
display(test_df.head())

print('Test data mood labels (Predicted):')
# test_pred_rf_df['activity'] = le.inverse_transform(test_pred_rf_df['activity'].values)
#test_pred_rf_df['activity'] = [activity_cats[k] for k in test_pred_rf_df['activity'].values]
display(test_pred_rf_df.head())

print('Test data mood labels (Confidence Scores):')
#test_prob_rf_df['activity'] = [activity_cats[k] for k in test_prob_rf_df['activity'].values]
# test_pred_proba_rf_df['activity'] = le.inverse_transform(test_pred_proba_rf_df['activity'].values)
display(test_prob_rf_df.head())

# Show evaluation of predictions
train_eval_rf_df = evaluate(train_df, train_pred_rf_df, train_prob_rf_df, 'rf')
test_eval_rf_df = evaluate(test_df, test_pred_rf_df, test_prob_rf_df, 'rf')
display(test_eval_rf_df)
display(test_eval_rf_df.describe())



# Normalize, just for visualization (cosine measure already does it)
from sklearn.preprocessing import normalize
NORMAL_MOODS=['Aggressive', 'Athletic', 'Atmospheric', 'Celebratory', \
                 'Melancholic', 'Elegant', 'Passionate', 'Warm']
test_prob_rf_df[NORMAL_MOODS] = normalize(test_prob_rf_df[NORMAL_MOODS].to_numpy(), axis=1)

print('Test data mood labels (Confidence Scores, Postprocessed):')
display(test_prob_rf_df.head(30))

Test data mood labels (Actual):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
199,5458.24,35.541077,65.740967,0,0,1,1,1,0,1,0,1,1,1
551,240.933333,33.360748,61.566162,1,0,0,1,1,1,1,1,1,1,0
262,43.215,29.266968,43.444824,1,0,0,0,1,0,0,0,0,0,0
296,0.86,31.039429,38.787842,1,0,0,1,1,0,1,1,1,1,1
558,267.588571,33.242057,60.681152,1,0,0,1,0,0,0,0,0,0,0


Test data mood labels (Predicted):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
199,5458.24,35.541077,65.740967,0,0,1,1,0,1,1,1,1,1,1
551,240.933333,33.360748,61.566162,1,0,0,0,0,0,0,0,0,0,0
262,43.215,29.266968,43.444824,1,0,0,0,0,1,0,1,0,0,0
296,0.86,31.039429,38.787842,1,0,0,1,0,1,1,1,1,1,1
558,267.588571,33.242057,60.681152,1,0,0,0,1,0,0,0,0,0,0


Test data mood labels (Confidence Scores):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
199,5458.24,35.541077,65.740967,0,0,1,0.58,0.12,0.58,0.65,0.84,0.58,0.64,0.51
551,240.933333,33.360748,61.566162,1,0,0,0.165,0.135,0.12125,0.3225,0.14625,0.30125,0.305,0.345
262,43.215,29.266968,43.444824,1,0,0,0.026038,0.042058,0.752673,0.036519,0.641904,0.112096,0.068058,0.243808
296,0.86,31.039429,38.787842,1,0,0,0.725,0.152667,0.847667,0.817667,0.852667,0.995,0.73,0.847667
558,267.588571,33.242057,60.681152,1,0,0,0.07,0.73,0.16,0.18,0.17,0.1,0.06,0.05


Unnamed: 0_level_0,accuracy (rf),precision (rf),recall (rf),mse (rf)
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0.554455,0.413793,0.3,0.062853
Athletic,0.564356,0.606061,0.392157,0.047684
Atmospheric,0.49505,0.5,0.333333,0.239783
Celebratory,0.623762,0.681818,0.555556,0.031517
Depressive,0.455446,0.463415,0.365385,0.228139
Elegant,0.534653,0.5,0.340426,0.105539
Passionate,0.465347,0.441176,0.3,0.221668
Warm,0.60396,0.588235,0.434783,0.070059


Unnamed: 0,accuracy (rf),precision (rf),recall (rf),mse (rf)
count,8.0,8.0,8.0,8.0
mean,0.537129,0.524312,0.377705,0.125905
std,0.061662,0.092268,0.085092,0.088743
min,0.455446,0.413793,0.3,0.031517
25%,0.487624,0.457855,0.325,0.059061
50%,0.544554,0.5,0.352905,0.087799
75%,0.574257,0.592692,0.402813,0.223285
max,0.623762,0.681818,0.555556,0.239783


Test data mood labels (Confidence Scores, Postprocessed):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
199,5458.24,35.541077,65.740967,0,0,1,0.345299,0.071441,0.345299,0.386973,0.500089,0.345299,0.38102,0.303625
551,240.933333,33.360748,61.566162,1,0,0,0.236113,0.193183,0.173507,0.461493,0.209282,0.431084,0.43645,0.49369
262,43.215,29.266968,43.444824,1,0,0,0.025303,0.04087,0.731414,0.035488,0.623773,0.10893,0.066135,0.236921
296,0.86,31.039429,38.787842,1,0,0,0.32737,0.068936,0.382759,0.369213,0.385017,0.449286,0.329627,0.382759
558,267.588571,33.242057,60.681152,1,0,0,0.087445,0.91193,0.199875,0.22486,0.212367,0.124922,0.074953,0.062461
512,399.2,31.166992,79.626465,1,0,0,0.150376,0.659985,0.651631,0.200502,0.175439,0.175439,0.083542,0.091897
358,5570.88,31.039429,66.339111,0,0,1,0.113329,0.362654,0.067998,0.113329,0.124662,0.113329,0.895303,0.090664
106,381.12,32.922668,61.92627,1,0,0,0.263801,0.263801,0.192355,0.824378,0.008244,0.255557,0.096177,0.263801
2,153.61,31.694031,67.053223,1,0,0,0.380338,0.349937,0.351231,0.323417,0.227039,0.28202,0.294956,0.535578
167,292.8,34.262085,58.02002,1,0,0,0.344838,0.389334,0.344838,0.422705,0.355962,0.3504,0.422705,0.061181


In [31]:
evaluate_df = pd.concat([test_eval_svm_df, test_eval_mm, test_eval_rf_df], axis=1)
display(evaluate_df)
display(evaluate_df.describe())

Unnamed: 0_level_0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)","accuracy (svm,mm)","precision (svm,mm)","recall (svm,mm)","mse (svm,mm)",accuracy (rf),precision (rf),recall (rf),mse (rf)
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Aggressive,0.631579,0.818182,0.214286,0.368421,0.652632,0.8,0.285714,0.347368,0.547368,0.485714,0.404762,0.452632
Athletic,0.589474,0.666667,0.340426,0.410526,0.578947,0.62963,0.361702,0.421053,0.536842,0.528302,0.595745,0.463158
Atmospheric,0.494737,0.0,0.0,0.505263,0.494737,0.0,0.0,0.505263,0.505263,0.511628,0.458333,0.494737
Celebratory,0.610526,0.65625,0.446809,0.389474,0.621053,0.657143,0.489362,0.378947,0.6,0.591837,0.617021,0.4
Depressive,0.494737,0.470588,0.347826,0.505263,0.515789,0.5,0.326087,0.484211,0.410526,0.407407,0.478261,0.589474
Elegant,0.452632,0.0,0.0,0.547368,0.452632,0.0,0.0,0.547368,0.463158,0.511628,0.423077,0.536842
Passionate,0.6,0.461538,0.162162,0.4,0.6,0.466667,0.189189,0.4,0.505263,0.395833,0.513514,0.494737
Warm,0.547368,0.666667,0.045455,0.452632,0.536842,0.0,0.0,0.463158,0.568421,0.529412,0.613636,0.431579


Unnamed: 0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)","accuracy (svm,mm)","precision (svm,mm)","recall (svm,mm)","mse (svm,mm)",accuracy (rf),precision (rf),recall (rf),mse (rf)
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,0.552632,0.467486,0.19462,0.447368,0.556579,0.38168,0.206507,0.443421,0.517105,0.49522,0.513044,0.482895
std,0.065375,0.310552,0.172285,0.065375,0.068088,0.331794,0.190075,0.068088,0.06019,0.065264,0.086017,0.06019
min,0.452632,0.0,0.0,0.368421,0.452632,0.0,0.0,0.347368,0.410526,0.395833,0.404762,0.4
25%,0.494737,0.346154,0.034091,0.397368,0.510526,0.0,0.0,0.394737,0.494737,0.466138,0.449519,0.447368
50%,0.568421,0.563419,0.188224,0.431579,0.557895,0.483333,0.237452,0.442105,0.521053,0.511628,0.495887,0.478947
75%,0.602632,0.666667,0.342276,0.505263,0.605263,0.636508,0.334991,0.489474,0.552632,0.528579,0.600218,0.505263
max,0.631579,0.818182,0.446809,0.547368,0.652632,0.8,0.489362,0.547368,0.6,0.591837,0.617021,0.589474


## Save Model

In [32]:
import pickle
pickle.dump(rf, open('RandomForest', 'wb'))