# Data cleaning

In [1]:
import pandas as pd
import numpy as np
import random
import json
from tensorflow.keras.models import Model, load_model
# import pycaret

# Replace double quotes with single quotes for moods JSON arrays
f = open('song_data.csv','r+')
text = f.read()
text = text.replace('"["', '"[?').replace('", "', '?,?').replace('"]"', '?]"') # Changed to "[$Happy$,$Sad$]" for easy replacing later
f.seek(0)
f.write(text)
f.close()

# Importing data
df = pd.read_csv('song_data.csv', index_col=0)
df.drop('uuid', axis=1, inplace=True)
df.dropna(inplace=True) # drop rows with nan values
for col in df.columns:
    if col not in ['id','isSkipped']:
        if col == 'moods':
            df[col] = df[col].apply(lambda x:x.replace('?','"')) # Replaces mood values back to the form ["Happy","Sad"] so it can be loaded by json
        df[col] = df[col].apply(json.loads)
df['activity'] = df.apply(lambda _: '', axis=1) # empty activity column
print('Number of samples: ', df.shape[0])
df.head()

Number of samples:  525


Unnamed: 0_level_0,gyroX,gyroY,gyroZ,accelX,accelY,accelZ,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,"[499.30572509765625, 499.53460693359375, 0.198...","[1.861572265625, 2.49481201171875, 1.022338867...","[1.24359130859375, 1.2359619140625, 1.06048583...","[1.1943359375, 1.201171875, 1.1845703125, 1.18...","[1.1455078125, 1.1591796875, 1.1630859375, 1.1...","[3.68359375, 3.654296875, 3.6748046875, 3.6650...","[139.64, 138.36, 139.64, 140.28]","[30.50567626953125, 30.50567626953125, 30.5056...","[71.3134765625, 71.3134765625, 71.3134765625]","[Depressive, Atmospheric]",0,
2,"[498.1765747070313, 0.98419189453125, 1.579284...","[32.27996826171875, 14.7247314453125, 9.864807...","[497.9248046875, 496.368408203125, 494.9645996...","[2.9326171875, 2.9345703125, 2.728515625, 2.64...","[0.8466796875, 0.74609375, 0.865234375, 15.524...","[2.7548828125, 2.8037109375, 2.806640625, 3.34...","[123.24, 123.24, 139.32, 228.64]","[31.69403076171875, 31.69403076171875, 31.6940...","[67.05322265625, 67.05322265625, 67.0532226562...",[Depressive],0,
3,"[15.76995849609375, 10.65826416015625, 6.87408...","[488.36517333984375, 486.5798950195313, 496.92...","[3.86810302734375, 5.0811767578125, 498.947143...","[14.0107421875, 14.3212890625, 14.232421875, 1...","[14.96484375, 15.212890625, 15.275390625, 15.1...","[3.2021484375, 3.3291015625, 3.375, 3.35644531...","[256.08, 307.84000000000003, 315.2, 301.36, 30...","[32.21771240234375, 32.21771240234375, 32.2177...","[65.850830078125, 65.850830078125, 65.85083007...","[Passionate, Depressive]",0,
4,"[499.93896484375, 499.45068359375, 499.7482299...","[1.82342529296875, 2.74658203125, 1.8844604492...","[1.57928466796875, 1.434326171875, 1.365661621...","[1.7353515625, 1.708984375, 1.7333984375, 1.71...","[13.7841796875, 13.80078125, 13.7744140625, 13...","[2.8232421875, 2.8369140625, 2.8154296875, 2.8...","[127.08, 126.76, 125.48, 124.52]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...",[Elegant],1,
5,"[499.9465942382813, 0.03814697265625, 499.7482...","[2.01416015625, 1.77001953125, 1.7852783203125...","[1.1444091796875, 1.2359619140625, 1.129150390...","[0.4580078125, 0.4609375, 0.453125, 0.43847656...","[13.181640625, 13.1689453125, 13.1787109375, 1...","[2.6806640625, 2.693359375, 2.6875, 2.68652343...","[145.76, 144.48, 146.4, 144.8]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...","[Passionate, Depressive]",1,


In [2]:
data = df.iloc[0,:]
display(data)
# print(data['temp'])

gyroX        [499.30572509765625, 499.53460693359375, 0.198...
gyroY        [1.861572265625, 2.49481201171875, 1.022338867...
gyroZ        [1.24359130859375, 1.2359619140625, 1.06048583...
accelX       [1.1943359375, 1.201171875, 1.1845703125, 1.18...
accelY       [1.1455078125, 1.1591796875, 1.1630859375, 1.1...
accelZ       [3.68359375, 3.654296875, 3.6748046875, 3.6650...
optical                       [139.64, 138.36, 139.64, 140.28]
temp         [30.50567626953125, 30.50567626953125, 30.5056...
humidity         [71.3134765625, 71.3134765625, 71.3134765625]
moods                                [Depressive, Atmospheric]
isSkipped                                                    0
activity                                                      
Name: 1, dtype: object

In [3]:
# Filtering defective data

defective_ids = []
for idx,row in df.iterrows():
    # defective if temp array only has -40 values
    # defective if any humidity values are above 99.99
    if len([k for k in row['temp'] if k==-40]) == len(row['temp']) or \
    len([k for k in row['humidity'] if k>99]) == len(row['humidity']):
        defective_ids.append(idx)
    
    # if only some values are defective, keep the row, but remove defective values
    # remove -40 temp values and >99.99 humidity values
    elif (-40 in row['temp']) or len([k for k in row['humidity'] if k>99]) or (0 in row['optical']):
        df.at[idx,'temp'] = [k for k in row['temp'] if k!=-40]
        df.at[idx,'humidity'] = [k for k in row['humidity'] if k<=99]
        
    # some gyro/accel data have 40 samples
    # take the last 30 samples for these rows
    for col in df.columns[:6]:
        if len(row[col]) > 30:
            df.at[idx,col] = row[col][-30:]

filtered_df = df[~df.index.isin(defective_ids)].copy() # .copy() to avoid warning
print('%d defective rows: ' % len(defective_ids),defective_ids)

23 defective rows:  [21, 22, 33, 214, 236, 238, 245, 246, 247, 248, 249, 250, 251, 252, 386, 387, 388, 389, 390, 391, 392, 393, 394]


In [4]:
motion_model_path = 'firstModel_stackedLSTM.hd5'
model = load_model(motion_model_path)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 30, 128)           69120     
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 64)            49408     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense (Dense)                (None, 100)               3300      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 134,547
Trainable params: 134,547
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Add activity from motion recognition model
x = [list(k) for k in filtered_df.iloc[:,:6].values]
x = np.array(x) # (num_samples, 6, 30)
x = np.array([k.T for k in x]) # reshape as (num_samples, 30, 6)

pred = model.predict(x)

activity_cats = np.array(['Running', 'Walking', 'Working']) # hardcoded categories from 'Physical Activity Classification.ipynb'
filtered_df['activity'] = activity_cats[np.argmax(pred, axis=1)]
#filtered_df['activity'] = np.argmax(pred, axis=1)
filtered_df = filtered_df.iloc[:,6:].copy() # drop gyro and accel columns

filtered_df.sample(5)

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
336,"[241.6, 240, 240.96, 245.76]","[32.76153564453125, 32.76153564453125, 32.7615...","[79.62646484375, 79.62646484375, 79.6264648437...",[Elegant],0,Working
222,"[0.08, 0.56, 0.16, 0.4]","[31.0394287109375, 31.0394287109375, 31.039428...","[76.3671875, 76.3671875, 76.3671875]",[Passionate],0,Walking
101,"[202.08, 203.36, 202.4, 203.04]","[33.1341552734375, 33.1341552734375, 33.134155...","[61.468505859375, 61.468505859375, 61.46850585...",[Aggressive],0,Working
567,"[50.9, 51.38, 51.7]","[31.51275634765625, 31.51275634765625, 31.5228...","[73.016357421875, 73.016357421875, 72.91870117...",[Celebratory],0,Working
191,"[2255.36, 2316.8, 2330.88, 2393.6]","[33.48663330078125, 33.48663330078125, 33.4866...","[82.09228515625, 82.09228515625, 82.09228515625]","[Celebratory, Passionate]",0,Running


In [6]:
# Obtain mean optical, temp and humidity values

for col in filtered_df.columns:
    if col not in ['moods','isSkipped','activity']:
        filtered_df[col] = filtered_df[col].apply(np.mean)
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,139.48,30.505676,71.313477,"[Depressive, Atmospheric]",0,Working
2,153.61,31.694031,67.053223,[Depressive],0,Working
3,297.792,32.217712,65.85083,"[Passionate, Depressive]",0,Working
4,125.96,32.429199,64.672852,[Elegant],1,Working
5,145.36,32.429199,64.672852,"[Passionate, Depressive]",1,Working


In [7]:
# One-hot encoding for moods

moods = []
for k in filtered_df['moods'].values:
    moods += list(k)
moods = np.unique(np.array(moods))
for mood in moods:
    mood_values = filtered_df['moods'].astype(str).str.contains(mood)
    filtered_df[mood] = mood_values
filtered_df.drop('moods', axis=1, inplace=True)
print('Added one-hot encoded columns for moods:')
filtered_df.head()

Added one-hot encoded columns for moods:


Unnamed: 0_level_0,optical,temp,humidity,isSkipped,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,139.48,30.505676,71.313477,0,Working,False,False,True,False,True,False,False,False
2,153.61,31.694031,67.053223,0,Working,False,False,False,False,True,False,False,False
3,297.792,32.217712,65.85083,0,Working,False,False,False,False,True,False,True,False
4,125.96,32.429199,64.672852,1,Working,False,False,False,False,False,True,False,False
5,145.36,32.429199,64.672852,1,Working,False,False,False,False,True,False,True,False


In [8]:
# Invert mood boolean values based on "isSkipped"

ANTI_MOODS = ['Not Aggressive', 'Not Athletic', 'Not Atmospheric', 'Not Celebratory',\
             'Not Melancholic', 'Not Elegant', 'Not Passionate', 'Not Warm']

filtered_df[ANTI_MOODS] = filtered_df[['Aggressive', 'Athletic', 'Atmospheric', 'Celebratory', \
                                       'Melancholic', 'Elegant', 'Passionate', 'Warm']].copy()

for mood in moods:
    filtered_df[mood] = np.where(filtered_df['isSkipped'], 0, filtered_df[mood]) # remove isSkipped from normal decisions

for mood in ANTI_MOODS:
    filtered_df[mood] = np.where(filtered_df['isSkipped'], filtered_df[mood], 0) # and non-isSkipped from skip decisions

filtered_df.drop('isSkipped', axis=1, inplace=True)

print('Invert mood values based on "isSkipped" boolean:')
filtered_df.head()

Invert mood values based on "isSkipped" boolean:


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm,Not Aggressive,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,139.48,30.505676,71.313477,Working,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
2,153.61,31.694031,67.053223,Working,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,297.792,32.217712,65.85083,Working,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0
4,125.96,32.429199,64.672852,Working,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
5,145.36,32.429199,64.672852,Working,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [9]:
# One-hot encoding for activity
activities = []
for k in filtered_df['activity']:
    activities.append(k)
activities = np.unique(np.array(activities))
for activity in activities:
    activity_values = filtered_df['activity'].astype(str).str.contains(activity)
    activity_values = activity_values.astype(int)
    filtered_df[activity] = activity_values
filtered_df.drop('activity', axis=1, inplace=True)

filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,...,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm,Running,Walking,Working
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,139.48,30.505676,71.313477,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,153.61,31.694031,67.053223,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,297.792,32.217712,65.85083,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
4,125.96,32.429199,64.672852,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
5,145.36,32.429199,64.672852,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,0,0,1


# Split into train/test datasets

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# label encoding for activity
# le = LabelEncoder()
# filtered_df['activity'] = le.fit_transform(filtered_df['activity'].values)
# display(filtered_df.head())

# split into training & testing
x = filtered_df[['optical', 'temp', 'humidity','Working', 'Running', 'Walking']]
y = filtered_df[['Aggressive', 'Athletic', 'Atmospheric', 'Celebratory', \
                 'Melancholic', 'Elegant', 'Passionate', 'Warm', \
                 'Not Aggressive', 'Not Athletic', 'Not Atmospheric', 'Not Celebratory',\
                 'Not Melancholic', 'Not Elegant', 'Not Passionate', 'Not Warm']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

print('Training (x, y): ', x_train.shape, y_train.shape)
print('Testing (x, y): ', x_test.shape, y_test.shape)

# create dfs for training and test data for easy prediction later
train_df = x_train.join(y_train)
test_df = x_test.join(y_test)

print('\nTrain:')
display(train_df.head())
print('Test:')
display(test_df.head())

Training (x, y):  (401, 6) (401, 16)
Testing (x, y):  (101, 6) (101, 16)

Train:


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,...,Passionate,Warm,Not Aggressive,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
301,0.46,30.968933,35.168457,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
86,74.42,34.131165,75.98877,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
571,56.4,32.76825,67.089844,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
7,211.18,32.499695,63.62915,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1
508,316.4,31.099854,79.947917,1,0,0,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0


Test:


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,...,Passionate,Warm,Not Aggressive,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
107,376.96,32.922668,61.92627,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
114,281.28,33.808899,59.564209,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
372,27973.12,38.139343,55.944824,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
379,1597.92,38.139343,55.944824,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
380,7883.84,38.139343,55.944824,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


# SVM

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm

# model training
# since predict_proba only works with two classes at a time,
# eight svm models are trained for each mood and stored in the dictionary "svm_moods"
svms_std = {} # key:mood, value:svm model trained on that mood
params = {'C':[0.001, 0.01, 0.1, 1, 10], 'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'decision_function_shape':('ovr', 'ovo')}
for mood in y_train.columns:
    svm_pipe_std = Pipeline([('scaler', StandardScaler()), 
                       ('svm', GridSearchCV(svm.SVC(max_iter=100000, probability=True), params)), ])
    svm_pipe_std.fit(x_train, y_train.loc[:,mood].values)
    svms_std[mood] = svm_pipe_std



In [16]:
# Define functions for prediction and evaluation of SVM model

from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score

from sklearn.preprocessing import normalize

# Predict binary values or confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
def svm_predict(svms, input_data, prob=True):
    pred_df = input_data.copy()
    for mood,svm in svms.items():
        if prob:
            pred = svm.predict_proba(pred_df.iloc[:,:-16].values)
            pred_df[mood] = pred[:,1]
        else:
            pred = svm.predict(pred_df.iloc[:,:-16].values)
            pred_df[mood] = pred.reshape(-1)
    return pred_df
    
# Get loss (MSE) of predicted confidence scores
# Input: DataFrames of actual and predicted moods
def evaluate(df_actual, df_pred, df_pred_proba, clf):
    evaluation_df_dict = {
        'mood': [],
        'accuracy (%s)'%clf: [],
        'precision (%s)'%clf: [],
        'recall (%s)'%clf: [],
        'mse (%s)'%clf: [],
    }
    df_moods_actual = df_actual.iloc[:,-16:]
    df_moods_pred = df_pred.iloc[:,-16:]
    
    #mse
    df_moods_actual_normed = normalize(df_actual.iloc[:,-16:], axis=1)
    df_moods_pred_proba =  normalize(df_pred_proba.iloc[:,-16:], axis=1)
    currCol = 0
    
    for mood in df_moods_actual.columns:
        y_actual, y_pred = df_moods_actual[mood].values, df_moods_pred[mood].values
        evaluation_df_dict['mood'].append(mood)
        evaluation_df_dict['accuracy (%s)'%clf].append(accuracy_score(y_actual, y_pred))
        evaluation_df_dict['precision (%s)'%clf].append(precision_score(y_actual, y_pred, zero_division=0))
        evaluation_df_dict['recall (%s)'%clf].append(recall_score(y_actual, y_pred))
        
        evaluation_df_dict['mse (%s)'%clf].append(mean_squared_error(df_moods_actual_normed[currCol], df_moods_pred_proba[currCol]))
        currCol += 1
    evaluation_df = pd.DataFrame(evaluation_df_dict)
    evaluation_df.set_index('mood', inplace=True)
    return evaluation_df

In [17]:
train_pred_svm_df = svm_predict(svms_std, train_df, prob=False)
test_pred_svm_df = svm_predict(svms_std, test_df, prob=False)
train_prob_svm_df = svm_predict(svms_std, train_df, prob=True)
test_prob_svm_df = svm_predict(svms_std, test_df, prob=True)

# Show predicted values and confidence scores for moods for random samples
print('Test data mood labels (Actual):')
#display(test_df.head())

print('Test data mood labels (Predicted):')
#display(test_pred_svm_df.head())

print('Test data mood labels (Confidence Scores):')
display(test_prob_svm_df.head())

#Postprocess, normal categories - probabilities of skip categories

NORMAL_MOODS = ['Aggressive', 'Athletic', 'Atmospheric', 'Celebratory', \
                  'Melancholic', 'Elegant', 'Passionate', 'Warm']

test_prob_svm_df[NORMAL_MOODS] = test_prob_svm_df[NORMAL_MOODS].to_numpy() - test_prob_svm_df[ANTI_MOODS].to_numpy()
test_prob_svm_df[ANTI_MOODS] = 0

print('Test data mood labels (Confidence Scores, Postprocessed):')
display(test_prob_svm_df.head())

# Show evaluation of predictions
train_eval_svm_df = evaluate(train_df, train_pred_svm_df, train_prob_svm_df, 'svm,std')
test_eval_svm_df = evaluate(test_df, test_pred_svm_df, test_prob_svm_df, 'svm,std')
display(test_eval_svm_df)
display(test_eval_svm_df.describe())

Test data mood labels (Actual):
Test data mood labels (Predicted):
Test data mood labels (Confidence Scores):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,...,Passionate,Warm,Not Aggressive,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
107,376.96,32.922668,61.92627,1,0,0,0.054789,0.075065,0.077385,0.071421,...,0.14095,0.121034,0.088529,0.052609,0.085749,0.036308,0.079021,0.058148,0.096257,0.09263
114,281.28,33.808899,59.564209,1,0,0,0.054844,0.0766,0.078249,0.071168,...,0.140552,0.120339,0.089937,0.050347,0.085205,0.036305,0.082092,0.047148,0.097002,0.092793
372,27973.12,38.139343,55.944824,0,0,1,0.054875,0.094838,0.119031,0.477409,...,0.135338,0.096231,0.085647,0.052856,0.080622,0.036303,0.068153,0.076562,0.097623,0.085401
379,1597.92,38.139343,55.944824,0,0,1,0.055103,0.088748,0.094088,0.222504,...,0.13605,0.110344,0.093706,0.05016,0.081277,0.036309,0.060516,0.083684,0.103217,0.089004
380,7883.84,38.139343,55.944824,0,0,1,0.055049,0.090167,0.099569,0.021985,...,0.13588,0.106824,0.091726,0.05079,0.081121,0.036308,0.062142,0.083102,0.101858,0.088133


Test data mood labels (Confidence Scores, Postprocessed):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,...,Passionate,Warm,Not Aggressive,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
107,376.96,32.922668,61.92627,1,0,0,-0.03374,0.022455,-0.008364,0.035113,...,0.044692,0.028405,0,0,0,0,0,0,0,0
114,281.28,33.808899,59.564209,1,0,0,-0.035094,0.026254,-0.006955,0.034862,...,0.043551,0.027547,0,0,0,0,0,0,0,0
372,27973.12,38.139343,55.944824,0,0,1,-0.030772,0.041982,0.038409,0.441106,...,0.037715,0.01083,0,0,0,0,0,0,0,0
379,1597.92,38.139343,55.944824,0,0,1,-0.038603,0.038588,0.01281,0.186194,...,0.032833,0.021339,0,0,0,0,0,0,0,0
380,7883.84,38.139343,55.944824,0,0,1,-0.036677,0.039376,0.018449,-0.014323,...,0.034021,0.018691,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)"
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0.960396,0.0,0.0,0.072856
Athletic,0.90099,0.0,0.0,0.087572
Atmospheric,0.841584,0.0,0.0,0.116422
Celebratory,0.861386,0.5,0.071429,0.111872
Depressive,0.920792,0.0,0.0,0.100643
Elegant,0.891089,0.0,0.0,0.068033
Passionate,0.920792,0.0,0.0,0.151546
Warm,0.910891,0.0,0.0,0.091547
Not Aggressive,0.940594,0.0,0.0,0.167351
Not Athletic,0.960396,0.0,0.0,0.067938


Unnamed: 0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)"
count,16.0,16.0,16.0,16.0
mean,0.915223,0.03125,0.004464,0.113073
std,0.036148,0.125,0.017857,0.031953
min,0.841584,0.0,0.0,0.067938
25%,0.891089,0.0,0.0,0.090553
50%,0.920792,0.0,0.0,0.114147
75%,0.940594,0.0,0.0,0.125
max,0.970297,0.5,0.071429,0.17016


# Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

params = {
    'criterion':('gini', 'entropy'),
}

# About the same, can swap back to try. DecisionTreeClassifier much faster
rf = GridSearchCV(RandomForestClassifier(max_depth=7, random_state=0), params)
#rf = GridSearchCV(DecisionTreeClassifier(max_depth=7), params)
rf.fit(x_train, y_train)
rf.best_params_

{'criterion': 'entropy'}

In [19]:
# Predict binary values or confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
# Returns confidence scores if prob=True else binary values
def rf_predict(rf, input_data, prob=True):
    pred_df = input_data.copy()
    data = input_data.iloc[:,:-16]
    target = input_data.iloc[:,-16:]
    if prob:
        pred = np.array(rf.predict_proba(data.values))
    else:
        pred = np.array(rf.predict(data.values))
    for i in range(len(target.columns)):
        mood = target.columns[i]
        if prob:
            pred_df[mood] = pred[i,:,1] # predict_proba returns shape (n_features, n_samples, probs)
        else:
            pred_df[mood] = pred[:,i] # predict returns shape (n_samples, n_features)
    return pred_df

In [21]:
# Predict binary values or confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
# Returns confidence scores if prob=True else binary values

train_pred_rf_df = rf_predict(rf, train_df, prob=False)
test_pred_rf_df = rf_predict(rf, test_df, prob=False)
train_prob_rf_df = rf_predict(rf, train_df, prob=True)
test_prob_rf_df = rf_predict(rf, test_df, prob=True)

# Show predicted values and confidence scores for moods for random samples
print('Test data mood labels (Actual):')
#test_df_copy = test_df.copy()
#test_df_copy['activity'] = [activity_cats[k] for k in test_df_copy['activity'].values]
# test_df_copy['activity'] = le.inverse_transform(test_df_copy['activity'].values) # convert activity values back to strings
display(test_df.head(10))

print('Test data mood labels (Predicted):')
# test_pred_rf_df['activity'] = le.inverse_transform(test_pred_rf_df['activity'].values)
#test_pred_rf_df['activity'] = [activity_cats[k] for k in test_pred_rf_df['activity'].values]
display(test_pred_rf_df.head(10))

print('Test data mood labels (Confidence Scores):')
#test_prob_rf_df['activity'] = [activity_cats[k] for k in test_prob_rf_df['activity'].values]
# test_pred_proba_rf_df['activity'] = le.inverse_transform(test_pred_proba_rf_df['activity'].values)
pd.set_option('display.max_columns', None)
display(test_prob_rf_df.head(10))


# Show evaluation of predictions
train_eval_rf_df = evaluate(train_df, train_pred_rf_df, train_prob_rf_df, 'rf')
test_eval_rf_df = evaluate(test_df, test_pred_rf_df, test_prob_rf_df, 'rf')
display(test_eval_rf_df)
display(test_eval_rf_df.describe())


test_prob_rf_df[NORMAL_MOODS] = test_prob_rf_df[NORMAL_MOODS].to_numpy() - test_prob_rf_df[ANTI_MOODS].to_numpy()
test_prob_rf_df.drop(ANTI_MOODS, axis=1, inplace=True)

# Normalize, just for visualization (cosine measure already does it)
from sklearn.preprocessing import normalize
test_prob_rf_df[NORMAL_MOODS] = normalize(test_prob_rf_df[NORMAL_MOODS].to_numpy(), axis=1)

print('Test data mood labels (Confidence Scores, Postprocessed):')
display(test_prob_rf_df.head(20))


Test data mood labels (Actual):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm,Not Aggressive,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
107,376.96,32.922668,61.92627,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
114,281.28,33.808899,59.564209,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
372,27973.12,38.139343,55.944824,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
379,1597.92,38.139343,55.944824,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
380,7883.84,38.139343,55.944824,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
16,240.84,32.677612,62.8479,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
358,5570.88,31.039429,66.339111,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
491,48.6,32.802826,59.768677,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
180,2789.28,33.255005,80.975342,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
174,1251.84,33.385925,74.310303,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0


Test data mood labels (Predicted):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm,Not Aggressive,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
107,376.96,32.922668,61.92627,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
114,281.28,33.808899,59.564209,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
372,27973.12,38.139343,55.944824,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
379,1597.92,38.139343,55.944824,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
380,7883.84,38.139343,55.944824,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
16,240.84,32.677612,62.8479,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
358,5570.88,31.039429,66.339111,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
491,48.6,32.802826,59.768677,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
180,2789.28,33.255005,80.975342,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
174,1251.84,33.385925,74.310303,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Test data mood labels (Confidence Scores):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm,Not Aggressive,Not Athletic,Not Atmospheric,Not Celebratory,Not Depressive,Not Elegant,Not Passionate,Not Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
107,376.96,32.922668,61.92627,1,0,0,0.065859,0.025165,0.016212,0.050183,0.050872,0.013293,0.101356,0.050786,0.039961,0.174478,0.142598,0.019583,0.384013,0.033376,0.20469,0.019394
114,281.28,33.808899,59.564209,1,0,0,0.061785,0.179592,0.031333,0.20785,0.050786,0.028488,0.19124,0.113184,0.017091,0.017565,0.013637,0.044147,0.070113,0.073177,0.059804,0.081923
372,27973.12,38.139343,55.944824,0,0,1,0.010996,0.00065,0.040632,0.159673,0.414425,0.0004,0.144365,0.182725,0.037045,0.005348,0.139903,0.00075,0.044572,0.001,0.02969,0.006498
379,1597.92,38.139343,55.944824,0,0,1,0.212291,0.004883,0.004629,0.021924,0.11746,0.0014,0.09948,0.023939,0.450951,0.00065,0.142479,0.006583,0.187359,0.004214,0.066962,0.018443
380,7883.84,38.139343,55.944824,0,0,1,0.014763,0.00065,0.025632,0.185673,0.382652,0.0004,0.20506,0.065058,0.115562,0.005348,0.187963,0.00075,0.125339,0.001,0.102623,0.003165
16,240.84,32.677612,62.8479,1,0,0,0.067146,0.042629,0.077683,0.047173,0.097621,0.041847,0.133808,0.135951,0.200709,0.118559,0.081308,0.012776,0.09472,0.022808,0.086615,0.049454
358,5570.88,31.039429,66.339111,0,0,1,0.017017,0.189983,0.011056,0.053088,0.004975,0.104314,0.3957,0.249036,0.085746,0.006528,0.033035,0.050762,0.01202,0.055056,0.001309,0.034871
491,48.6,32.802826,59.768677,1,0,0,0.031584,0.05104,0.141598,0.045073,0.149451,0.165238,0.09933,0.249938,0.184627,0.061378,0.005934,0.00784,0.069361,0.006629,0.03411,0.037557
180,2789.28,33.255005,80.975342,0,1,0,0.246318,0.032388,0.009181,0.199057,0.005263,0.026596,0.390583,0.035478,0.00567,0.068579,0.015983,0.00037,0.002696,0.172592,0.136107,0.138023
174,1251.84,33.385925,74.310303,0,1,0,0.482101,0.012377,0.006368,0.006054,0.007375,0.010049,0.02898,0.009783,0.004134,0.005257,0.130133,0.004656,0.010701,0.326062,0.02567,0.032395


Unnamed: 0_level_0,accuracy (rf),precision (rf),recall (rf),mse (rf)
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0.950495,0.0,0.0,0.11286
Athletic,0.891089,0.0,0.0,0.068221
Atmospheric,0.841584,0.0,0.0,0.118111
Celebratory,0.861386,0.5,0.071429,0.119818
Depressive,0.920792,0.0,0.0,0.110227
Elegant,0.891089,0.0,0.0,0.083414
Passionate,0.920792,0.0,0.0,0.121011
Warm,0.910891,0.0,0.0,0.082481
Not Aggressive,0.940594,0.0,0.0,0.071266
Not Athletic,0.950495,0.0,0.0,0.123737


Unnamed: 0,accuracy (rf),precision (rf),recall (rf),mse (rf)
count,16.0,16.0,16.0,16.0
mean,0.915223,0.03125,0.004464,0.107019
std,0.033326,0.125,0.017857,0.021013
min,0.841584,0.0,0.0,0.068221
25%,0.898515,0.0,0.0,0.083953
50%,0.920792,0.0,0.0,0.118965
75%,0.940594,0.0,0.0,0.123463
max,0.970297,0.5,0.071429,0.124842


Test data mood labels (Confidence Scores, Postprocessed):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
107,376.96,32.922668,61.92627,1,0,0,0.06416,-0.369917,-0.313117,0.07581,-0.825344,-0.049753,-0.256006,0.077772
114,281.28,33.808899,59.564209,1,0,0,0.162139,0.587789,0.064198,0.593869,-0.070112,-0.162122,0.476815,0.113406
372,27973.12,38.139343,55.944824,0,0,1,-0.055943,-0.01009,-0.213198,0.341309,0.794309,-0.001289,0.24628,0.378472
379,1597.92,38.139343,55.944824,0,0,1,-0.832443,0.014766,-0.480819,0.053508,-0.243807,-0.009816,0.113423,0.01917
380,7883.84,38.139343,55.944824,0,0,1,-0.259173,-0.01208,-0.417382,0.475468,0.661596,-0.001543,0.263382,0.159139
16,240.84,32.677612,62.8479,1,0,0,-0.715146,-0.406558,-0.019411,0.184171,0.015534,0.10194,0.252688,0.463138
358,5570.88,31.039429,66.339111,0,0,1,-0.139495,0.372346,-0.04461,0.004722,-0.014298,0.099975,0.800466,0.434675
491,48.6,32.802826,59.768677,1,0,0,-0.434176,-0.029328,0.384871,0.105627,0.22721,0.449966,0.185027,0.602511
180,2789.28,33.255005,80.975342,0,1,0,0.544488,-0.081885,-0.01539,0.449546,0.005807,-0.330328,0.575775,-0.232017
174,1251.84,33.385925,74.310303,0,1,0,0.814666,0.012136,-0.21095,0.002382,-0.005668,-0.538625,0.005641,-0.038542


In [None]:
evaluate_df = pd.concat([test_eval_svm_df, test_eval_rf_df], axis=1)
display(evaluate_df)
display(evaluate_df.describe())

## Save Model

In [18]:
import pickle
pickle.dump(rf, open('RandomForest', 'wb'))