# Data cleaning

In [1]:
import pandas as pd
import numpy as np
import random
import json
from tensorflow.keras.models import Model, load_model
# import pycaret

# Replace double quotes with single quotes for moods JSON arrays
f = open('song_data.csv','r+')
text = f.read()
text = text.replace('"["', '"[?').replace('", "', '?,?').replace('"]"', '?]"') # Changed to "[$Happy$,$Sad$]" for easy replacing later
f.seek(0)
f.write(text)
f.close()

# Importing data
df = pd.read_csv('song_data.csv', index_col=0)
df.drop('uuid', axis=1, inplace=True)
df.dropna(inplace=True) # drop rows with nan values
for col in df.columns:
    if col not in ['id','isSkipped']:
        if col == 'moods':
            df[col] = df[col].apply(lambda x:x.replace('?','"')) # Replaces mood values back to the form ["Happy","Sad"] so it can be loaded by json
        df[col] = df[col].apply(json.loads)
df['activity'] = df.apply(lambda _: '', axis=1) # empty activity column
print('Number of samples: ', df.shape[0])
df.head()

Number of samples:  525


Unnamed: 0_level_0,gyroX,gyroY,gyroZ,accelX,accelY,accelZ,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,"[499.30572509765625, 499.53460693359375, 0.198...","[1.861572265625, 2.49481201171875, 1.022338867...","[1.24359130859375, 1.2359619140625, 1.06048583...","[1.1943359375, 1.201171875, 1.1845703125, 1.18...","[1.1455078125, 1.1591796875, 1.1630859375, 1.1...","[3.68359375, 3.654296875, 3.6748046875, 3.6650...","[139.64, 138.36, 139.64, 140.28]","[30.50567626953125, 30.50567626953125, 30.5056...","[71.3134765625, 71.3134765625, 71.3134765625]","[Depressive, Atmospheric]",0,
2,"[498.1765747070313, 0.98419189453125, 1.579284...","[32.27996826171875, 14.7247314453125, 9.864807...","[497.9248046875, 496.368408203125, 494.9645996...","[2.9326171875, 2.9345703125, 2.728515625, 2.64...","[0.8466796875, 0.74609375, 0.865234375, 15.524...","[2.7548828125, 2.8037109375, 2.806640625, 3.34...","[123.24, 123.24, 139.32, 228.64]","[31.69403076171875, 31.69403076171875, 31.6940...","[67.05322265625, 67.05322265625, 67.0532226562...",[Depressive],0,
3,"[15.76995849609375, 10.65826416015625, 6.87408...","[488.36517333984375, 486.5798950195313, 496.92...","[3.86810302734375, 5.0811767578125, 498.947143...","[14.0107421875, 14.3212890625, 14.232421875, 1...","[14.96484375, 15.212890625, 15.275390625, 15.1...","[3.2021484375, 3.3291015625, 3.375, 3.35644531...","[256.08, 307.84000000000003, 315.2, 301.36, 30...","[32.21771240234375, 32.21771240234375, 32.2177...","[65.850830078125, 65.850830078125, 65.85083007...","[Passionate, Depressive]",0,
4,"[499.93896484375, 499.45068359375, 499.7482299...","[1.82342529296875, 2.74658203125, 1.8844604492...","[1.57928466796875, 1.434326171875, 1.365661621...","[1.7353515625, 1.708984375, 1.7333984375, 1.71...","[13.7841796875, 13.80078125, 13.7744140625, 13...","[2.8232421875, 2.8369140625, 2.8154296875, 2.8...","[127.08, 126.76, 125.48, 124.52]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...",[Elegant],1,
5,"[499.9465942382813, 0.03814697265625, 499.7482...","[2.01416015625, 1.77001953125, 1.7852783203125...","[1.1444091796875, 1.2359619140625, 1.129150390...","[0.4580078125, 0.4609375, 0.453125, 0.43847656...","[13.181640625, 13.1689453125, 13.1787109375, 1...","[2.6806640625, 2.693359375, 2.6875, 2.68652343...","[145.76, 144.48, 146.4, 144.8]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...","[Passionate, Depressive]",1,


In [2]:
data = df.iloc[0,:]
display(data)
# print(data['temp'])

gyroX        [499.30572509765625, 499.53460693359375, 0.198...
gyroY        [1.861572265625, 2.49481201171875, 1.022338867...
gyroZ        [1.24359130859375, 1.2359619140625, 1.06048583...
accelX       [1.1943359375, 1.201171875, 1.1845703125, 1.18...
accelY       [1.1455078125, 1.1591796875, 1.1630859375, 1.1...
accelZ       [3.68359375, 3.654296875, 3.6748046875, 3.6650...
optical                       [139.64, 138.36, 139.64, 140.28]
temp         [30.50567626953125, 30.50567626953125, 30.5056...
humidity         [71.3134765625, 71.3134765625, 71.3134765625]
moods                                [Depressive, Atmospheric]
isSkipped                                                    0
activity                                                      
Name: 1, dtype: object

In [3]:
# Filtering defective data

defective_ids = []
for idx,row in df.iterrows():
    # defective if temp array only has -40 values
    # defective if any humidity values are above 99.99
    if len([k for k in row['temp'] if k==-40]) == len(row['temp']) or \
    len([k for k in row['humidity'] if k>99]) == len(row['humidity']) or row['isSkipped']:
        defective_ids.append(idx)
    
    # if only some values are defective, keep the row, but remove defective values
    # remove -40 temp values and >99.99 humidity values
    elif (-40 in row['temp']) or len([k for k in row['humidity'] if k>99]) or (0 in row['optical']):
        df.at[idx,'temp'] = [k for k in row['temp'] if k!=-40]
        df.at[idx,'humidity'] = [k for k in row['humidity'] if k<=99]
        
    # some gyro/accel data have 40 samples
    # take the last 30 samples for these rows
    for col in df.columns[:6]:
        if len(row[col]) > 30:
            df.at[idx,col] = row[col][-30:]

filtered_df = df[~df.index.isin(defective_ids)].copy() # .copy() to avoid warning
print('%d defective rows: ' % len(defective_ids),defective_ids)

236 defective rows:  [4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 20, 21, 22, 23, 24, 25, 33, 34, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 49, 51, 52, 56, 57, 60, 70, 71, 72, 74, 76, 78, 79, 88, 93, 94, 95, 99, 100, 104, 105, 106, 109, 111, 124, 127, 131, 134, 137, 139, 140, 142, 144, 148, 149, 150, 151, 152, 155, 157, 159, 160, 162, 163, 164, 166, 173, 175, 176, 177, 178, 181, 184, 186, 187, 190, 192, 194, 199, 200, 202, 207, 208, 211, 212, 214, 216, 217, 218, 221, 223, 224, 225, 226, 229, 230, 231, 232, 236, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 261, 267, 270, 272, 274, 277, 282, 283, 286, 291, 293, 294, 296, 297, 299, 300, 301, 302, 305, 306, 307, 309, 310, 312, 316, 318, 322, 323, 324, 325, 326, 327, 331, 333, 339, 342, 343, 344, 345, 346, 347, 348, 350, 352, 355, 356, 359, 361, 365, 375, 378, 381, 382, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 396, 399, 404, 405, 413, 424, 431, 436, 437, 441, 443, 489, 492, 495, 496, 499, 501, 502, 503, 505, 50

In [4]:
motion_model_path = 'firstModel_stackedLSTM.hd5'
model = load_model(motion_model_path)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 30, 128)           69120     
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 64)            49408     
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                12416     
_________________________________________________________________
dense (Dense)                (None, 100)               3300      
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 303       
Total params: 134,547
Trainable params: 134,547
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Add activity from motion recognition model
x = [list(k) for k in filtered_df.iloc[:,:6].values]
x = np.array(x) # (num_samples, 6, 30)
x = np.array([k.T for k in x]) # reshape as (num_samples, 30, 6)

pred = model.predict(x)

activity_cats = np.array(['Running', 'Walking', 'Working']) # hardcoded categories from 'Physical Activity Classification.ipynb'
filtered_df['activity'] = activity_cats[np.argmax(pred, axis=1)]
#filtered_df['activity'] = np.argmax(pred, axis=1)
filtered_df = filtered_df.iloc[:,6:].copy() # drop gyro and accel columns

filtered_df.sample(5)

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18,"[159.6, 227.52, 229.76, 279.6]","[32.6910400390625, 32.6910400390625, 32.691040...","[62.652587890625, 62.652587890625, 62.65258789...","[Warm, Passionate]",0,Working
292,"[0.8, 0.8, 0.8, 0.8]","[31.0394287109375, 31.0394287109375, 31.039428...","[38.787841796875, 38.787841796875, 38.78784179...",[Elegant],0,Working
427,"[54.3, 54.14, 54.14, 54.32]","[32.41912841796875, 32.41912841796875, 32.4191...","[71.7529296875, 71.7529296875, 71.7529296875]","[Warm, Passionate]",0,Working
403,"[180.4, 183.28, 182.32, 182.64]","[32.47955322265625, 32.47955322265625, 32.4795...","[67.34619140625, 67.34619140625, 67.34619140625]",[Aggressive],0,Working
430,"[23.43, 23.83, 24.08, 25.12]","[32.41912841796875, 32.41912841796875, 32.4191...","[71.7529296875, 71.7529296875, 71.7529296875]","[Elegant, Passionate, Warm]",0,Working


In [6]:
# Obtain mean optical, temp and humidity values

for col in filtered_df.columns:
    if col not in ['moods','isSkipped','activity']:
        filtered_df[col] = filtered_df[col].apply(np.mean)
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,139.48,30.505676,71.313477,"[Depressive, Atmospheric]",0,Working
2,153.61,31.694031,67.053223,[Depressive],0,Working
3,297.792,32.217712,65.85083,"[Passionate, Depressive]",0,Working
11,205.92,32.509766,63.531494,[Celebratory],0,Working
12,125.64,32.56012,63.140869,[Warm],0,Working


In [7]:
# One-hot encoding for moods

moods = []
for k in filtered_df['moods'].values:
    moods += list(k)
moods = np.unique(np.array(moods))
for mood in moods:
    mood_values = filtered_df['moods'].astype(str).str.contains(mood)
    filtered_df[mood] = mood_values
filtered_df.drop('moods', axis=1, inplace=True)
print('Added one-hot encoded columns for moods:')
filtered_df.head()

Added one-hot encoded columns for moods:


Unnamed: 0_level_0,optical,temp,humidity,isSkipped,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,139.48,30.505676,71.313477,0,Working,False,False,True,False,True,False,False,False
2,153.61,31.694031,67.053223,0,Working,False,False,False,False,True,False,False,False
3,297.792,32.217712,65.85083,0,Working,False,False,False,False,True,False,True,False
11,205.92,32.509766,63.531494,0,Working,False,False,False,True,False,False,False,False
12,125.64,32.56012,63.140869,0,Working,False,False,False,False,False,False,False,True


In [8]:
# Invert mood boolean values based on "isSkipped"

for mood in moods:
    filtered_df[mood] = np.abs(filtered_df[mood] - filtered_df['isSkipped'])
filtered_df.drop('isSkipped', axis=1, inplace=True)
print('Invert mood values based on "isSkipped" boolean:')
filtered_df.head()

Invert mood values based on "isSkipped" boolean:


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,139.48,30.505676,71.313477,Working,0,0,1,0,1,0,0,0
2,153.61,31.694031,67.053223,Working,0,0,0,0,1,0,0,0
3,297.792,32.217712,65.85083,Working,0,0,0,0,1,0,1,0
11,205.92,32.509766,63.531494,Working,0,0,0,1,0,0,0,0
12,125.64,32.56012,63.140869,Working,0,0,0,0,0,0,0,1


In [9]:
# One-hot encoding for activity
activities = []
for k in filtered_df['activity']:
    activities.append(k)
activities = np.unique(np.array(activities))
for activity in activities:
    activity_values = filtered_df['activity'].astype(str).str.contains(activity)
    activity_values = activity_values.astype(int)
    filtered_df[activity] = activity_values
filtered_df.drop('activity', axis=1, inplace=True)

filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm,Running,Walking,Working
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1,139.48,30.505676,71.313477,0,0,1,0,1,0,0,0,0,0,1
2,153.61,31.694031,67.053223,0,0,0,0,1,0,0,0,0,0,1
3,297.792,32.217712,65.85083,0,0,0,0,1,0,1,0,0,0,1
11,205.92,32.509766,63.531494,0,0,0,1,0,0,0,0,0,0,1
12,125.64,32.56012,63.140869,0,0,0,0,0,0,0,1,0,0,1


# Split into train/test datasets

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# label encoding for activity
# le = LabelEncoder()
# filtered_df['activity'] = le.fit_transform(filtered_df['activity'].values)
# display(filtered_df.head())

# split into training & testing
x = filtered_df[['optical', 'temp', 'humidity','Working', 'Running', 'Walking']]
y = filtered_df[['Aggressive', 'Athletic', 'Atmospheric', 'Celebratory', \
                 'Melancholic', 'Elegant', 'Passionate', 'Warm']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

print('Training (x, y): ', x_train.shape, y_train.shape)
print('Testing (x, y): ', x_test.shape, y_test.shape)

# create dfs for training and test data for easy prediction later
train_df = x_train.join(y_train)
test_df = x_test.join(y_test)

print('\nTrain:')
display(train_df.head())
print('Test:')
display(test_df.head())

Training (x, y):  (231, 6) (231, 8)
Testing (x, y):  (58, 6) (58, 8)

Train:


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
114,281.28,33.808899,59.564209,1,0,0,0,1,0,0,0,0,0,0
560,43.753333,30.871582,37.597656,1,0,0,0,0,0,0,0,1,0,0
444,200.48,31.824951,69.042969,1,0,0,0,0,0,1,0,0,0,0
366,6450.56,31.039429,66.339111,0,0,1,0,1,0,0,0,0,0,0
497,43.86,32.714539,59.838867,1,0,0,0,0,1,0,1,0,0,0


Test:


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
123,34.845,34.030457,57.434082,1,0,0,0,0,0,0,0,0,0,1
384,3345.28,38.139343,55.944824,0,0,1,0,0,0,0,1,0,1,0
198,8876.032,35.279236,65.313721,0,0,1,0,0,1,0,0,0,0,0
433,92.24,32.419128,71.75293,1,0,0,0,1,0,1,0,0,0,0
563,43.22,30.700378,37.33724,1,0,0,0,0,0,0,1,0,0,0


# SVM

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import svm

# model training
# since predict_proba only works with two classes at a time,
# eight svm models are trained for each mood and stored in the dictionary "svm_moods"
svms_std = {} # key:mood, value:svm model trained on that mood
params = {'C':[0.001, 0.01, 0.1, 1, 10], 'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'decision_function_shape':('ovr', 'ovo')}
for mood in y_train.columns:
    svm_pipe_std = Pipeline([('scaler', StandardScaler()), 
                       ('svm', GridSearchCV(svm.SVC(max_iter=100000, probability=True), params)), ])
    svm_pipe_std.fit(x_train, y_train.loc[:,mood].values)
    svms_std[mood] = svm_pipe_std

In [14]:
# Define functions for prediction and evaluation of SVM model

from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score

from sklearn.preprocessing import normalize

# Predict binary values or confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
def svm_predict(svms, input_data, prob=True):
    pred_df = input_data.copy()
    for mood,svm in svms.items():
        if prob:
            pred = svm.predict_proba(pred_df.iloc[:,:-8].values)
            pred_df[mood] = pred[:,1]
        else:
            pred = svm.predict(pred_df.iloc[:,:-8].values)
            pred_df[mood] = pred.reshape(-1)
    return pred_df
    
# Get loss (MSE) of predicted confidence scores
# Input: DataFrames of actual and predicted moods
def evaluate(df_actual, df_pred, df_pred_proba, clf):
    evaluation_df_dict = {
        'mood': [],
        'accuracy (%s)'%clf: [],
        'precision (%s)'%clf: [],
        'recall (%s)'%clf: [],
        'mse (%s)'%clf: [],
    }
    df_moods_actual = df_actual.iloc[:,-8:]
    df_moods_pred =df_pred.iloc[:,-8:]
    
    #mse
    df_moods_actual_normed = normalize(df_actual.iloc[:,-8:], axis=1)
    df_moods_pred_proba =  normalize(df_pred_proba.iloc[:,-8:], axis=1)
    currCol = 0
    
    for mood in df_moods_actual.columns:
        y_actual, y_pred = df_moods_actual[mood].values, df_moods_pred[mood].values
        evaluation_df_dict['mood'].append(mood)
        evaluation_df_dict['accuracy (%s)'%clf].append(accuracy_score(y_actual, y_pred))
        evaluation_df_dict['precision (%s)'%clf].append(precision_score(y_actual, y_pred, zero_division=0))
        evaluation_df_dict['recall (%s)'%clf].append(recall_score(y_actual, y_pred))
        
        evaluation_df_dict['mse (%s)'%clf].append(mean_squared_error(df_moods_actual_normed[currCol], df_moods_pred_proba[currCol]))
        currCol += 1
    evaluation_df = pd.DataFrame(evaluation_df_dict)
    evaluation_df.set_index('mood', inplace=True)
    return evaluation_df

In [15]:
train_pred_svm_df = svm_predict(svms_std, train_df, prob=False)
test_pred_svm_df = svm_predict(svms_std, test_df, prob=False)
train_prob_svm_df = svm_predict(svms_std, train_df, prob=True)
test_prob_svm_df = svm_predict(svms_std, test_df, prob=True)

# Show predicted values and confidence scores for moods for random samples
print('Test data mood labels (Actual):')
#display(test_df.head())

print('Test data mood labels (Predicted):')
#display(test_pred_svm_df.head())

print('Test data mood labels (Confidence Scores):')
#display(test_prob_svm_df.head())

# Show evaluation of predictions
train_eval_svm_df = evaluate(train_df, train_pred_svm_df, train_prob_svm_df, 'svm,std')
test_eval_svm_df = evaluate(test_df, test_pred_svm_df, test_prob_svm_df, 'svm,std')
display(test_eval_svm_df)
display(test_eval_svm_df.describe())

Test data mood labels (Actual):
Test data mood labels (Predicted):
Test data mood labels (Confidence Scores):


Unnamed: 0_level_0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)"
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0.896552,0.0,0.0,0.148163
Athletic,0.844828,0.0,0.0,0.082798
Atmospheric,0.775862,0.0,0.0,0.158488
Celebratory,0.827586,0.0,0.0,0.131705
Depressive,0.827586,0.0,0.0,0.154927
Elegant,0.862069,0.0,0.0,0.172675
Passionate,0.775862,0.0,0.0,0.205959
Warm,0.793103,0.0,0.0,0.12401


Unnamed: 0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)"
count,8.0,8.0,8.0,8.0
mean,0.825431,0.0,0.0,0.147341
std,0.04267,0.0,0.0,0.036282
min,0.775862,0.0,0.0,0.082798
25%,0.788793,0.0,0.0,0.129781
50%,0.827586,0.0,0.0,0.151545
75%,0.849138,0.0,0.0,0.162034
max,0.896552,0.0,0.0,0.205959


# Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier

# params = {'n_estimators':[10,50,100,150,200,500], 'criterion':('gini', 'entropy'), 'max_depth':[k for k in range(1,21)]}
params = {'criterion':('gini', 'entropy'), 'max_depth':[k for k in range(1,21)]}
rf = GridSearchCV(RandomForestClassifier(random_state=0), params)
rf.fit(x_train, y_train)
rf.best_params_

{'criterion': 'entropy', 'max_depth': 11}

In [17]:
# Predict binary values or confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
# Returns confidence scores if prob=True else binary values
def rf_predict(rf, input_data, prob=True):
    pred_df = input_data.copy()
    data = input_data.iloc[:,:-8]
    target = input_data.iloc[:,-8:]
    if prob:
        pred = np.array(rf.predict_proba(data.values))
    else:
        pred = np.array(rf.predict(data.values))
    for i in range(len(target.columns)):
        mood = target.columns[i]
        if prob:
            pred_df[mood] = pred[i,:,1] # predict_proba returns shape (n_features, n_samples, probs)
        else:
            pred_df[mood] = pred[:,i] # predict returns shape (n_samples, n_features)
    return pred_df

In [21]:
train_pred_rf_df = rf_predict(rf, train_df, prob=False)
test_pred_rf_df = rf_predict(rf, test_df, prob=False)
train_prob_rf_df = rf_predict(rf, train_df, prob=True)
test_prob_rf_df = rf_predict(rf, test_df, prob=True)

# Show predicted values and confidence scores for moods for random samples
print('Test data mood labels (Actual):')
test_df_copy = test_df.copy()
#test_df_copy['activity'] = [activity_cats[k] for k in test_df_copy['activity'].values]
# test_df_copy['activity'] = le.inverse_transform(test_df_copy['activity'].values) # convert activity values back to strings
#display(test_df_copy.head())

print('Test data mood labels (Predicted):')
# test_pred_rf_df['activity'] = le.inverse_transform(test_pred_rf_df['activity'].values)
#test_pred_rf_df['activity'] = [activity_cats[k] for k in test_pred_rf_df['activity'].values]
#display(test_pred_rf_df.head())

print('Test data mood labels (Confidence Scores):')
#test_prob_rf_df['activity'] = [activity_cats[k] for k in test_prob_rf_df['activity'].values]
# test_pred_proba_rf_df['activity'] = le.inverse_transform(test_pred_proba_rf_df['activity'].values)
#display(test_prob_rf_df.head())

# Show evaluation of predictions
train_eval_rf_df = evaluate(train_df, train_pred_rf_df, train_prob_rf_df, 'rf')
test_eval_rf_df = evaluate(test_df, test_pred_rf_df, test_prob_rf_df, 'rf')
display(test_eval_rf_df)
display(test_eval_rf_df.describe())



# Normalize, just for visualization (cosine measure already does it)
from sklearn.preprocessing import normalize
NORMAL_MOODS=['Aggressive', 'Athletic', 'Atmospheric', 'Celebratory', \
                 'Melancholic', 'Elegant', 'Passionate', 'Warm']
test_prob_rf_df[NORMAL_MOODS] = normalize(test_prob_rf_df[NORMAL_MOODS].to_numpy(), axis=1)

print('Test data mood labels (Confidence Scores, Postprocessed):')
display(test_prob_rf_df.head(30))


Test data mood labels (Actual):
Test data mood labels (Predicted):
Test data mood labels (Confidence Scores):


Unnamed: 0_level_0,accuracy (rf),precision (rf),recall (rf),mse (rf)
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Aggressive,0.87931,0.0,0.0,0.195686
Athletic,0.810345,0.0,0.0,0.219652
Atmospheric,0.724138,0.0,0.0,0.234584
Celebratory,0.810345,0.0,0.0,0.143604
Depressive,0.793103,0.0,0.0,0.245958
Elegant,0.810345,0.0,0.0,0.094322
Passionate,0.689655,0.222222,0.153846,0.23049
Warm,0.706897,0.0,0.0,0.194381


Unnamed: 0,accuracy (rf),precision (rf),recall (rf),mse (rf)
count,8.0,8.0,8.0,8.0
mean,0.778017,0.027778,0.019231,0.194835
std,0.064799,0.078567,0.054393,0.051837
min,0.689655,0.0,0.0,0.094322
25%,0.719828,0.0,0.0,0.181687
50%,0.801724,0.0,0.0,0.207669
75%,0.810345,0.0,0.0,0.231514
max,0.87931,0.222222,0.153846,0.245958


Test data mood labels (Confidence Scores, Postprocessed):


Unnamed: 0_level_0,optical,temp,humidity,Working,Running,Walking,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
123,34.845,34.030457,57.434082,1,0,0,0.020016,0.745632,0.281251,0.072264,0.251277,0.457777,0.198516,0.217256
384,3345.28,38.139343,55.944824,0,0,1,0.806874,0.0,0.034335,0.291848,0.085838,0.0,0.085838,0.497859
198,8876.032,35.279236,65.313721,0,0,1,0.020886,0.000995,0.061665,0.736995,0.166097,0.150184,0.561946,0.2944
433,92.24,32.419128,71.75293,1,0,0,0.065395,0.398878,0.643314,0.202991,0.280121,0.412433,0.25449,0.261205
563,43.22,30.700378,37.33724,1,0,0,0.0,0.017358,0.847131,0.008096,0.016167,0.31487,0.002393,0.427308
113,283.52,33.815613,59.759521,1,0,0,0.018282,0.622712,0.056424,0.692757,0.086119,0.003499,0.292254,0.189625
241,7.1575,32.610474,67.57609,0,1,0,0.078039,0.905249,0.062431,0.234116,0.280939,0.015608,0.191195,0.0
528,95.013333,31.485901,78.621419,1,0,0,0.209306,0.206959,0.173363,0.40796,0.5404,0.120253,0.624839,0.141266
58,82.24,31.109924,76.599121,1,0,0,0.055692,0.082924,0.27364,0.124544,0.912312,0.099113,0.229313,0.070071
73,81.705,34.040527,74.023438,1,0,0,0.006004,0.079269,0.563883,0.026088,0.48914,0.649549,0.031806,0.11337


In [35]:
evaluate_df = pd.concat([test_eval_svm_df, test_eval_rf_df], axis=1)
display(evaluate_df)
display(evaluate_df.describe())

Unnamed: 0_level_0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)",accuracy (rf),precision (rf),recall (rf),mse (rf)
mood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Aggressive,0.896552,0.0,0.0,0.098554,0.896552,0.0,0.0,0.114033
Athletic,0.844828,0.0,0.0,0.131442,0.810345,0.0,0.0,0.126039
Atmospheric,0.775862,0.0,0.0,0.181373,0.758621,0.0,0.0,0.202016
Celebratory,0.827586,0.0,0.0,0.142576,0.810345,0.0,0.0,0.162678
Depressive,0.827586,0.0,0.0,0.143697,0.793103,0.0,0.0,0.15025
Elegant,0.862069,0.0,0.0,0.119464,0.793103,0.0,0.0,0.16801
Passionate,0.775862,0.0,0.0,0.175535,0.741379,0.333333,0.153846,0.193339
Warm,0.793103,0.0,0.0,0.163203,0.741379,0.0,0.0,0.213863


Unnamed: 0,"accuracy (svm,std)","precision (svm,std)","recall (svm,std)","mse (svm,std)",accuracy (rf),precision (rf),recall (rf),mse (rf)
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,0.825431,0.0,0.0,0.14448,0.793103,0.041667,0.019231,0.166279
std,0.04267,0.0,0.0,0.028247,0.050478,0.117851,0.054393,0.035648
min,0.775862,0.0,0.0,0.098554,0.741379,0.0,0.0,0.114033
25%,0.788793,0.0,0.0,0.128448,0.75431,0.0,0.0,0.144197
50%,0.827586,0.0,0.0,0.143136,0.793103,0.0,0.0,0.165344
75%,0.849138,0.0,0.0,0.166286,0.810345,0.0,0.0,0.195508
max,0.896552,0.0,0.0,0.181373,0.896552,0.333333,0.153846,0.213863


## Save Model

In [32]:
import pickle
pickle.dump(rf, open('RandomForest', 'wb'))