# Data cleaning

In [104]:
import pandas as pd
import numpy as np
import random
import json
# import pycaret

# The moods arrays are written with nested double quotes (e.g. "["Happy", "Sad"]"),
# which pandas cannot parse as a quoted CSV field. Swap the inner quotes for '?'
# placeholders ("[?Happy?,?Sad?]") in the file; they are turned back into double
# quotes after loading so the field can be decoded by json.
with open('song_data.csv', 'r+') as f:
    text = f.read()
    text = text.replace('"["', '"[?').replace('", "', '?,?').replace('"]"', '?]"')
    f.seek(0)
    f.write(text)
    # '", "' (4 chars) becomes '?,?' (3 chars), so the rewritten text can be shorter
    # than the old content; cut off the stale tail that would otherwise remain.
    f.truncate()

# Importing data
df = pd.read_csv('song_data.csv', index_col=0)
df = df.drop(columns='uuid')
df = df.dropna() # drop rows with nan values

# Columns whose name contains 'gyro' or 'accel' are not used
unused_cols = [col for col in df.columns if 'gyro' in col or 'accel' in col]
df = df.drop(columns=unused_cols)

# Every remaining column except 'id'/'isSkipped' holds a JSON-encoded array
for col in df.columns:
    if col in ('id', 'isSkipped'):
        continue
    if col == 'moods':
        # Restore the placeholders to the form ["Happy","Sad"] so json can load it
        df[col] = df[col].apply(lambda x: x.replace('?', '"'))
    df[col] = df[col].apply(json.loads)

# initialise random activity; a dedicated seeded generator keeps the labels
# identical across kernel restarts without touching the global random state
activity_rng = random.Random(0)
df['activity'] = [activity_rng.choice(['Running','Walking','Working']) for _ in range(len(df))]
print('Number of samples: ', df.shape[0])
df.head()

Number of samples:  495


Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"[139.64, 138.36, 139.64, 140.28]","[30.50567626953125, 30.50567626953125, 30.5056...","[71.3134765625, 71.3134765625, 71.3134765625]","[Depressive, Atmospheric]",0,Working
2,"[123.24, 123.24, 139.32, 228.64]","[31.69403076171875, 31.69403076171875, 31.6940...","[67.05322265625, 67.05322265625, 67.0532226562...",[Depressive],0,Running
3,"[256.08, 307.84000000000003, 315.2, 301.36, 30...","[32.21771240234375, 32.21771240234375, 32.2177...","[65.850830078125, 65.850830078125, 65.85083007...","[Passionate, Depressive]",0,Working
4,"[127.08, 126.76, 125.48, 124.52]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...",[Elegant],1,Working
5,"[145.76, 144.48, 146.4, 144.8]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...","[Passionate, Depressive]",1,Working


In [105]:
# Filtering defective data
#
# A temp reading of -40 and a humidity reading above 99.99 are treated as defective.
# - A row is dropped when its temp array holds only -40 values, or when ALL of its
#   humidity values are above 99.99 (an empty array counts as fully defective).
# - A row with only some defective readings is kept, minus those readings.

temp_all_defective = df['temp'].apply(lambda vals: all(k == -40 for k in vals)).astype(bool)
humidity_all_defective = df['humidity'].apply(lambda vals: all(k > 99.99 for k in vals)).astype(bool)
defective_ids = df.index[(temp_all_defective | humidity_all_defective).values].tolist()

filtered_df = df[~df.index.isin(defective_ids)].copy() # .copy() to avoid warning

# Strip the remaining defective readings column-wise on the filtered frame.
# (Assigning to the row objects yielded by DataFrame.iterrows() would only change
# a temporary copy and never reach the frame.)
filtered_df['temp'] = filtered_df['temp'].apply(lambda vals: [k for k in vals if k != -40])
# "not k > 99.99" is the exact complement of the defect test, so 99.99 itself is kept
filtered_df['humidity'] = filtered_df['humidity'].apply(lambda vals: [k for k in vals if not k > 99.99])

print('Defective row ids are: ',defective_ids)

Defective row ids are:  [21, 22, 33, 214, 236, 238, 245, 246, 247, 248, 249, 250, 251, 252, 386, 387, 388, 389, 390, 391, 392, 393, 394]


In [106]:
# Collapse each per-row array of optical, temp and humidity readings to its mean

# Everything that is not a label/metadata column is treated as a reading array
non_sensor_cols = ['moods','isSkipped','activity']
sensor_cols = [name for name in df.columns if name not in non_sensor_cols]
for name in sensor_cols:
    filtered_df[name] = filtered_df[name].apply(np.mean)
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,139.48,30.505676,71.313477,"[Depressive, Atmospheric]",0,Working
2,153.61,31.694031,67.053223,[Depressive],0,Running
3,297.792,32.217712,65.85083,"[Passionate, Depressive]",0,Working
4,125.96,32.429199,64.672852,[Elegant],1,Working
5,145.36,32.429199,64.672852,"[Passionate, Depressive]",1,Working


In [107]:
# One-hot encoding for moods

# Sorted array of every distinct mood label found in the data
# (the next cell iterates over `moods` again)
moods = np.unique([label for labels in filtered_df['moods'] for label in labels])
for mood in moods:
    # Exact membership test on the parsed list. Matching the stringified list with
    # str.contains would treat the label as a regex and also hit labels that merely
    # contain it as a substring.
    filtered_df[mood] = filtered_df['moods'].apply(lambda labels: mood in labels)
filtered_df = filtered_df.drop(columns='moods')
print('Added one-hot encoded columns for moods:')
filtered_df.head()

Added one-hot encoded columns for moods:


Unnamed: 0_level_0,optical,temp,humidity,isSkipped,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,139.48,30.505676,71.313477,0,Working,False,False,True,False,True,False,False,False
2,153.61,31.694031,67.053223,0,Running,False,False,False,False,True,False,False,False
3,297.792,32.217712,65.85083,0,Working,False,False,False,False,True,False,True,False
4,125.96,32.429199,64.672852,1,Working,False,False,False,False,False,True,False,False
5,145.36,32.429199,64.672852,1,Working,False,False,False,False,True,False,True,False


In [108]:
# Flip the mood flags of skipped songs

# |mood - isSkipped| leaves a row unchanged where isSkipped is 0 and swaps 0 <-> 1
# where it is 1. The isSkipped column is removed from the frame once it is taken out.
skipped = filtered_df.pop('isSkipped')
for mood in moods:
    filtered_df[mood] = (filtered_df[mood] - skipped).abs()
print('Invert mood values based on "isSkipped" boolean:')
filtered_df.head()

Invert mood values based on "isSkipped" boolean:


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,139.48,30.505676,71.313477,Working,0,0,1,0,1,0,0,0
2,153.61,31.694031,67.053223,Running,0,0,0,0,1,0,0,0
3,297.792,32.217712,65.85083,Working,0,0,0,0,1,0,1,0
4,125.96,32.429199,64.672852,Working,1,1,1,1,1,0,1,1
5,145.36,32.429199,64.672852,Working,1,1,1,1,0,1,0,1


# Split into train/test datasets

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Encode the activity strings as integers; `le` is kept so the labels can be
# mapped back to strings when results are displayed
activity_labels = filtered_df['activity'].values
le = LabelEncoder().fit(activity_labels)
filtered_df['activity'] = le.transform(activity_labels)

# The first four columns are the model inputs, all remaining columns are the targets
x, y = filtered_df.iloc[:,:4], filtered_df.iloc[:,4:]
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.20)

# Inputs and targets side by side for each split, for easy prediction later
train_df = x_train.join(y_train)
test_df = x_test.join(y_test)

print('Training (x, y): ', x_train.shape, y_train.shape)
print('Testing (x, y): ', x_test.shape, y_test.shape)

Training (x, y):  (377, 4) (377, 8)
Testing (x, y):  (95, 4) (95, 8)


# SVM

In [111]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm

# model training
# SVC fits a single target at a time, so one SVM is trained per mood column
# and stored in the dictionary "svms"
svms = {} # key:mood, value:svm model trained on that mood
# 'decision_function_shape' is deliberately not searched: scikit-learn ignores it for
# binary classification, so including it only doubles the grid without changing any fit
# (assumes every mood column is a two-class 0/1 target)
params = {'C':[1,10], 'kernel':('linear', 'poly', 'rbf', 'sigmoid')}
for mood in y_train.columns:
    # random_state pins the internal cross-validation behind probability=True,
    # so predict_proba gives the same scores on every run
    svm_pipe = Pipeline([('scaler', StandardScaler()),
                         ('svm', GridSearchCV(svm.SVC(max_iter=100000, probability=True, random_state=0), params)), ])
    svm_pipe.fit(x_train, y_train.loc[:,mood].values)
    svms[mood] = svm_pipe



In [112]:
# Define functions for prediction and evaluation of SVM model

from sklearn.metrics import mean_squared_error

# Predict confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
def svm_predict(svms, input_data):
    """Return a copy of ``input_data`` with one confidence column per mood.

    Parameters
    ----------
    svms : dict
        Maps a mood name to a fitted model exposing ``predict_proba``.
    input_data : pandas.DataFrame
        Feature columns, optionally followed by columns named after the moods
        (e.g. the true labels). Every column whose name is not a key of
        ``svms`` is passed to the models as a feature.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_data`` in which each mood column is created, or
        overwritten, with the second column of that model's ``predict_proba``.
    """
    pred_df = input_data.copy()
    # Select the features by name rather than by "all but the last 8 columns", so
    # the function works for any number of moods and for frames without label columns.
    feature_cols = [col for col in input_data.columns if col not in svms]
    features = input_data[feature_cols].values
    for mood, model in svms.items():
        prob = model.predict_proba(features)
        pred_df[mood] = prob[:, 1]
    return pred_df
    
# Loss of the predicted confidence scores
# Input: DataFrames of actual and predicted moods
def evaluate(df_actual, df_pred):
    """Return the sum of the per-mood mean squared errors.

    The last eight columns of each frame are taken as the mood columns. For every
    mood column of ``df_actual`` the column of the same name is looked up among the
    last eight columns of ``df_pred``, and the individual MSE values are added up
    (the result is a sum over moods, not an average).
    """
    actual_moods = df_actual.iloc[:, -8:]
    pred_moods = df_pred.iloc[:, -8:]
    return sum(
        mean_squared_error(actual_moods[mood].values, pred_moods[mood].values)
        for mood in actual_moods.columns
    )

In [113]:
# Mood confidence scores from the per-mood SVMs for both splits
train_pred_svm_df = svm_predict(svms, train_df)
test_pred_svm_df = svm_predict(svms, test_df)

# Actual labels, with the encoded activity values converted back to strings
test_df_copy = test_df.assign(activity=le.inverse_transform(test_df['activity'].values))
print('Test data mood labels (Actual):')
display(test_df_copy.head())

# Predicted scores, with the same activity conversion
test_pred_svm_df = test_pred_svm_df.assign(
    activity=le.inverse_transform(test_pred_svm_df['activity'].values)
)
print('Test data mood labels (Predicted):')
display(test_pred_svm_df.head())

# evaluate() only reads the trailing mood columns, so the activity conversion does not affect it
print('MSE (train): ', evaluate(train_df, train_pred_svm_df))
print('MSE (test): ', evaluate(test_df, test_pred_svm_df))

Test data mood labels (Actual):


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
119,234.8,33.81897,58.093262,0,0,0,0,0,0,1,0,1
91,92.69,34.100952,75.98877,0,0,0,1,0,0,0,0,1
24,60.68,32.630615,61.993408,0,1,1,1,0,1,1,1,1
376,29688.32,38.139343,55.944824,0,0,0,0,0,1,0,0,0
207,5607.68,34.674988,70.166016,2,1,1,1,1,1,1,1,0


Test data mood labels (Predicted):


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
119,234.8,33.81897,58.093262,0,0.364163,0.470977,0.473201,0.48285,0.379714,0.430629,0.422601,0.5
91,92.69,34.100952,75.98877,0,0.397938,0.472652,0.46558,0.492654,0.524119,0.446799,0.474151,0.467038
24,60.68,32.630615,61.993408,0,0.379829,0.477083,0.472545,0.505162,0.45175,0.436369,0.490039,0.491873
376,29688.32,38.139343,55.944824,0,0.286228,0.433364,0.443132,0.505572,0.250904,0.423466,0.183609,0.366526
207,5607.68,34.674988,70.166016,2,0.404165,0.454697,0.437919,0.482558,0.454598,0.445726,0.471761,0.433614


MSE (train):  1.9362830514474423
MSE (test):  2.0432210952000456


# Random Forest

In [114]:
from sklearn.ensemble import RandomForestClassifier

# Grid-search the split criterion and a maximum tree depth of 1 to 20;
# a single forest is fitted on all target columns at once
params = {'criterion':('gini', 'entropy'), 'max_depth':list(range(1, 21))}
rf = GridSearchCV(estimator=RandomForestClassifier(random_state=0), param_grid=params).fit(x_train, y_train)
rf.best_params_

{'criterion': 'entropy', 'max_depth': 19}

In [115]:
# Predict confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
def rf_predict(rf, input_data, n_moods=8):
    """Return a copy of ``input_data`` whose mood columns hold predicted scores.

    Parameters
    ----------
    rf : fitted multi-output classifier
        Its ``predict_proba`` must return one ``(n_samples, n_classes)`` array
        per mood column, in column order.
    input_data : pandas.DataFrame
        Feature columns followed by ``n_moods`` mood columns.
    n_moods : int, default 8
        Number of trailing columns of ``input_data`` that are mood columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``input_data`` with each mood column overwritten by the second
        column of the matching ``predict_proba`` output.

    Raises
    ------
    ValueError
        If ``n_moods`` is smaller than 1.
    """
    if n_moods < 1:
        raise ValueError('n_moods must be at least 1')
    pred_df = input_data.copy()
    features = input_data.iloc[:, :-n_moods]
    mood_cols = input_data.columns[-n_moods:]
    # predict_proba yields one (n_samples, n_classes) array per output (mood)
    per_mood_prob = rf.predict_proba(features.values)
    for mood, prob in zip(mood_cols, per_mood_prob):
        pred_df[mood] = np.asarray(prob)[:, 1]
    return pred_df

In [116]:
# Mood confidence scores from the random forest for both splits
train_pred_rf_df = rf_predict(rf, train_df)
test_pred_rf_df = rf_predict(rf, test_df)

print('Test data mood labels (Actual):')
test_df_copy = test_df.copy()
test_df_copy['activity'] = le.inverse_transform(test_df_copy['activity'].values) # convert activity values back to strings
display(test_df_copy.head())
print('Test data mood labels (Predicted):')
test_pred_rf_df['activity'] = le.inverse_transform(test_pred_rf_df['activity'].values)
display(test_pred_rf_df.head())

# The loss function defined in this notebook is `evaluate`; no `svm_evaluate` is
# defined anywhere, so calling that name fails on a fresh kernel.
print('MSE (train): ', evaluate(train_df, train_pred_rf_df))
print('MSE (test): ', evaluate(test_df, test_pred_rf_df))

Test data mood labels (Actual):


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
119,234.8,33.81897,58.093262,0,0,0,0,0,0,1,0,1
91,92.69,34.100952,75.98877,0,0,0,1,0,0,0,0,1
24,60.68,32.630615,61.993408,0,1,1,1,0,1,1,1,1
376,29688.32,38.139343,55.944824,0,0,0,0,0,1,0,0,0
207,5607.68,34.674988,70.166016,2,1,1,1,1,1,1,1,0


Test data mood labels (Predicted):


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
119,234.8,33.81897,58.093262,0,0.072857,0.46,0.142857,0.424286,0.152857,0.111429,0.472857,0.25
91,92.69,34.100952,75.98877,0,0.21,0.25,0.73,0.23,0.41,0.3,0.38,0.34
24,60.68,32.630615,61.993408,0,0.11,0.85,0.88,0.85,0.44,0.84,0.44,0.92
376,29688.32,38.139343,55.944824,0,0.13,0.19,0.09,0.7,0.13,0.17,0.67,0.31
207,5607.68,34.674988,70.166016,2,0.7,0.45,0.46,0.5,0.47,0.52,0.5,0.37


MSE (train):  0.3391838029537745
MSE (test):  2.454918387191275
