# Data cleaning

In [1]:
import pandas as pd
import numpy as np
import random
import json
# import pycaret

# Replace double quotes with single quotes for moods JSON arrays
f = open('song_data.csv','r+')
text = f.read()
text = text.replace('"["', '"[?').replace('", "', '?,?').replace('"]"', '?]"') # Changed to "[$Happy$,$Sad$]" for easy replacing later
f.seek(0)
f.write(text)
f.close()

# Importing data
df = pd.read_csv('song_data.csv', index_col=0)
for col in df.columns:
    if col=='uuid' or 'gyro' in col or 'accel' in col:
        df.drop(col, axis=1, inplace=True)
    elif col not in ['id','isSkipped']:
        if col == 'moods':
            df[col] = df[col].apply(lambda x:x.replace('?','"')) # Replaces mood values back to the form ["Happy","Sad"] so it can be loaded by json
        df[col] = df[col].apply(json.loads)
df['activity'] = df.apply(lambda _: random.choice(['Running','Walking','Working']), axis=1) # initialise random activity
df.head()
# df['activity'].value_counts()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"[139.64, 138.36, 139.64, 140.28]","[30.50567626953125, 30.50567626953125, 30.5056...","[71.3134765625, 71.3134765625, 71.3134765625]","[Depressive, Atmospheric]",0,Walking
2,"[123.24, 123.24, 139.32, 228.64]","[31.69403076171875, 31.69403076171875, 31.6940...","[67.05322265625, 67.05322265625, 67.0532226562...",[Depressive],0,Running
3,"[256.08, 307.84000000000003, 315.2, 301.36, 30...","[32.21771240234375, 32.21771240234375, 32.2177...","[65.850830078125, 65.850830078125, 65.85083007...","[Passionate, Depressive]",0,Working
4,"[127.08, 126.76, 125.48, 124.52]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...",[Elegant],1,Working
5,"[145.76, 144.48, 146.4, 144.8]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...","[Passionate, Depressive]",1,Running


In [2]:
# Filtering defective data

defective_ids = []
for id,row in df.iterrows():
    # defective if temp array only has -40 values
    # defective if any humidity values are above 99.99
    if len([k for k in row['temp'] if k==-40]) == len(row['temp']) or \
       len([k for k in row['humidity'] if k>99.99]) == len(row['humidity']):
        defective_ids.append(id)
    
    # if only some values are defective, keep the row, but remove defective values
    # remove -40 temp values and >99.99 humidity values
    elif (-40 in row['temp']) or len([k for k in row['humidity'] if k>99.99]):
        row['temp'] = [k for k in row['temp'] if k!=-40]
        row['humidity'] = [k for k in row['humidity'] if k<99.99]

filtered_df = df[~df.index.isin(defective_ids)].copy() # .copy() to avoid warning
print('Defective row ids are: ',defective_ids)

Defective row ids are:  [21, 22, 33, 214, 236, 238, 245, 246, 247, 248, 249, 250, 251, 252]


In [3]:
# Obtain mean optical, temp and humidity values

for col in df.columns:
    if col not in ['moods','isSkipped','activity']:
        filtered_df[col] = filtered_df[col].apply(np.mean)
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,139.48,30.505676,71.313477,"[Depressive, Atmospheric]",0,Walking
2,153.61,31.694031,67.053223,[Depressive],0,Running
3,297.792,32.217712,65.85083,"[Passionate, Depressive]",0,Working
4,125.96,32.429199,64.672852,[Elegant],1,Working
5,145.36,32.429199,64.672852,"[Passionate, Depressive]",1,Running


In [4]:
# One-hot encoding for moods

moods = []
for k in filtered_df['moods'].values:
    moods += list(k)
moods = np.unique(np.array(moods))
for mood in moods:
    mood_values = filtered_df['moods'].astype(str).str.contains(mood)
    filtered_df[mood] = mood_values
filtered_df.drop('moods', axis=1, inplace=True)
print('Added one-hot encoded columns for moods:')
filtered_df.head()

Added one-hot encoded columns for moods:


Unnamed: 0_level_0,optical,temp,humidity,isSkipped,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,139.48,30.505676,71.313477,0,Walking,False,False,True,False,True,False,False,False
2,153.61,31.694031,67.053223,0,Running,False,False,False,False,True,False,False,False
3,297.792,32.217712,65.85083,0,Working,False,False,False,False,True,False,True,False
4,125.96,32.429199,64.672852,1,Working,False,False,False,False,False,True,False,False
5,145.36,32.429199,64.672852,1,Running,False,False,False,False,True,False,True,False


In [5]:
# Invert mood boolean values based on "isSkipped"

for mood in moods:
    filtered_df[mood] = np.abs(filtered_df[mood] - filtered_df['isSkipped'])
filtered_df.drop('isSkipped', axis=1, inplace=True)
print('Invert mood values based on "isSkipped" boolean:')
filtered_df.head()

Invert mood values based on "isSkipped" boolean:


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,139.48,30.505676,71.313477,Walking,0,0,1,0,1,0,0,0
2,153.61,31.694031,67.053223,Running,0,0,0,0,1,0,0,0
3,297.792,32.217712,65.85083,Working,0,0,0,0,1,0,1,0
4,125.96,32.429199,64.672852,Working,1,1,1,1,1,0,1,1
5,145.36,32.429199,64.672852,Running,1,1,1,1,0,1,0,1


# SVM

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import svm

# label encoding for activity
le = LabelEncoder()
filtered_df['activity'] = le.fit_transform(filtered_df['activity'].values)

# split into training & testing
x = filtered_df.iloc[:,:4]
y = filtered_df.iloc[:,4:]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=0)

# model training
# eight svm models are trained for each mood and stored in the dictionary "svm_moods"
svms = {} # key:mood, value:svm model trained on that mood
params = {'C':[1,10], 'kernel':('linear', 'poly', 'rbf', 'sigmoid'), 'decision_function_shape':('ovr', 'ovo')}
for mood in y_train.columns:
    svm_pipe = Pipeline([('scaler', StandardScaler()), 
                       ('svm', GridSearchCV(svm.SVC(max_iter=100000, probability=True), params)), ])
    svm_pipe.fit(x_train, y_train.loc[:,mood].values)
    svms[mood] = svm_pipe

In [7]:
# Define functions for training, prediction and evaluation of SVM model

from sklearn.metrics import mean_squared_error

# Predict confidence scores for moods from sensor data
# For now, input_data is a Dataframe for convenience
def svm_predict(svms, input_data):
    pred_df = input_data.copy()
    for mood,svm in svms.items():
        prob = svm.predict_proba(pred_df.iloc[:,:-8].values)
        pred_df[mood] = prob[:,1]
    return pred_df
    
# Get loss (MSE) of predicted confidence scores
# Input: DataFrames of actual and predicted moods
def svm_evaluate(df_actual, df_pred):
    df_moods_actual = df_actual.iloc[:,-8:]
    df_moods_pred = df_pred.iloc[:,-8:]
    mse = 0
    for mood in df_moods_actual.columns:
        mse += mean_squared_error(df_moods_actual[mood].values, df_moods_pred[mood].values)
    return mse

In [11]:
# create dfs for training and test data, as well as the prediction dfs with confidence scores
train_df = x_train.join(y_train)
test_df = x_test.join(y_test)

train_pred_df = svm_predict(svms, train_df)
test_pred_df = svm_predict(svms, test_df)

print('Test data mood labels (Actual):')
test_df['activity'] = le.inverse_transform(test_df['activity'].values) # convert activity values back to strings
display(test_df.head())
print('Test data mood labels (Predicted):')
test_pred_df['activity'] = le.inverse_transform(test_pred_df['activity'].values)
display(test_pred_df.head())

print('MSE (train): ', svm_evaluate(train_df, train_pred_df))
print('MSE (test): ', svm_evaluate(test_df, test_pred_df))

Test data mood labels (Actual):


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
367,38535.68,31.039429,66.339111,Working,0,0,0,1,0,0,0,0
79,85.65,34.137878,74.582926,Running,1,0,1,1,1,1,1,1
355,191.46,32.610474,80.212402,Working,1,1,1,1,1,1,0,1
196,10368.64,34.07074,69.805908,Walking,0,0,0,0,0,0,1,0
7,211.18,32.499695,63.62915,Walking,1,1,1,1,1,0,0,0


Test data mood labels (Predicted):


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
367,38535.68,31.039429,66.339111,Working,0.378934,0.409409,0.543248,0.595658,0.379473,0.414566,0.523752,0.41236
79,85.65,34.137878,74.582926,Running,0.433751,0.414717,0.485236,0.595428,0.585104,0.483797,0.517114,0.546597
355,191.46,32.610474,80.212402,Working,0.454516,0.456269,0.473122,0.490739,0.637206,0.40055,0.517348,0.407601
196,10368.64,34.07074,69.805908,Walking,0.36163,0.396822,0.484925,0.425043,0.310247,0.444532,0.517248,0.389342
7,211.18,32.499695,63.62915,Walking,0.515621,0.5692,0.479086,0.608577,0.60373,0.5,0.517479,0.593891


MSE (train):  1.8377856858440735
MSE (test):  2.0210878029892534
