In [260]:
# Import data related libraries
import pandas as pd
import numpy as np
import random
import json

# Import label encoder
from sklearn.preprocessing import LabelEncoder

# Import train test split
from sklearn.model_selection import train_test_split

# Import neural network libraries
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD

In [261]:
# Set up GPU
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# Load and clean data

In [262]:
# Rewrite the moods JSON arrays inside song_data.csv so that their inner double
# quotes no longer clash with the CSV quoting: "["Happy", "Sad"]" becomes
# "[?Happy?,?Sad?]". The '?' placeholders are turned back into double quotes
# once the file has been loaded.
with open('song_data.csv', 'r+') as f:
    text = f.read()
    text = text.replace('"["', '"[?').replace('", "', '?,?').replace('"]"', '?]"')
    f.seek(0)
    f.write(text)
    # The '", "' -> '?,?' replacement makes the text shorter than the original
    # file; truncate so no stale bytes are left behind after the new content.
    f.truncate()

In [263]:
# Load the song data, using the first CSV column as the index
df = pd.read_csv('song_data.csv', index_col=0)

# The uuid and the gyro/accel motion readings are not used: drop them in one go
df.drop(
    columns=[name for name in df.columns
             if name == 'uuid' or 'gyro' in name or 'accel' in name],
    inplace=True,
)

# Discard every row that still contains a NaN value
df.dropna(inplace=True)
df.size

2395

In [264]:
# Format contents of columns properly: every column except the skip flag is
# stored as JSON text, so parse each cell into a Python object.
for col in df.columns:
  if col in ['id','isSkipped']:
    continue
  cells = df[col]
  if col == 'moods':
    # Restore the double quotes that were swapped for '?' in the CSV file
    cells = cells.apply(lambda x:str(x).replace('?','"'))
  df[col] = cells.apply(json.loads)

# Add a placeholder activity to every row. A dedicated, seeded generator keeps
# the assignment identical across kernel restarts, so everything derived from
# it (label encoding, train/test split, training) is reproducible.
activity_rng = random.Random(0)
df['activity'] = [activity_rng.choice(['Running','Walking','Working']) for _ in range(len(df))]
df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"[139.64, 138.36, 139.64, 140.28]","[30.50567626953125, 30.50567626953125, 30.5056...","[71.3134765625, 71.3134765625, 71.3134765625]","[Depressive, Atmospheric]",0.0,Walking
2,"[123.24, 123.24, 139.32, 228.64]","[31.69403076171875, 31.69403076171875, 31.6940...","[67.05322265625, 67.05322265625, 67.0532226562...",[Depressive],0.0,Walking
3,"[256.08, 307.84000000000003, 315.2, 301.36, 30...","[32.21771240234375, 32.21771240234375, 32.2177...","[65.850830078125, 65.850830078125, 65.85083007...","[Passionate, Depressive]",0.0,Walking
4,"[127.08, 126.76, 125.48, 124.52]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...",[Elegant],1.0,Walking
5,"[145.76, 144.48, 146.4, 144.8]","[32.42919921875, 32.42919921875, 32.4291992187...","[64.6728515625, 64.6728515625, 64.6728515625, ...","[Passionate, Depressive]",1.0,Working


In [265]:
# Remove defective data
#   * a temp reading of -40 and a humidity reading above 99.99 are defective
#   * a row whose temp or humidity array holds only defective readings is dropped
#   * any other row is kept, with just its defective readings removed
TEMP_DEFECT = -40
HUMIDITY_LIMIT = 99.99

# Build the cleaned arrays as new Series. Assigning to the `row` yielded by
# DataFrame.iterrows() only changes a temporary copy and never reaches the
# DataFrame, so the cleaning has to be done column-wise instead.
valid_temp = df['temp'].apply(lambda vals: [k for k in vals if k != TEMP_DEFECT])
# `not (k > limit)` is the exact complement of the defect test, so a reading of
# exactly 99.99 is kept.
valid_humidity = df['humidity'].apply(lambda vals: [k for k in vals if not (k > HUMIDITY_LIMIT)])

# A row with no valid reading left in either array is defective as a whole
is_defective = ((valid_temp.apply(len) == 0) | (valid_humidity.apply(len) == 0)).to_numpy(dtype=bool)
defective_ids = df.index[is_defective].tolist()

keep = ~df.index.isin(defective_ids)
filtered_df = df[keep].copy() # .copy() to avoid warning
# Same boolean mask on both sides, so the indexes line up one-to-one
filtered_df['temp'] = valid_temp[keep]
filtered_df['humidity'] = valid_humidity[keep]
print('Defective row ids are: ',defective_ids)

Defective row ids are:  ['21', '22', '33', '214', '236', '238', '245', '246', '247', '248', '249', '250', '251', '252', '386', '387', '388', '389', '390', '391', '392', '393', '394']


In [266]:
# Collapse each array-valued column (optical, temp, humidity) into its mean;
# the moods, skip flag and activity columns are left as they are
for col in df.columns:
    if col in ('moods', 'isSkipped', 'activity'):
        continue
    filtered_df[col] = filtered_df[col].apply(np.mean)
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,139.48,30.505676,71.313477,"[Depressive, Atmospheric]",0.0,Walking
2,153.61,31.694031,67.053223,[Depressive],0.0,Walking
3,297.792,32.217712,65.85083,"[Passionate, Depressive]",0.0,Walking
4,125.96,32.429199,64.672852,[Elegant],1.0,Walking
5,145.36,32.429199,64.672852,"[Passionate, Depressive]",1.0,Working


In [267]:
# Encode the activity names as integer labels
labelencoder = LabelEncoder()
labelencoder.fit(filtered_df['activity'])

# Overwrite the text column with its numeric codes
filtered_df['activity'] = labelencoder.transform(filtered_df['activity'])
filtered_df.head()

Unnamed: 0_level_0,optical,temp,humidity,moods,isSkipped,activity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,139.48,30.505676,71.313477,"[Depressive, Atmospheric]",0.0,1
2,153.61,31.694031,67.053223,[Depressive],0.0,1
3,297.792,32.217712,65.85083,"[Passionate, Depressive]",0.0,1
4,125.96,32.429199,64.672852,[Elegant],1.0,1
5,145.36,32.429199,64.672852,"[Passionate, Depressive]",1.0,2


In [268]:
# One-hot encoding for moods
# Gather every distinct mood tag that occurs in the data (np.unique also sorts)
moods = np.unique(np.array([tag for tags in filtered_df['moods'].values for tag in tags]))
for mood in moods:
    # Exact membership test on the parsed list. Matching the list's string
    # form with str.contains() would interpret the mood as a regex and would
    # also fire on any tag that merely contains it as a substring.
    filtered_df[mood] = filtered_df['moods'].apply(lambda tags, mood=mood: mood in tags)
filtered_df.drop('moods', axis=1, inplace=True)
print('Added one-hot encoded columns for moods:')
filtered_df.head()

Added one-hot encoded columns for moods:


Unnamed: 0_level_0,optical,temp,humidity,isSkipped,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,139.48,30.505676,71.313477,0.0,1,False,False,True,False,True,False,False,False
2,153.61,31.694031,67.053223,0.0,1,False,False,False,False,True,False,False,False
3,297.792,32.217712,65.85083,0.0,1,False,False,False,False,True,False,True,False
4,125.96,32.429199,64.672852,1.0,1,False,False,False,False,False,True,False,False
5,145.36,32.429199,64.672852,1.0,2,False,False,False,False,True,False,True,False


In [269]:
# Flip the mood flags of skipped songs: |flag - isSkipped| leaves a 0/1 flag
# unchanged when isSkipped is 0 and inverts it when isSkipped is 1
skipped = filtered_df['isSkipped']
for mood in moods:
    filtered_df[mood] = (filtered_df[mood] - skipped).abs()
filtered_df.drop(columns='isSkipped', inplace=True)
print('Invert mood values based on "isSkipped" boolean:')
filtered_df.head()

Invert mood values based on "isSkipped" boolean:


Unnamed: 0_level_0,optical,temp,humidity,activity,Aggressive,Athletic,Atmospheric,Celebratory,Depressive,Elegant,Passionate,Warm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,139.48,30.505676,71.313477,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,153.61,31.694031,67.053223,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,297.792,32.217712,65.85083,1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4,125.96,32.429199,64.672852,1,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0
5,145.36,32.429199,64.672852,2,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0


# MLP Classifier
* Normalize the input data before training so that features on very different scales do not destabilize the loss

In [270]:
# Split train and test, then normalize the inputs
from sklearn.preprocessing import MinMaxScaler

X = filtered_df[['optical', 'temp', 'humidity', 'activity']]
Y = filtered_df[['Aggressive', 'Athletic', 'Atmospheric', 'Celebratory', \
                 'Depressive', 'Elegant', 'Passionate', 'Warm']]

# Split BEFORE scaling: fitting the scaler on all rows would let the min/max of
# the held-out rows leak into the training features.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 0, test_size = 0.2)

# Normalize input data using MinMaxScaler, fitted on the training rows only
scaler = MinMaxScaler()
scaler.fit(X_train)

# Apply the training-set scaling everywhere; test values may therefore fall
# slightly outside [0, 1]
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# X stays available as the scaled array of all rows, as before
X = scaler.transform(X)

In [271]:
# MLP model training
model = Sequential()

# Input layer, sized from the feature matrix instead of a hard-coded 4
model.add(Dense(8, input_dim=X_train.shape[1], activation='relu'))

model.add(Dropout(0.3))

# Hidden layer
model.add(Dense(8, activation='relu'))

# Output layer: one independent sigmoid unit per mood. The targets are
# multi-hot (a row can have several moods set to 1), so the outputs must not be
# forced to sum to 1 the way softmax does.
model.add(Dense(Y_train.shape[1], activation='sigmoid'))

# Create a Stochastic Gradient Descent optimizer
# NOTE(review): `decay` is only accepted by older/legacy Keras optimizers --
# confirm against the installed TensorFlow version.
sgd  = SGD(learning_rate = 0.0001, decay = 1e-4, momentum = 0.9)

# Compile the model. Binary cross-entropy scores every mood as its own yes/no
# decision, which matches the multi-label targets.
model.compile(loss = 'binary_crossentropy', optimizer = sgd,
             metrics = ['accuracy'])

model.fit(x = X_train, y = Y_train, shuffle = True, batch_size = 60, 
          epochs = 150, validation_data = (X_test, Y_test))

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

<keras.callbacks.History at 0x7f7881c44910>