# Music-Mood Classifier (CSI 4106 - Project - Group 29)
# Afrah Ali - 300049798 - aali179@uottawa.ca 
# Ribhav Khosla - 300087647 - rkhos052@uottawa.ca 
# Zain Malik - 300071476 - zmali081@uottawa.ca 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plot

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

np.random.seed(42)

In [14]:
# load the dataset
df = pd.read_csv('dataset.csv')
print('Dataframe shape: ', df.shape)

# dropping non-features
dataset = df.drop(columns=['mood', 'track_name', 'artist', 'track_id'])
columns = list(dataset.columns)
print(columns)

print("Class distribution:")
print(df['mood'].value_counts())

#Label encoder
le = LabelEncoder()
df['mood_N'] = le.fit_transform(df['mood'])
df = df.drop(columns=['mood'])
df = df.rename(columns={'mood_N': 'mood'})
print(df)

# split dataset for training and testing
target = df['mood']
print(target)

# Set Training and Testing Data as 8:2
x_train, x_test, y_train, y_test = train_test_split(dataset, 
                                                    target, 
                                                    shuffle = True, 
                                                    test_size=0.2, 
                                                    random_state=1)

# Show the Training and Testing Data
print('Shape of training feature:', x_train.shape)
print('Shape of testing feature:', x_test.shape)
print('Shape of training label:', y_train.shape)
print('Shape of training label:', y_test.shape)

# Determine features with notable correlation
print("Features with notable correlation: ")
all_corr = dataset.corr()
for i in range(len(all_corr)):
    for j in range(i):
        if all_corr.iloc[i, j] > 0.5 or all_corr.iloc[i, j] < -0.5:
            print(str(all_corr.columns[i]) + " and " + str(all_corr.columns[j]) + " = " + str(all_corr.iloc[i, j]))

# Comparing energy values across moods
print("Mean energy for happy songs:", df.loc[df['mood'] == 'happy']['energy'].mean())
print("Mean energy for calm songs:", df.loc[df['mood'] == 'calm']['energy'].mean())
print("Mean energy for stressful songs:", df.loc[df['mood'] == 'stressful']['energy'].mean())
print("Mean energy for sad songs:", df.loc[df['mood'] == 'sad']['energy'].mean())

# Proportional class distribution
print("Class distribution of train set: ", y_train.value_counts())
print("Class distribution of test set: ", y_test.value_counts())


Dataframe shape:  (260, 16)
['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence']
Class distribution:
happy        65
sad          65
stressful    65
calm         65
Name: mood, dtype: int64
          track_name               artist                track_id  \
0        Upside Down         Jack Johnson  6shRGWCtBUOPFLFTTqXZIC   
1        Someone New               Hozier  0efT4YKQLQx2YHbp6vgRX8   
2       Little Talks  Of Monsters and Men  3a2tuvXCHbW5nuUckuHkKT   
3    Heart's Content       Brandi Carlile  0pegFWSUOTiG0sLVEfxtvA   
4     Sunday Morning             Maroon 5  4T5cqerbDXueYSVfXkIITo   
..               ...                  ...                     ...   
255    am ersten Tag       Hugo Vanbrooke  2gwhISMkdlhEqEP60P93Z1   
256    Amour naturel       Massimo Pavoni  39bh8hsTP2ZBQWH0E308rT   
257      Dawn Of Day          Sarah Seing  635M2GuMSoVunGBe7D7vWz   
258         Lumino

In [3]:
#Numerical pipeline for all data
num_pipeline = Pipeline([
        ('std_scaler', StandardScaler()),
    ])

# Sending data through pipeline to get fully transformed data
# 2D Numpy array
columns_transformed = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness', 'mode', 'speechiness', 'tempo', 'time_signature', 'valence']
train_data_transformed = pd.DataFrame(num_pipeline.fit_transform(x_train), columns=columns_transformed)
print(train_data_transformed)

# Creating pipelines for models
pipeline_svm = Pipeline([('scaler', StandardScaler())])
pipeline_dtc = Pipeline([('scaler', StandardScaler())])
pipeline_rfc = Pipeline([('scaler', StandardScaler())])
pipeline_gnb = Pipeline([('scaler', StandardScaler())])

     acousticness  danceability    energy  instrumentalness       key  \
0       -0.514931      1.492572  0.722318         -0.521550  0.840425   
1       -1.219636     -0.862821  1.798226          0.938070 -1.130040   
2        0.624283      0.145786 -1.325003         -0.513296 -0.567050   
3       -1.151568      0.869609  1.028061         -0.522125  0.840425   
4       -1.195381      0.181384  1.666640         -0.522150 -0.567050   
..            ...           ...       ...               ...       ...   
203     -1.204702      0.549228  1.763394         -0.522150  1.684910   
204      1.622533     -1.729035 -1.855216          2.741031 -0.567050   
205      1.490200     -1.645974 -0.740607          1.891319 -0.004060   
206     -0.799735      1.878216  0.335301         -0.522150 -0.004060   
207     -1.109279      1.130660  1.105465         -0.522150 -1.130040   

     liveness  loudness      mode  speechiness     tempo  time_signature  \
0   -0.338574  1.054692  0.814862    -0.462733 

In [4]:
# Support Vector Machine
svm = SVC()
pipeline_svm.steps.append(['model', svm])

# Decision Trees
dtc = DecisionTreeClassifier()
pipeline_dtc.steps.append(['model', dtc])

# Random Forests
rfc = RandomForestClassifier()
pipeline_rfc.steps.append(['model', rfc])

# Naive Bayes
gnb = GaussianNB()
pipeline_gnb.steps.append(['model', gnb])

Support Vector Machine

In [7]:
# Run GridSearchCV for support vector machine
gscv_svm = GridSearchCV(pipeline_svm, param_grid={'model__kernel':['linear', 'rbf'], 'model__C':[500, 600, 700, 1000], 'model__gamma':[0.001, 0.01, 0.1, 1]})

# Setting the scoring metrics for the GridSearchCV
scoring = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}
gscv_svm.set_params(scoring=scoring, refit='bal_accuracy')

# fitting data to models
gscv_svm.fit(train_data_transformed, y_train)

# best parameters
print("Support Vector Machine Best Parameters: ", gscv_svm.best_params_)

# best estimator
print("Support Vector Machine Best Estimator: ", gscv_svm.best_estimator_)

# best score
print("Support Vector Machine Best Score: ", gscv_svm.best_score_)

# Model Results
print("Support Vector Machine Results: ", gscv_svm.cv_results_)

Support Vector Machine Best Parameters:  {'model__C': 500, 'model__gamma': 0.001, 'model__kernel': 'rbf'}
Support Vector Machine Best Estimator:  Pipeline(steps=[('scaler', StandardScaler()),
                ['model', SVC(C=500, gamma=0.001)]])
Support Vector Machine Best Score:  0.6372727272727273
Support Vector Machine Results:  {'mean_fit_time': array([0.43823705, 0.00516772, 0.45218987, 0.00717363, 0.45736904,
       0.00478787, 0.45577245, 0.00817938, 0.578654  , 0.00538759,
       0.58183298, 0.00817795, 0.62712526, 0.00578403, 0.62493591,
       0.00458894, 0.68196921, 0.00498686, 0.69893765, 0.00717463,
       0.69604216, 0.00498629, 0.74083223, 0.00458698, 1.14515314,
       0.00579243, 1.0725348 , 0.00798931, 1.07734623, 0.00540175,
       1.11861434, 0.00497751]), 'std_fit_time': array([1.56054517e-01, 7.35360882e-04, 1.73074977e-01, 7.54092836e-04,
       1.67247956e-01, 3.99313111e-04, 1.64666281e-01, 1.71637730e-03,
       1.64224182e-01, 7.99342019e-04, 1.75562834e-01, 1

Decision Tree Classifier

In [6]:
# Run GridSearchCV for decision tree classifier
gscv_dtc = GridSearchCV(pipeline_dtc, param_grid={'model__max_depth':[1, 3, 5, 8, 10], 'model__min_samples_split':[5,10,15,20,25,30,35,40,45,50], 'model__min_samples_leaf':[10, 20, 30, 40, 50, 60, 70, 80, 90, 100]})

# Setting the scoring metrics for the GridSearchCV
scoring = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}
gscv_dtc.set_params(scoring=scoring, refit='bal_accuracy')

# fitting data to models
gscv_dtc.fit(train_data_transformed, y_train)

# best parameters
print("Decision Trees Best Parameters: ", gscv_dtc.best_params_)

# best estimator
print("Decision Trees Best Estimator: ", gscv_dtc.best_estimator_)

# best score
print("Decision Trees Best Score: ", gscv_dtc.best_score_)

# Model Results
print("Decision Trees Results: ", gscv_dtc.cv_results_)

Decision Trees Best Parameters:  {'model__max_depth': 5, 'model__min_samples_leaf': 20, 'model__min_samples_split': 45}
Decision Trees Best Estimator:  Pipeline(steps=[('scaler', StandardScaler()),
                ['model',
                 DecisionTreeClassifier(max_depth=5, min_samples_leaf=20,
                                        min_samples_split=45)]])
Decision Trees Best Score:  0.494090909090909
Decision Trees Results:  {'mean_fit_time': array([0.00338049, 0.00278621, 0.00299239, 0.0027925 , 0.00277829,
       0.00300174, 0.00258632, 0.00278702, 0.00238767, 0.0029933 ,
       0.00220466, 0.00259762, 0.00239353, 0.00258851, 0.0023942 ,
       0.00259671, 0.00239577, 0.00260057, 0.00219378, 0.00239449,
       0.00239267, 0.00240316, 0.00258679, 0.0025702 , 0.00258698,
       0.00357203, 0.00298481, 0.00301399, 0.00259585, 0.00239558,
       0.00260181, 0.00237975, 0.00258646, 0.00261607, 0.00240026,
       0.00298433, 0.00240002, 0.00239711, 0.00237966, 0.00279317,
       0.002

Random Forest Classifier

In [8]:
# Run GridSearchCV for random forest classifier
gscv_rfc = GridSearchCV(pipeline_rfc, param_grid={'model__n_estimators':[50, 100, 200, 300, 400], 'model__max_depth':[3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25], 'model__bootstrap':[True, False]})

# Setting the scoring metrics for the GridSearchCV
scoring = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}
gscv_rfc.set_params(scoring=scoring, refit='bal_accuracy')

# fitting data to models
gscv_rfc.fit(train_data_transformed, y_train)

# best parameters
print("Random Forests Best Parameters: ", gscv_rfc.best_params_)

# best estimator
print("Random Forests Best Estimator: ", gscv_rfc.best_estimator_)

# best score
print("Random Forests Best Score: ", gscv_rfc.best_score_)

# Model Results
print("Random Forests Results: ", gscv_rfc.cv_results_)

Random Forests Best Parameters:  {'model__bootstrap': True, 'model__max_depth': 25, 'model__n_estimators': 200}
Random Forests Best Estimator:  Pipeline(steps=[('scaler', StandardScaler()),
                ['model',
                 RandomForestClassifier(max_depth=25, n_estimators=200)]])
Random Forests Best Score:  0.6504545454545454
Random Forests Results:  {'mean_fit_time': array([0.05505524, 0.10472226, 0.22300291, 0.30837579, 0.41108613,
       0.05463982, 0.10671902, 0.21542978, 0.32293544, 0.43344655,
       0.05587587, 0.11311235, 0.2184052 , 0.32054358, 0.42985725,
       0.05665684, 0.10992017, 0.22440815, 0.32473454, 0.4350287 ,
       0.057658  , 0.11349645, 0.22519288, 0.34228411, 0.43323898,
       0.05723958, 0.11050391, 0.21701274, 0.32515068, 0.43941822,
       0.05764551, 0.11169395, 0.22639408, 0.32952566, 0.4312469 ,
       0.05546017, 0.11168661, 0.22200623, 0.32054095, 0.43344097,
       0.05784402, 0.11729841, 0.22220507, 0.34567199, 0.43963156,
       0.0574658

Naive Bayes

In [10]:
# Run GridSearchCV for naive bayes
gscv_gnb = GridSearchCV(pipeline_gnb, param_grid={})

# Setting the scoring metrics for the GridSearchCV
scoring = {"accuracy": "accuracy", "bal_accuracy": "balanced_accuracy", "F1_macro": "f1_macro"}
gscv_gnb.set_params(scoring=scoring, refit='bal_accuracy')

# fitting data to models
gscv_gnb.fit(train_data_transformed, y_train)

# best parameters
print("Naive Bayes Best Parameters: ", gscv_gnb.best_params_)

# best estimator
print("Naive Bayes Best Estimator: ", gscv_gnb.best_estimator_)

# best score
print("Naive Bayes Best Score: ", gscv_gnb.best_score_)

# Model Results
print("Support Vector Machine Results: ", gscv_svm.cv_results_)

Naive Bayes Best Parameters:  {}
Naive Bayes Best Estimator:  Pipeline(steps=[('scaler', StandardScaler()), ['model', GaussianNB()]])
Naive Bayes Best Score:  0.5522727272727271
Support Vector Machine Results:  {'mean_fit_time': array([0.43823705, 0.00516772, 0.45218987, 0.00717363, 0.45736904,
       0.00478787, 0.45577245, 0.00817938, 0.578654  , 0.00538759,
       0.58183298, 0.00817795, 0.62712526, 0.00578403, 0.62493591,
       0.00458894, 0.68196921, 0.00498686, 0.69893765, 0.00717463,
       0.69604216, 0.00498629, 0.74083223, 0.00458698, 1.14515314,
       0.00579243, 1.0725348 , 0.00798931, 1.07734623, 0.00540175,
       1.11861434, 0.00497751]), 'std_fit_time': array([1.56054517e-01, 7.35360882e-04, 1.73074977e-01, 7.54092836e-04,
       1.67247956e-01, 3.99313111e-04, 1.64666281e-01, 1.71637730e-03,
       1.64224182e-01, 7.99342019e-04, 1.75562834e-01, 1.93347234e-03,
       2.24448386e-01, 1.32331215e-03, 1.79881840e-01, 4.81670644e-04,
       2.74534515e-01, 4.15696997e-0

In [12]:
# test data prediction
test_data_transformed = pd.DataFrame(num_pipeline.fit_transform(x_test), columns=columns_transformed)
predict_labels_svm = gscv_svm.predict(test_data_transformed)
predict_labels_dtc = gscv_dtc.predict(test_data_transformed)
predict_labels_rfc = gscv_rfc.predict(test_data_transformed)
predict_labels_gnb = gscv_gnb.predict(test_data_transformed)

# classification report and the confusion matrix
print(classification_report(y_test, predict_labels_svm))
print(confusion_matrix(y_test, predict_labels_svm))

print(classification_report(y_test, predict_labels_dtc))
print(confusion_matrix(y_test, predict_labels_dtc))

print(classification_report(y_test, predict_labels_rfc))
print(confusion_matrix(y_test, predict_labels_rfc))

print(classification_report(y_test, predict_labels_gnb))
print(confusion_matrix(y_test, predict_labels_gnb))

              precision    recall  f1-score   support

           0       0.80      0.67      0.73        12
           1       0.67      0.77      0.71        13
           2       0.93      0.93      0.93        14
           3       0.69      0.69      0.69        13

    accuracy                           0.77        52
   macro avg       0.77      0.76      0.77        52
weighted avg       0.77      0.77      0.77        52

[[ 8  2  1  1]
 [ 1 10  0  2]
 [ 0  0 13  1]
 [ 1  3  0  9]]
              precision    recall  f1-score   support

           0       0.45      0.75      0.56        12
           1       0.67      0.77      0.71        13
           2       0.83      0.36      0.50        14
           3       0.64      0.54      0.58        13

    accuracy                           0.60        52
   macro avg       0.65      0.60      0.59        52
weighted avg       0.65      0.60      0.59        52

[[ 9  2  1  0]
 [ 2 10  0  1]
 [ 6  0  5  3]
 [ 3  3  0  7]]
        

In [13]:

results = {
    'Model': ['SVM', 'Decision Tree', 'Random Forest Classifier', 'Naive Bayes'], 
    'Best Parameters (train)': [gscv_svm.best_params_, gscv_dtc.best_params_, gscv_rfc.best_params_, gscv_gnb.best_params_],
    'Best accuracy (train)':[max(gscv_svm.cv_results_['mean_test_accuracy']), max(gscv_dtc.cv_results_['mean_test_accuracy']), max(gscv_rfc.cv_results_['mean_test_accuracy']), max(gscv_gnb.cv_results_['mean_test_accuracy'])],
    'Best F1 Macro (train)': [max(gscv_svm.cv_results_['mean_test_F1_macro']), max(gscv_dtc.cv_results_['mean_test_F1_macro']), max(gscv_rfc.cv_results_['mean_test_F1_macro']), max(gscv_gnb.cv_results_['mean_test_F1_macro'])],
    'Best Accuracy (test)': [accuracy_score(y_test, predict_labels_svm), accuracy_score(y_test, predict_labels_dtc), accuracy_score(y_test, predict_labels_rfc), accuracy_score(y_test, predict_labels_gnb)],
    'Best F1 Macro (test)': [f1_score(y_test, predict_labels_svm, average='macro'), f1_score(y_test, predict_labels_dtc, average='macro'), f1_score(y_test, predict_labels_rfc, average='macro'), f1_score(y_test, predict_labels_gnb, average='macro')]
}

table = pd.DataFrame(results)
print(table)

                      Model  \
0                       SVM   
1             Decision Tree   
2  Random Forest Classifier   
3               Naive Bayes   

                             Best Parameters (train)  Best accuracy (train)  \
0  {'model__C': 500, 'model__gamma': 0.001, 'mode...               0.634611   
1  {'model__max_depth': 5, 'model__min_samples_le...               0.494657   
2  {'model__bootstrap': True, 'model__max_depth':...               0.649245   
3                                                 {}               0.547619   

   Best F1 Macro (train)  Best Accuracy (test)  Best F1 Macro (test)  
0               0.624296              0.769231              0.765609  
1               0.486439              0.596154              0.590030  
2               0.641926              0.711538              0.696621  
3               0.509743              0.711538              0.709358  
