In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time
import datetime
import missingno as msno
import tensorflow as tf

# Import from sklearn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler, OneHotEncoder, LabelBinarizer, LabelEncoder
import category_encoders as ce
from sklearn.feature_extraction import FeatureHasher
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, ConfusionMatrixDisplay, confusion_matrix, plot_confusion_matrix, multilabel_confusion_matrix, accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve, plot_roc_curve, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from xgboost import XGBClassifier

# Set a random seed
# Seed NumPy's legacy global random state so NumPy-driven randomness repeats between runs.
from numpy.random import seed
seed(8)
# Seed TensorFlow's global random generator with the same value.
from tensorflow.random import set_seed
set_seed(8)

# Data Visualization
# One seaborn theme (notebook context, dark grid, bright palette) for every plot that follows.
sns.set_theme(context='notebook', style='darkgrid', palette='bright')

In [3]:
# Read the mood-labelled, clustered track table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../data/df_moodadj_cluster.csv')
df.head(n=5)

Unnamed: 0,name,artist,track_id,instrumentalness,danceability,energy,liveness,loudness,acousticness,valence,speechiness,tempo,key,time_signature,length,popularity,mood,mood_map,cluster
0,Kiss Me More (feat. SZA),Doja Cat,748mdHapucXQri7IAO8yFK,0.000158,0.762,0.701,0.123,-3.541,0.235,0.742,0.0286,110.968,8,4,208866,92,Energetic,2,0
1,seaside_demo,SEB,73M0rMVx5CWE8M4uATSsto,0.000133,0.706,0.824,0.111,-7.959,0.355,0.86,0.0694,94.98,0,4,132000,54,Energetic,2,0
2,Shivers,Ed Sheeran,6bQfNiqyCX7UaQSvVVGo4I,0.0,0.788,0.859,0.0424,-2.724,0.281,0.822,0.0856,141.02,2,4,207853,78,Energetic,2,0
3,Heat Waves,Glass Animals,02MWAaffLxlfxAUY7c5dvx,7e-06,0.761,0.525,0.0921,-6.9,0.44,0.531,0.0944,80.87,11,4,238805,85,Energetic,2,0
4,Electric Love,BØRNS,2GiJYvgVaD2HtM8GqD9EgQ,0.00137,0.611,0.797,0.26,-7.627,0.00543,0.518,0.0533,120.041,6,4,218106,82,Energetic,2,0


In [5]:
# Inspect column dtypes and non-null counts to check for missing values before modeling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5849 entries, 0 to 5848
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              5849 non-null   object 
 1   artist            5849 non-null   object 
 2   track_id          5849 non-null   object 
 3   instrumentalness  5849 non-null   float64
 4   danceability      5849 non-null   float64
 5   energy            5849 non-null   float64
 6   liveness          5849 non-null   float64
 7   loudness          5849 non-null   float64
 8   acousticness      5849 non-null   float64
 9   valence           5849 non-null   float64
 10  speechiness       5849 non-null   float64
 11  tempo             5849 non-null   float64
 12  key               5849 non-null   int64  
 13  time_signature    5849 non-null   int64  
 14  length            5849 non-null   int64  
 15  popularity        5849 non-null   int64  
 16  mood              5849 non-null   object 


#### **Null model:** 
###### Evaluate class imbalances
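###### Classes are only moderately imbalanced (roughly 47% / 28% / 24%); always predicting the majority class (Energetic) gives a null-model accuracy of about 0.47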

In [7]:
# Fraction of tracks carrying each mood label; the largest fraction is the
# accuracy of a model that always predicts the majority class.
(
    df['mood']
    .value_counts(normalize=True)
)

Energetic    0.472559
Chill        0.283125
Sad          0.244315
Name: mood, dtype: float64

#### **Multiclass Classification Models w/o PCA**

In [12]:
# Feature matrix: audio features plus the cluster label already stored in df.
# Selecting columns with a list returns a new DataFrame, so df itself is never
# modified here and no defensive `df = df.copy()` is needed.
# NOTE(review): `cluster` looks like an integer cluster id and is scaled below like a
# continuous feature — confirm that treating it as numeric is intended.
X = df[['danceability',
        'energy',
        'loudness',
        'speechiness',
        'acousticness',
        'instrumentalness',
        'liveness',
        'valence',
        'cluster']]

# Target: the numeric mood code.
y = df['mood_map']

# 70/30 train/test split, stratified on y so both partitions keep the same
# class proportions; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    stratify=y,
                                                    random_state=42)

# Min-max scale the features (normalization, not standardization): the scaler
# learns each column's min/max from the training split only and maps that split
# to [0, 1]; the same mapping is then applied to the test split, so no test-set
# statistics leak into training.
x = MinMaxScaler()
X_train_x = x.fit_transform(X_train)
X_test_x = x.transform(X_test)

In [13]:
# Multinomial logistic regression fitted on the min-max-scaled training features.
logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial')
logreg.fit(X=X_train_x, y=y_train)

# Mean accuracy on each split, rounded to four decimals.
print(f'Training Score: {round(logreg.score(X_train_x, y_train),4)}')
print(f'Testing Score: {round(logreg.score(X_test_x, y_test),4)}')

Training Score: 0.7169
Testing Score: 0.735


In [14]:
# AdaBoost with four boosting rounds; fit() returns the fitted estimator itself.
abc = AdaBoostClassifier(n_estimators=4, random_state=42).fit(X_train_x, y_train)

# Mean accuracy on the training split, then on the test split (one value per line).
print(abc.score(X_train_x, y_train), abc.score(X_test_x, y_test), sep='\n')

0.7205666829506595
0.7316239316239316


In [17]:
# XGBoost classifier using the multiclass soft-probability objective.
xg = XGBClassifier(objective='multi:softprob').fit(X_train_x, y_train)

# Mean accuracy on the training split, then on the test split (one value per line).
print(
    *(xg.score(features, labels)
      for features, labels in ((X_train_x, y_train), (X_test_x, y_test))),
    sep='\n',
)



0.975085490962384
0.7350427350427351


#### **With PCA**

In [57]:
# Predictors: audio features plus the cluster label already stored in df.
X_audio = df[[
    'danceability', 'energy', 'loudness',
    'speechiness', 'acousticness', 'instrumentalness',
    'liveness', 'valence', 'cluster',
]]

# Target: the numeric mood code.
y = df['mood_map']

# Expand the predictors into every polynomial and interaction term up to degree 3.
pf = PolynomialFeatures(degree=3)
X = pf.fit_transform(X_audio)

# 70/30 split, stratified on y so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Min-max scaling: learn each column's range from the training split only,
# then apply that same mapping to both splits.
x = MinMaxScaler().fit(X_train)
X_train_x = x.transform(X_train)
X_test_x = x.transform(X_test)

# Project the scaled polynomial features onto 40 principal components;
# the components are learned from the training split and reused for the test split.
pca = PCA(n_components=40, random_state=42)
Z_train = pca.fit_transform(X_train_x)
Z_test = pca.transform(X_test_x)

In [58]:
# Multinomial logistic regression on the PCA scores; max_iter is raised to 200
# iterations for the lbfgs solver.
logreg = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    max_iter=200,
)
logreg.fit(X=Z_train, y=y_train)

# Mean accuracy on each split, rounded to four decimals.
print(f'Training Score: {round(logreg.score(Z_train, y_train),4)}')
print(f'Testing Score: {round(logreg.score(Z_test, y_test),4)}')

Training Score: 0.7311
Testing Score: 0.7345


In [59]:
# AdaBoost with ten boosting rounds on the PCA scores; fit() returns the fitted estimator.
abc = AdaBoostClassifier(n_estimators=10, random_state=42).fit(Z_train, y_train)

# Mean accuracy on the training split, then on the test split (one value per line).
print(abc.score(Z_train, y_train), abc.score(Z_test, y_test), sep='\n')

0.714704445530044
0.7225071225071225


In [60]:
# Gradient boosting restricted to three depth-1 trees (decision stumps) on the PCA scores.
gbc = GradientBoostingClassifier(
    n_estimators=3,
    max_depth=1,
    random_state=42,
).fit(Z_train, y_train)

# Mean accuracy on the training split, then on the test split (one value per line).
print(gbc.score(Z_train, y_train), gbc.score(Z_test, y_test), sep='\n')

0.7017586712261846
0.7168091168091169


In [61]:
# XGBoost classifier (multiclass soft-probability objective) trained on the PCA scores.
xg = XGBClassifier(objective='multi:softprob').fit(Z_train, y_train)

# Mean accuracy on the training split, then on the test split (one value per line).
print(
    *(xg.score(features, labels)
      for features, labels in ((Z_train, y_train), (Z_test, y_test))),
    sep='\n',
)





0.9760625305324866
0.7299145299145299
