In [None]:
# Mount Google Drive at /content/drive; the data and model paths used in this
# notebook all point under this mount. force_remount=True is passed to the mount call.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from time import time
from datetime import timedelta

from joblib import dump, load
import os
import json

import pickle as pkl

from sklearn.model_selection import train_test_split, KFold, cross_validate, GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis

from sklearn.metrics import confusion_matrix, \
                  classification_report, accuracy_score,  precision_score, recall_score, f1_score

In [None]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides any warnings emitted while fitting the estimators
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [None]:
# Directory on the mounted Drive where fitted models (.joblib) and best-parameter
# JSON files are read from and written to.
models_dir = '/content/drive/MyDrive/Machine Learning Deliverable/models'

In [None]:
# Load the dataset; its 'genre' column is used later to build the class-label list.
df = pd.read_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/spotify_ml_21_5.csv')
df.shape

(37667, 14)

In [None]:
# Load the leaderboard of previously evaluated models (one row per algorithm).
results_df = pd.read_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/ML_results21_5.csv')
# Only the last expression of a cell is rendered, so the shape must be printed explicitly.
print(results_df.shape)
results_df

Unnamed: 0,Algorithm,Accuracy,F1_Avg,F1_Dark Trap,F1_Emo,F1_Trap Metal,F1_techhouse,F1_Underground Rap,F1_Hiphop,F1_psytrance,F1_techno,F1_hardstyle,F1_trap,F1_trance,F1_RnB,F1_dnb,F1_Rap
0,VotingHard,0.650576,0.665067,0.523572,0.671679,0.369231,0.851741,0.485897,0.457652,0.897785,0.832309,0.83965,0.775862,0.826337,0.386517,0.954631,0.43807
1,Stacking,0.653939,0.659081,0.5,0.668508,0.316591,0.839836,0.53466,0.446256,0.895189,0.816638,0.845015,0.769866,0.826299,0.361416,0.962072,0.444785
2,MLP-best,0.653939,0.659028,0.491882,0.65328,0.350299,0.841159,0.535264,0.459556,0.899317,0.82646,0.821138,0.771637,0.819218,0.37469,0.946984,0.435511
3,ExtrTrees-best2,0.638329,0.655252,0.469866,0.699862,0.329177,0.842528,0.450266,0.4,0.91018,0.830287,0.851958,0.778797,0.830986,0.418511,0.961111,0.4
4,ExtrTrees-best,0.637488,0.654126,0.469604,0.698061,0.326633,0.841683,0.450437,0.4,0.911111,0.826923,0.851145,0.778455,0.83125,0.408493,0.960222,0.403743
5,RandomForest-best,0.635927,0.653906,0.485569,0.669271,0.365517,0.839879,0.439761,0.405363,0.907534,0.826575,0.840271,0.778797,0.82923,0.395604,0.954334,0.416979
6,RandomForest-default,0.633045,0.64497,0.476835,0.689557,0.270504,0.846311,0.47082,0.3772,0.907376,0.832192,0.844575,0.778243,0.824859,0.364061,0.95514,0.391913
7,MLP-default,0.64121,0.644428,0.502179,0.638239,0.341085,0.800817,0.52973,0.402053,0.883682,0.825234,0.8,0.758985,0.781457,0.41326,0.929163,0.416107
8,ExtraTrees-default,0.619476,0.626704,0.454877,0.679831,0.246533,0.819939,0.478049,0.349398,0.883362,0.826873,0.809886,0.770031,0.781919,0.355342,0.938776,0.379045
9,LogisticRegression-best,0.565802,0.554464,0.433135,0.392573,0.341463,0.730382,0.52053,0.394366,0.829841,0.790102,0.587452,0.661856,0.657729,0.301691,0.759176,0.362205


In [None]:
def confusion(true, pred):
    """
    Build a labelled, square confusion matrix for pretty printing.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    pred : array-like
        Predicted class labels, in the same order as ``true``.

    Returns
    -------
    pandas.DataFrame
        Counts with rows named 'target' and columns named 'predicted'. Both axes
        carry the same classes in the same order (the union of the classes seen
        in ``true`` and ``pred``); combinations that never occur are 0.
    """
    # Drop any existing index so the two inputs are paired purely by position.
    true = pd.Series(true).reset_index(drop=True).rename('target')
    pred = pd.Series(pred).reset_index(drop=True).rename('predicted')

    cm = pd.crosstab(true, pred)

    # A class that is never predicted has no column in the crosstab (and a class
    # that is only predicted has no row); selecting columns by the row index
    # would raise KeyError in that case, so reindex both axes on the union.
    classes = cm.index.union(cm.columns)
    cm = cm.reindex(index=classes, columns=classes, fill_value=0)
    return cm.rename_axis(index='target', columns='predicted')

In [None]:
# Seed NumPy's global random state.
# NOTE(review): presumably intended to make the estimators below, which are
# created without an explicit random_state, repeatable — confirm.
np.random.seed(6046)

In [None]:
# Distinct genre values from the dataset; passed to compute_metrics to order the
# per-class F1 scores and to name the F1_<label> columns.
labels = list(df['genre'].unique())
def compute_metrics(y_true, y_pred, labels, algorithm_name, results_df=None):
    """
    Score one set of predictions and append the scores to a results table.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels.
    labels : list
        Class labels; fixes the order of the per-class F1 scores and the names
        of the ``F1_<label>`` columns.
    algorithm_name : str
        Value stored in the ``Algorithm`` column of the new row.
    results_df : pandas.DataFrame, optional
        Existing results table. It is not modified; when it is None or empty a
        new table is started.

    Returns
    -------
    pandas.DataFrame
        The results table with one extra row, indexed 0..n-1.
    """
    accuracy = accuracy_score(y_true, y_pred)
    f1_scores = f1_score(y_true, y_pred, average=None, labels=labels)
    f1_score_avg = f1_score(y_true, y_pred, average='macro')

    # Key every score by its column name so values land in the right column even
    # when the existing table's column order differs from the order of `labels`.
    row = {'Algorithm': algorithm_name, 'Accuracy': accuracy, 'F1_Avg': f1_score_avg}
    row.update({f'F1_{label}': score for label, score in zip(labels, f1_scores)})
    new_row = pd.DataFrame([row])

    # If results_df is not provided or empty, the new row is the whole table
    if results_df is None or results_df.empty:
        return new_row

    # Concatenate instead of assigning through .loc so the caller's frame is untouched
    return pd.concat([results_df, new_row], ignore_index=True)

In [None]:
def _read_split(path):
    """Unpickle one split file and return its (features, target) pair."""
    # NOTE: pickle can execute arbitrary code on load; only read files you produced yourself.
    with open(path, "rb") as handle:
        features, target = pkl.load(handle)
    return features, target


X_train, y_train = _read_split("/content/drive/MyDrive/Machine Learning Deliverable/data/split/train.pkl")
X_val, y_val = _read_split("/content/drive/MyDrive/Machine Learning Deliverable/data/split/val.pkl")
X_test, y_test = _read_split("/content/drive/MyDrive/Machine Learning Deliverable/data/split/test.pkl")

In [None]:
# Load the previously tuned random forest from Drive.
# NOTE(review): RandomForest_best.joblib is written by the final cells of this
# notebook, so on a fresh environment it only exists after an earlier full run.
random_forest = load(os.path.join(models_dir, "RandomForest_best.joblib"))

In [None]:
# Predict the validation split with the loaded model and print its confusion matrix.
y_pred = random_forest.predict(X_val)
print(confusion(y_val, y_pred))

predicted        Dark Trap  Emo  Hiphop  Rap  RnB  Trap Metal  \
target                                                          
Dark Trap              429   21      24   22   34         100   
Emo                     11  257       8    2   32           5   
Hiphop                  41   22     257   50  128          12   
Rap                     11    4      34  167   49           9   
RnB                     25   33      86   18  198           3   
Trap Metal              74   11      13   13    4         159   
Underground Rap        127   21     187  126   92         142   
dnb                      2   10       3    0    5           0   
hardstyle                9   12       0    0    1           2   
psytrance                5    0       0    0    0           1   
techhouse                3    5       1    0    2           5   
techno                   3    0       0    0    0           0   
trance                  13    8       0    1    1           5   
trap                    1

## QDA

In [None]:
# Baseline: QDA with default settings, fitted on the training split and
# evaluated on the validation split.
qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = qda.predict(X_val)
print(confusion(y_val, y_pred))

predicted        Dark Trap  Emo  Hiphop  Rap  RnB  Trap Metal  \
target                                                          
Dark Trap              250  121     278   43   58          40   
Emo                      3  289      20    4   14           2   
Hiphop                  16   54     380   22  157           4   
Rap                      3   35     141  116   84           4   
RnB                      5  105     130   10  187           1   
Trap Metal              24   44     176   26   23          69   
Underground Rap         31  100     694  148  169          45   
dnb                      5   20      15    0    7           3   
hardstyle                7   47       5    0    4           1   
psytrance                2    2       0    0    0           0   
techhouse                3    7       9    3    1           1   
techno                  12    0       0    1    1           0   
trance                   4   27       6    2    4           1   
trap                     

### QDA Cross-Validation

In [None]:
# Metrics tracked during grid search: short alias -> scikit-learn scorer name.
scoring_dict = dict(f1_mac='f1_macro', acc='accuracy')

In [None]:
# 5-fold grid search over QDA's regularisation strength; the winning
# configuration (by macro F1) is refit on the full training split.
init_time = time()
qda = QuadraticDiscriminantAnalysis()
qda_param_grid = {
    'reg_param': [0.0, 0.00001, 0.0001, 0.001, 0.01, 0.02, 0.1],
}
qda_cv = GridSearchCV(
    estimator=qda,
    param_grid=qda_param_grid,
    scoring=scoring_dict,
    refit='f1_mac',
    cv=5,
    return_train_score=False,
    n_jobs=-1,
)
# fit() returns the search object itself, so qda_5CV and qda_cv are the same object.
qda_5CV = qda_cv.fit(X_train, y_train)
print(timedelta(seconds=(time() - init_time)))

0:00:04.018289


In [None]:
# Score the refit best QDA model on the validation split and add it to the leaderboard.
y_pred = qda_5CV.predict(X_val)

results_df = compute_metrics(y_val, y_pred, labels, "QDA-best", results_df)
# Reassign the sorted frame rather than sorting in place.
results_df = results_df.sort_values(by='F1_Avg', ascending=False)
results_df

Unnamed: 0,Algorithm,Accuracy,F1_Avg,F1_Dark Trap,F1_Emo,F1_Trap Metal,F1_techhouse,F1_Underground Rap,F1_Hiphop,F1_psytrance,F1_techno,F1_hardstyle,F1_trap,F1_trance,F1_RnB,F1_dnb,F1_Rap
0,MLP-best,0.653939,0.659028,0.491882,0.65328,0.350299,0.841159,0.535264,0.459556,0.899317,0.82646,0.821138,0.771637,0.819218,0.37469,0.946984,0.435511
1,ExtrTrees-best2,0.638329,0.655252,0.469866,0.699862,0.329177,0.842528,0.450266,0.4,0.91018,0.830287,0.851958,0.778797,0.830986,0.418511,0.961111,0.4
2,ExtrTrees-best,0.637488,0.654126,0.469604,0.698061,0.326633,0.841683,0.450437,0.4,0.911111,0.826923,0.851145,0.778455,0.83125,0.408493,0.960222,0.403743
3,MLP-default,0.64121,0.644428,0.502179,0.638239,0.341085,0.800817,0.52973,0.402053,0.883682,0.825234,0.8,0.758985,0.781457,0.41326,0.929163,0.416107
4,ExtraTrees-default,0.619476,0.626704,0.454877,0.679831,0.246533,0.819939,0.478049,0.349398,0.883362,0.826873,0.809886,0.770031,0.781919,0.355342,0.938776,0.379045
11,RandomForest-best,0.595341,0.595507,0.423077,0.516981,0.268987,0.785351,0.489886,0.375326,0.855754,0.789069,0.783658,0.662069,0.707739,0.353612,0.949763,0.375821
10,RandomForest-best,0.59414,0.592595,0.414062,0.526621,0.264331,0.765286,0.492127,0.375546,0.852273,0.783576,0.777372,0.658199,0.712144,0.354067,0.946378,0.374346
5,LogisticRegression-best,0.565802,0.554464,0.433135,0.392573,0.341463,0.730382,0.52053,0.394366,0.829841,0.790102,0.587452,0.661856,0.657729,0.301691,0.759176,0.362205
6,GuassianNB-default,0.548151,0.549127,0.335694,0.394756,0.232479,0.779847,0.46356,0.399027,0.786834,0.736089,0.73258,0.608076,0.659341,0.311759,0.926214,0.321519
7,GuassianNB-best,0.548151,0.54905,0.335932,0.395044,0.232479,0.777656,0.46356,0.399027,0.786834,0.735504,0.733772,0.608076,0.658288,0.311759,0.927255,0.321519


In [None]:
# Hyper-parameters selected by the QDA grid search.
best_params = qda_5CV.best_params_
best_params

{'reg_param': 1e-05}

In [None]:
# Persist the QDA search result: chosen hyper-parameters as JSON, fitted model as joblib.
with open(os.path.join(models_dir, "QDA_bestparams.json"), "w") as f:
    json.dump(qda_5CV.best_params_, f)
# `qda` is only the unfitted template handed to GridSearchCV (the search fits
# clones of it), so save the refit best estimator instead.
dump(qda_5CV.best_estimator_, os.path.join(models_dir, "QDA.joblib"))

['/content/drive/MyDrive/Machine Learning Deliverable/models/QDA.joblib']

## Random Forest

In [None]:
# Baseline random forest with default hyper-parameters; oob_score=True also
# yields an out-of-bag accuracy estimate from the training data.
model_rf1 = RandomForestClassifier(oob_score=True)
model_rf1.fit(X_train, y_train)

y_pred = model_rf1.predict(X_val)

print('OOB accuracy=', model_rf1.oob_score_)

print(confusion(y_val, y_pred))

OOB accuracy= 0.6184646321268039
predicted        Dark Trap  Emo  Hiphop  Rap  RnB  Trap Metal  \
target                                                          
Dark Trap              458   10      21   15   22          64   
Emo                     16  241       9    1   33           6   
Hiphop                  49   11     225   22   92           6   
Rap                     20    2      29  126   29           7   
RnB                     42   25      84    9  156           0   
Trap Metal              99    6       9    7    4          94   
Underground Rap        168   12     153   62   58          86   
dnb                      3    8       5    0    3           0   
hardstyle               10    9       0    0    3           2   
psytrance                5    0       0    0    0           1   
techhouse                3    5       0    0    2           3   
techno                   5    0       0    0    0           0   
trance                  21    8       0    0    1        

In [None]:
# Add the default random forest's validation scores to the leaderboard and
# re-rank it by macro F1 (best first).
results_df = compute_metrics(y_val, y_pred, labels, "RandomForest-default", results_df)
results_df = results_df.sort_values(by='F1_Avg', ascending=False)
results_df


Unnamed: 0,Algorithm,Accuracy,F1_Avg,F1_Dark Trap,F1_Emo,F1_Trap Metal,F1_techhouse,F1_Underground Rap,F1_Hiphop,F1_psytrance,F1_techno,F1_hardstyle,F1_trap,F1_trance,F1_RnB,F1_dnb,F1_Rap
0,MLP-best,0.653939,0.659028,0.491882,0.65328,0.350299,0.841159,0.535264,0.459556,0.899317,0.82646,0.821138,0.771637,0.819218,0.37469,0.946984,0.435511
1,ExtrTrees-best2,0.638329,0.655252,0.469866,0.699862,0.329177,0.842528,0.450266,0.4,0.91018,0.830287,0.851958,0.778797,0.830986,0.418511,0.961111,0.4
2,ExtrTrees-best,0.637488,0.654126,0.469604,0.698061,0.326633,0.841683,0.450437,0.4,0.911111,0.826923,0.851145,0.778455,0.83125,0.408493,0.960222,0.403743
10,RandomForest-default,0.633045,0.64497,0.476835,0.689557,0.270504,0.846311,0.47082,0.3772,0.907376,0.832192,0.844575,0.778243,0.824859,0.364061,0.95514,0.391913
3,MLP-default,0.64121,0.644428,0.502179,0.638239,0.341085,0.800817,0.52973,0.402053,0.883682,0.825234,0.8,0.758985,0.781457,0.41326,0.929163,0.416107
4,ExtraTrees-default,0.619476,0.626704,0.454877,0.679831,0.246533,0.819939,0.478049,0.349398,0.883362,0.826873,0.809886,0.770031,0.781919,0.355342,0.938776,0.379045
5,LogisticRegression-best,0.565802,0.554464,0.433135,0.392573,0.341463,0.730382,0.52053,0.394366,0.829841,0.790102,0.587452,0.661856,0.657729,0.301691,0.759176,0.362205
6,GuassianNB-default,0.548151,0.549127,0.335694,0.394756,0.232479,0.779847,0.46356,0.399027,0.786834,0.736089,0.73258,0.608076,0.659341,0.311759,0.926214,0.321519
7,GuassianNB-best,0.548151,0.54905,0.335932,0.395044,0.232479,0.777656,0.46356,0.399027,0.786834,0.735504,0.733772,0.608076,0.658288,0.311759,0.927255,0.321519
8,LogisticRegression-default,0.556556,0.541332,0.427497,0.39031,0.268293,0.724696,0.523452,0.383731,0.791209,0.757601,0.580584,0.660277,0.6245,0.304348,0.757895,0.384252


In [None]:
# Persist the fitted default random forest to the models directory.
dump(model_rf1, os.path.join(models_dir, "RandomForest_default.joblib"))

['/content/drive/MyDrive/Machine Learning Deliverable/models/RandomForest_default.joblib']

### Random Forest Cross-Validation

In [None]:
# Metrics tracked during grid search: short alias -> scikit-learn scorer name.
scoring_dict = dict(f1_mac='f1_macro', acc='accuracy')

# Column names from a grid-search results table: the tuned parameters first,
# then the mean test score for each metric alias above.
_rf_param_names = (
    'n_estimators',
    'max_depth',
    'min_samples_split',
    'min_samples_leaf',
    'min_impurity_decrease',
    'class_weight',
)
scoring_cols = [f'param_{param}' for param in _rf_param_names]
scoring_cols += ['mean_test_f1_mac', 'mean_test_acc']

In [None]:
# Exhaustive 5-fold grid search over the random-forest hyper-parameters.
# The grid spans 3*1*4*6*5*3*5*2 = 10,800 candidates, i.e. 54,000
# cross-validation fits with cv=5 — expect a very long run.
init_time = time()
rf_param_grid = {
    'n_estimators': [100, 500, 1000],
    'criterion': ['gini'],
    'max_depth': [2, 4, 6, None],
    'min_samples_split': [2, 4, 6, 8, 10, 12],
    'min_samples_leaf': [1, 2, 3, 5, 8],
    'max_features': ['sqrt', 'log2', None],
    'min_impurity_decrease': [0.0, 0.0001, 0.001, 0.01, 0.02],
    'class_weight': ['balanced', None],
}
rf_cv = GridSearchCV(
    estimator=model_rf1,
    param_grid=rf_param_grid,
    scoring=scoring_dict,
    refit='f1_mac',
    cv=5,
    return_train_score=False,
    n_jobs=-1,
)
# fit() returns the search object itself, so rf_5CV and rf_cv are the same object.
rf_5CV = rf_cv.fit(X_train, y_train)
print(timedelta(seconds=(time() - init_time)))

KeyboardInterrupt: 

In [None]:
# Hyper-parameters selected by the random-forest grid search (not the QDA search).
best_params = rf_5CV.best_params_
best_params

In [None]:
# Score the refit best random forest on the validation split. The fitted search
# is bound to `rf_5CV`; the lower-case `rf_5cv` is never defined.
y_pred = rf_5CV.predict(X_val)

results_df = compute_metrics(y_val, y_pred, labels, "RandomForest-best", results_df)
# Reassign the sorted frame rather than sorting in place.
results_df = results_df.sort_values(by='F1_Avg', ascending=False)
results_df

Unnamed: 0,Algorithm,Accuracy,F1_Avg,F1_Dark Trap,F1_Emo,F1_Trap Metal,F1_techhouse,F1_Underground Rap,F1_Hiphop,F1_psytrance,F1_techno,F1_hardstyle,F1_trap,F1_trance,F1_RnB,F1_dnb,F1_Rap
0,MLP-best,0.653939,0.659028,0.491882,0.65328,0.350299,0.841159,0.535264,0.459556,0.899317,0.82646,0.821138,0.771637,0.819218,0.37469,0.946984,0.435511
1,ExtrTrees-best2,0.638329,0.655252,0.469866,0.699862,0.329177,0.842528,0.450266,0.4,0.91018,0.830287,0.851958,0.778797,0.830986,0.418511,0.961111,0.4
2,ExtrTrees-best,0.637488,0.654126,0.469604,0.698061,0.326633,0.841683,0.450437,0.4,0.911111,0.826923,0.851145,0.778455,0.83125,0.408493,0.960222,0.403743
11,RandomForest-best,0.635927,0.653906,0.485569,0.669271,0.365517,0.839879,0.439761,0.405363,0.907534,0.826575,0.840271,0.778797,0.82923,0.395604,0.954334,0.416979
10,RandomForest-default,0.633045,0.64497,0.476835,0.689557,0.270504,0.846311,0.47082,0.3772,0.907376,0.832192,0.844575,0.778243,0.824859,0.364061,0.95514,0.391913
3,MLP-default,0.64121,0.644428,0.502179,0.638239,0.341085,0.800817,0.52973,0.402053,0.883682,0.825234,0.8,0.758985,0.781457,0.41326,0.929163,0.416107
4,ExtraTrees-default,0.619476,0.626704,0.454877,0.679831,0.246533,0.819939,0.478049,0.349398,0.883362,0.826873,0.809886,0.770031,0.781919,0.355342,0.938776,0.379045
5,LogisticRegression-best,0.565802,0.554464,0.433135,0.392573,0.341463,0.730382,0.52053,0.394366,0.829841,0.790102,0.587452,0.661856,0.657729,0.301691,0.759176,0.362205
6,GuassianNB-default,0.548151,0.549127,0.335694,0.394756,0.232479,0.779847,0.46356,0.399027,0.786834,0.736089,0.73258,0.608076,0.659341,0.311759,0.926214,0.321519
7,GuassianNB-best,0.548151,0.54905,0.335932,0.395044,0.232479,0.777656,0.46356,0.399027,0.786834,0.735504,0.733772,0.608076,0.658288,0.311759,0.927255,0.321519


In [None]:
# Write the updated leaderboard back to the CSV it was loaded from;
# index=False keeps the row index out of the file.
results_df.to_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/ML_results21_5.csv', index=False)

In [None]:
# Persist the random-forest search result. Read the parameters straight from the
# search object so the JSON cannot pick up another model's `best_params`.
with open(os.path.join(models_dir, "RandomForest_bestparams.json"), "w") as f:
    json.dump(rf_5CV.best_params_, f)
# The fitted search is bound to `rf_5CV`; the lower-case `rf_5cv` is never defined.
dump(rf_5CV, os.path.join(models_dir, 'RandomForest_best.joblib'))

['/content/drive/MyDrive/Machine Learning Deliverable/models/RandomForest_best.joblib']

In [None]:
# Confusion matrix on the validation split for the most recently computed y_pred.
print(confusion(y_val, y_pred))