In [None]:
# Mount Google Drive at /content/drive; every data, results and model path used
# below lives under that mount point. force_remount=True re-mounts even if a
# previous mount is still present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Optional environment setup: uncomment the lines below to upgrade packages.
# (None of these pins a version, so results may differ between runs.)
# !pip install pandas --upgrade --user --quiet
# !pip install numpy --upgrade --user --quiet
# !pip install scipy --upgrade --user --quiet
# !pip install statsmodels --upgrade --user --quiet
# !pip install scikit-learn --upgrade --user --quiet
# !pip install tensorflow --user
# NOTE(review): this only loads the extension; no `%autoreload <mode>` is issued
# anywhere in the notebook, so automatic reloading is presumably not active — confirm intent.
%load_ext autoreload

In [None]:
import pickle as pkl
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype

import matplotlib.pyplot as plt
import seaborn as sns

from time import time
from datetime import timedelta

from sklearn.model_selection import train_test_split,  KFold, cross_validate, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVR, SVR, SVC

from sklearn.metrics import confusion_matrix, \
                  classification_report, accuracy_score,  precision_score, recall_score, f1_score

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last
# one. Several cells below rely on this to show more than one table.
InteractiveShell.ast_node_interactivity = "all"
# Show floats in DataFrame output with 3 decimal places.
pd.set_option('display.precision', 3)

In [None]:
# Extra imports
from pandas import read_csv
from sklearn.tree import DecisionTreeClassifier
import graphviz
from sklearn.tree import export_graphviz

from sklearn.model_selection import cross_val_predict, StratifiedKFold

from joblib import dump
import os
import json

In [None]:
import warnings
# Silence every warning for the whole session. Note that this also hides
# warnings emitted by libraries (e.g. about failed fits or deprecated options).
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator for reproducibility.
np.random.seed(6046) # for reproducibility

In [None]:
# Directory on the mounted Drive where fitted models and best-parameter files are written.
models_dir = '/content/drive/MyDrive/Machine Learning Deliverable/models'

In [None]:
# Load the full dataset; it is used here only for its shape, column names and genre labels.
spotify = pd.read_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/spotify_ml_21_5.csv')
spotify.shape
# All column names, in file order (includes the 'genre' target column).
column_list = list(spotify.columns.values)
# Distinct genres in order of first appearance in the file (NOT sorted).
genres_classes = spotify['genre'].unique()

(37667, 14)

In [None]:
def split_data(df):
  """
  Split a dataframe into training, validation and test sets.

  The 'genre' column is the target; every other column is a feature. Both
  splits are stratified on the target, so the class distribution is the same
  in each resulting set. First 33% of the rows are held out for testing, then
  33% of the remainder is held out for validation.

  Returns, in this order: X_train, X_test, X_val, y_train, y_test, y_val.
  """
  is_feature = df.columns != 'genre'
  features, target = df.loc[:, is_feature], df['genre']

  # Same proportion and seed for both splits.
  split_options = {'test_size': 0.33, 'random_state': 42}

  X_rest, X_test, y_rest, y_test = train_test_split(features, target, stratify=target, **split_options)
  X_train, X_val, y_train, y_val = train_test_split(X_rest, y_rest, stratify=y_rest, **split_options)

  return X_train, X_test, X_val, y_train, y_test, y_val

In [None]:
# Distinct target labels, in order of first appearance in the dataset.
labels = list(spotify['genre'].unique())   # extract in a list all the different labels of our target genre column
# Load the results recorded by earlier experiments so new rows are appended to
# them instead of starting from an empty table (pd.DataFrame()).
# NOTE(review): this fails if ML_results.csv does not exist yet.
results_df = pd.read_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/ML_results.csv')
def compute_metrics(y_true, y_pred, labels, algorithm_name, results_df=None):
  """
  Compute accuracy, macro-averaged F1 and one F1 score per class, and add them
  as a new row to a results table.

  Parameters:
    y_true: true labels.
    y_pred: predicted labels.
    labels: class labels to report; one 'F1_<label>' column is produced per label.
    algorithm_name: value stored in the 'Algorithm' column to identify the experiment.
    results_df: existing results table to extend. None or an empty frame starts
      a new table.

  Returns:
    A new DataFrame holding the previous rows plus the new one, with the index
    renumbered 0..n-1. The frame passed in is not modified.

  Values are matched to columns by NAME ('F1_<label>'), never by position, so a
  results table whose columns are in a different order than `labels` is still
  filled correctly. The row is appended with concat rather than
  `.loc[len(df)]`, which would overwrite an existing row whenever the index is
  not 0..n-1 (e.g. after dropping or sorting rows).
  """
  accuracy = accuracy_score(y_true, y_pred)
  f1_scores = f1_score(y_true, y_pred, average=None, labels=labels)
  f1_score_avg = f1_score(y_true, y_pred, average='macro')

  row = {'Algorithm': algorithm_name, 'Accuracy': accuracy, 'F1_Avg': f1_score_avg}
  for label, score in zip(labels, f1_scores):
    row[f'F1_{label}'] = score
  new_row = pd.DataFrame([row], columns=list(row))

  # If results_df is not provided or empty, the new row is the whole table
  if results_df is None or results_df.empty:
    return new_row

  return pd.concat([results_df, new_row], ignore_index=True, sort=False)

In [None]:
def confusion(true, pred):
    """
    Function for pretty printing confusion matrices for every different class in our target
    """
    pred = pd.Series(pred)
    true = pd.Series(true)

    true.name = 'target'
    pred.name = 'predicted'
    cm = pd.crosstab(true.reset_index(drop=True), pred.reset_index(drop=True))
    cm = cm[cm.index]
    return cm

# Decision tree
Now we proceed to retrieve the train, validation and test datasets to train the decision tree

In [None]:
# Load the pre-computed train / validation / test splits. Each pickle holds an
# (X, y) pair.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/train.pkl','rb') as f:
  X_train, y_train = pkl.load(f)

with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/val.pkl','rb') as f:
  X_val, y_val = pkl.load(f)

with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/test.pkl','rb') as f:
  X_test, y_test = pkl.load(f)

In [None]:
# Baseline: decision tree with all default hyperparameters, fitted on the training split.
# NOTE(review): no random_state is passed, so reproducibility presumably relies on the global NumPy seed set earlier.
model_tree = DecisionTreeClassifier().fit(X_train, y_train)

In [None]:
# Remove rows that are exact duplicates of an earlier row (the saved results
# show the same QDA entry twice, at index labels 6 and 7). Dropping by content
# instead of by the hard-coded label 7 makes this cell safe to re-run: once the
# CSV has been re-saved without the duplicate, label 7 refers to a different,
# valid row that must not be deleted.
results_df = results_df.drop_duplicates()
# results_df

In [None]:
# Persist the fitted default tree in the models directory.
dump(model_tree, os.path.join(models_dir, "DecisionTrees_default.joblib"))

['/content/drive/MyDrive/Machine Learning Deliverable/models/DecisionTrees_default.joblib']

In [None]:
# Evaluate the default tree on the validation split.
y_pred = model_tree.predict(X_val)

# Record the validation metrics as a new row, then rank all experiments by macro F1.
results_df = compute_metrics(y_val, y_pred, labels, "DecisionTrees-default", results_df)
results_df.sort_values(by='F1_Avg', ascending=False,inplace=True)
results_df

# Per-class breakdown of the validation predictions.
confusion(y_val, y_pred)

Unnamed: 0,Algorithm,Accuracy,F1_Avg,F1_hardstyle,F1_Underground Rap,F1_Emo,F1_Dark Trap,F1_Hiphop,F1_techhouse,F1_Rap,F1_trance,F1_techno,F1_Trap Metal,F1_psytrance,F1_trap,F1_RnB,F1_dnb
0,RandomForest-best,0.634,0.65,0.796,0.455,0.678,0.497,0.429,0.86,0.439,0.812,0.831,0.326,0.898,0.699,0.413,0.961
1,ExtrTrees-best,0.635,0.648,0.837,0.461,0.693,0.494,0.43,0.854,0.415,0.821,0.828,0.274,0.906,0.709,0.39,0.962
2,ExtrTrees-best2,0.635,0.647,0.834,0.458,0.689,0.494,0.431,0.855,0.416,0.821,0.827,0.271,0.905,0.707,0.392,0.964
3,RandomForest-default,0.629,0.637,0.794,0.478,0.702,0.486,0.392,0.857,0.395,0.824,0.834,0.243,0.897,0.686,0.372,0.963
4,SVM,0.607,0.63,0.475,0.619,0.393,0.813,0.373,0.41,0.887,0.827,0.779,0.749,0.796,0.403,0.925,0.372
5,ExtraTrees-default,0.613,0.616,0.765,0.482,0.695,0.464,0.368,0.846,0.376,0.785,0.824,0.214,0.879,0.641,0.361,0.928
6,QDA,0.598,0.607,0.765,0.475,0.562,0.465,0.387,0.827,0.377,0.73,0.81,0.302,0.859,0.616,0.37,0.949
8,DecisionTrees-best,0.596,0.594,0.413,0.545,0.244,0.81,0.502,0.392,0.866,0.756,0.787,0.664,0.769,0.261,0.91,0.401
9,GuassianNB-best,0.574,0.578,0.737,0.488,0.471,0.456,0.414,0.832,0.382,0.649,0.781,0.267,0.79,0.585,0.308,0.932
10,GuassianNB-default,0.574,0.578,0.737,0.488,0.471,0.456,0.414,0.832,0.382,0.649,0.781,0.267,0.79,0.585,0.308,0.932


predicted,Dark Trap,Emo,Hiphop,Rap,RnB,Trap Metal,Underground Rap,dnb,hardstyle,psytrance,techhouse,techno,trance,trap
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Dark Trap,371,22,67,42,32,110,193,12,16,7,21,20,45,41
Emo,32,178,28,7,25,13,28,8,13,0,1,1,11,9
Hiphop,61,23,197,53,103,21,174,4,1,0,5,1,2,8
Rap,48,4,44,129,37,17,112,0,1,0,3,0,0,5
RnB,45,37,98,24,115,14,100,2,4,0,2,0,6,4
Trap Metal,122,12,21,14,9,84,123,2,11,1,3,1,4,12
Underground Rap,264,22,211,145,90,139,358,2,6,2,13,2,5,17
dnb,11,12,16,1,4,3,9,476,0,0,0,0,0,0
hardstyle,21,21,1,1,4,9,4,0,369,12,0,0,1,60
psytrance,10,0,2,0,0,1,0,0,17,479,2,31,29,6


In [None]:
# Report the size of the fitted default tree (maximum depth and total node count).
print('Tree depth: {}\nNodes: {}'.format(model_tree.tree_.max_depth, model_tree.tree_.node_count))

Tree depth: 32
Nodes: 9207


The default tree is very deep (see the depth and node count above), which may indicate overfitting, so we try different hyperparameters to improve the results. We use grid search with 5-fold cross-validation to find the best parameters.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

criterion = ['gini', 'entropy']

max_depths = [None, 5, 10, 15, 20]
# An integer min_samples_split must be at least 2; a value of 1 makes every fit
# that uses it fail.
min_samples_split = [2, 3, 4, 5]
min_samples_leaf = [1, 2, 3, 4, 5]
# 'auto' is not listed: for classification trees it was only an alias of 'sqrt'
# and it is rejected by scikit-learn >= 1.3.
max_features = ['sqrt', 'log2', None]

init_time = time()
# Separate name so the fitted default tree in `model_tree` is not overwritten.
tree_estimator = DecisionTreeClassifier()

# Classes in sorted order; 'f1_class_<i>' refers to class_labels[i].
class_labels = np.unique(y_train)


def f1_scorer_for_class(class_label):
    """
    Return a scorer that computes the F1 score of a single class.

    The target is multiclass, so the default binary averaging of f1_score
    (which `pos_label` belongs to) cannot be used. Restricting `labels` to one
    class and macro-averaging over that single class yields that class's F1
    as a scalar.
    """
    return make_scorer(f1_score, labels=[class_label], average='macro')


# The individual names are kept because later cells refer to them.
f1_class_0_scorer = f1_scorer_for_class(class_labels[0])
f1_class_1_scorer = f1_scorer_for_class(class_labels[1])
f1_class_2_scorer = f1_scorer_for_class(class_labels[2])
f1_class_3_scorer = f1_scorer_for_class(class_labels[3])
f1_class_4_scorer = f1_scorer_for_class(class_labels[4])
f1_class_5_scorer = f1_scorer_for_class(class_labels[5])
f1_class_6_scorer = f1_scorer_for_class(class_labels[6])
f1_class_7_scorer = f1_scorer_for_class(class_labels[7])
f1_class_8_scorer = f1_scorer_for_class(class_labels[8])
f1_class_9_scorer = f1_scorer_for_class(class_labels[9])
f1_class_10_scorer = f1_scorer_for_class(class_labels[10])
f1_class_11_scorer = f1_scorer_for_class(class_labels[11])
f1_class_12_scorer = f1_scorer_for_class(class_labels[12])
f1_class_13_scorer = f1_scorer_for_class(class_labels[13])

scoring_dict = {
    'f1_mac': 'f1_macro',
    'f1_class_0': f1_class_0_scorer,
    'f1_class_1': f1_class_1_scorer,
    'f1_class_2': f1_class_2_scorer,
    'f1_class_3': f1_class_3_scorer,
    'f1_class_4': f1_class_4_scorer,
    'f1_class_5': f1_class_5_scorer,
    'f1_class_6': f1_class_6_scorer,
    'f1_class_7': f1_class_7_scorer,
    'f1_class_8': f1_class_8_scorer,
    'f1_class_9': f1_class_9_scorer,
    'f1_class_10': f1_class_10_scorer,
    'f1_class_11': f1_class_11_scorer,
    'f1_class_12': f1_class_12_scorer,
    'f1_class_13': f1_class_13_scorer,
    'acc': 'accuracy'
}

# 5-fold grid search; the model refitted on the whole training split is the one
# with the best mean macro F1.
trc = GridSearchCV(estimator=tree_estimator,
                   scoring=scoring_dict,
                   param_grid={
                       'criterion': criterion,
                       'max_depth': max_depths,
                       'min_samples_split': min_samples_split,
                       'min_samples_leaf': min_samples_leaf,
                       'max_features': max_features
                   },
                   cv=5,
                   return_train_score=False,
                   refit='f1_mac')

model_5CV = trc.fit(X_train, y_train)
print(timedelta(seconds=(time() - init_time)))

KeyboardInterrupt: 

In [None]:
# Columns of the cross-validation results to show: the searched parameters plus
# mean test macro F1, the F1 of the first two classes, and accuracy.
scoring_cols = [
    'param_criterion', 'param_max_depth', 'param_max_features',
    'param_min_samples_leaf', 'param_min_samples_split', 'mean_test_f1_mac',
    'mean_test_f1_class_0', 'mean_test_f1_class_1', 'mean_test_acc'
]

# Top 5 parameter combinations by mean macro F1.
# Requires the grid-search cell above to have run to completion (it defines model_5CV).
pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()

NameError: name 'model_5CV' is not defined

The best hyperparameters are the following: 'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_features': None

In [None]:
# Parameter combination selected by the decision-tree grid search.
# NOTE: `best_params` stays bound to the decision-tree result for the rest of the notebook.
best_params = model_5CV.best_params_
best_params

In [None]:
# Save the selected parameters as JSON text, and the whole fitted search object
# (not just its best estimator) with joblib.
with open(os.path.join(models_dir, "DecisionTrees_bestparams.txt"), "w") as f:
  f.write(json.dumps(best_params))
dump(model_5CV, os.path.join(models_dir, "DecisionTrees_best.joblib"))

In [None]:
# Evaluate the tuned tree on the validation split (predict on the search object
# uses its refitted best estimator).
y_pred = model_5CV.predict(X_val)

# Record the validation metrics as a new row, then rank all experiments by macro F1.
results_df = compute_metrics(y_val, y_pred, labels, "DecisionTrees-best", results_df)
results_df.sort_values(by='F1_Avg', ascending=False,inplace=True)
results_df

# Per-class breakdown of the validation predictions.
confusion(y_val, y_pred)

In [None]:
best_tree_model = model_5CV.best_estimator_

# Feature names: prefer the names the estimator recorded when it was fitted, so
# they are guaranteed to match the training columns. Fall back to the dataset
# columns (skipping the first one) when the training data carried no names.
tree_feature_names = getattr(best_tree_model, 'feature_names_in_', None)
if tree_feature_names is None:
    tree_feature_names = column_list[1:14]

# Class names must follow the estimator's own (sorted) class order. The order
# of first appearance in the dataset (`genres_classes`) would attach the wrong
# genre to the nodes.
tree_class_names = [str(class_label) for class_label in best_tree_model.classes_]

# Only the top 3 levels are exported to keep the drawing readable.
dot_data = export_graphviz(best_tree_model, out_file=None,
                         feature_names=list(tree_feature_names),
                         class_names=tree_class_names,
                         filled=True, rounded=True,
                         special_characters=True,
                         rotate=True,
                         proportion=True,
                         max_depth=3)

# Draw the decision tree
graph = graphviz.Source(dot_data)
graph.render("/content/drive/MyDrive/Machine Learning Deliverable/decision_tree_best")
graph.view()

# LDA
Now we proceed to retrieve the train, validation and test datasets to train the LDA

In [None]:
# Load the pre-computed train / validation / test splits. Each pickle holds an
# (X, y) pair.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/train.pkl','rb') as f:
  X_train, y_train = pkl.load(f)

with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/val.pkl','rb') as f:
  X_val, y_val = pkl.load(f)

with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/test.pkl','rb') as f:
  X_test, y_test = pkl.load(f)

We use grid search with 5-fold cross-validation to find the best parameters.

In [None]:
from sklearn.metrics import make_scorer

lda_model = LinearDiscriminantAnalysis()

# Scorers are built here so this cell does not depend on variables created in
# another section. 'f1_class_<i>' is the F1 of the i-th class in sorted order.
# The target is multiclass, so each per-class scorer restricts `labels` to one
# class and macro-averages over it, which yields that class's F1 as a scalar.
scoring_dict = {'f1_mac': 'f1_macro'}
for position, class_label in enumerate(np.unique(y_train)):
    scoring_dict[f'f1_class_{position}'] = make_scorer(
        f1_score, labels=[class_label], average='macro')
scoring_dict['acc'] = 'accuracy'

# Shrinkage is only supported by the 'lsqr' and 'eigen' solvers, so 'svd' gets
# its own sub-grid without it. (The previous conditional expression was always
# true and therefore paired shrinkage='auto' with 'svd' as well.)
param_grid = [
    {'solver': ['svd'], 'shrinkage': [None]},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': [None, 'auto']},
]

# 5-fold grid search; the model refitted on the whole training split is the one
# with the best mean macro F1.
trc = GridSearchCV(estimator=lda_model,
                   scoring=scoring_dict,
                   param_grid=param_grid,
                   cv=5,
                   return_train_score=False,
                   refit='f1_mac')

model_5CV = trc.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Columns of the cross-validation results to show for LDA.
scoring_cols = [
    'mean_test_f1_mac', 'mean_test_acc', 'param_solver', 'param_shrinkage'
]

# Top 5 parameter combinations by mean macro F1.
pd.DataFrame(model_5CV.cv_results_).sort_values(by='mean_test_f1_mac', ascending=False)[scoring_cols].head()

The best hyperparameters are the following: 'solver': 'lsqr', 'shrinkage': 'auto'

In [None]:
# Confusion matrix of the tuned LDA on the TRAINING split (i.e. on the data it was fitted on).
cf = confusion(y_train, pd.Series(model_5CV.predict(X_train)))
print(cf)

In [None]:
# Persist the whole fitted LDA search object in the models directory.
dump(model_5CV, os.path.join(models_dir, "LDA.joblib"))

In [None]:
# Evaluate the tuned LDA on the validation split.
y_pred = model_5CV.predict(X_val)

# Record the validation metrics as a new row, then rank all experiments by macro F1.
results_df = compute_metrics(y_val, y_pred, labels, "LDA", results_df)
results_df.sort_values(by='F1_Avg', ascending=False,inplace=True)
results_df

# KNN
Now we proceed to retrieve the train, validation and test datasets to train the KNN

In [None]:
# Load the pre-computed train / validation / test splits. Each pickle holds an
# (X, y) pair.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/train.pkl','rb') as f:
  X_train, y_train = pkl.load(f)

with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/val.pkl','rb') as f:
  X_val, y_val = pkl.load(f)

with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/test.pkl','rb') as f:
  X_test, y_test = pkl.load(f)

We use grid search with 5-fold cross-validation to find the best parameters.

In [None]:
knn = KNeighborsClassifier()

# 5-fold grid search over the number of neighbours and the distance metric; the
# model refitted on the whole training split is the one with the best mean
# macro F1. `scoring_dict` is the scoring dictionary defined in an earlier section.
# 'minkowski' is not listed: with the estimator's default p=2 it is the same
# distance as 'euclidean', so it only repeated a third of the fits.
knn_cv = GridSearchCV(
    estimator=knn,
    param_grid={
        'n_neighbors': [1, 3, 5, 7, 10, 15, 20],
        'metric': ['euclidean', 'manhattan']
    },
    scoring=scoring_dict,
    refit='f1_mac',
    cv=5
)

knn_model = knn_cv.fit(X_train, y_train)
results_cv = pd.DataFrame(knn_model.cv_results_)

KeyboardInterrupt: 

In [None]:
# Columns of the cross-validation results to show for KNN: searched parameters
# plus mean and standard deviation of test accuracy and macro F1.
cols = [
    'param_n_neighbors', 'param_metric',
    'mean_test_acc', 'mean_test_f1_mac',
    'std_test_acc', 'std_test_f1_mac'
]
# All parameter combinations, best mean macro F1 first.
results_cv[cols].sort_values(by='mean_test_f1_mac',ascending=False)

The best hyperparameters are the following: 'n_neighbors': 15, 'metric': 'manhattan'

In [None]:
# Evaluate the tuned KNN on the validation split.
y_pred = knn_model.predict(X_val)

# Record the validation metrics as a new row, then rank all experiments by macro F1.
results_df = compute_metrics(y_val, y_pred, labels, "KNN_best", results_df)
results_df.sort_values(by='F1_Avg', ascending=False,inplace=True)
results_df

In [None]:
# Save the parameters selected by the KNN search as JSON text, and the whole
# fitted search object with joblib. The parameters are read from `knn_model`
# itself: the `best_params` variable belongs to the decision-tree search.
with open(os.path.join(models_dir, "KNN_bestparams.txt"), "w") as f:
  f.write(json.dumps(knn_model.best_params_))
dump(knn_model, os.path.join(models_dir, "KNN_best.joblib"))

# Support vector machine
Now we proceed to retrieve the train, validation and test datasets to train the SVM

In [None]:
# Load the pre-computed train / validation / test splits. Each pickle holds an
# (X, y) pair.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/train.pkl','rb') as f:
  X_train, y_train = pkl.load(f)

with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/val.pkl','rb') as f:
  X_val, y_val = pkl.load(f)

with open('/content/drive/MyDrive/Machine Learning Deliverable/data/split/test.pkl','rb') as f:
  X_test, y_test = pkl.load(f)

We use grid search with 5-fold cross-validation to find the best parameters.

In [None]:
init_time = time()

# class_weight='balanced' weights classes inversely to their frequency.
svm = SVC(class_weight='balanced')

C_values = [0.1, 1, 3, 5, 6, 7, 9]
gamma_values = ['scale', 'auto']

# One sub-grid per kernel, each listing only the parameters that kernel uses:
# gamma applies to rbf and poly, degree only to poly. A single crossed grid
# would refit identical linear and rbf models for every gamma/degree value.
param_grid = [
    {'kernel': ['linear'], 'C': C_values},
    {'kernel': ['rbf'], 'C': C_values, 'gamma': gamma_values},
    {'kernel': ['poly'], 'C': C_values, 'gamma': gamma_values, 'degree': [2, 3, 4, 5]},
]

# 5-fold grid search; the model refitted on the whole training split is the one
# with the best mean macro F1.
trc = GridSearchCV(estimator=svm,
                   param_grid=param_grid,
                   scoring=['accuracy', 'recall_macro', 'f1_macro'],
                   cv=5,
                   return_train_score=True,
                   refit='f1_macro')

model_5CV = trc.fit(X_train, y_train)
print(timedelta(seconds=(time() - init_time)))

model_5CV.best_score_
model_5CV.best_params_

0:03:20.120481


0.6204327501208546

{'C': 6, 'degree': 4, 'gamma': 'scale', 'kernel': 'poly'}

In [None]:
# Cross-validation results for the SVM search: fit time, searched parameters,
# and mean / standard deviation of each test metric, best mean macro F1 first.
pd.DataFrame(model_5CV.cv_results_).loc[:, [
    'mean_fit_time',
    'std_fit_time',
    'param_C',
    'param_kernel',
    'param_degree',
    'param_gamma',
    'mean_test_accuracy',
    'std_test_accuracy',
    'mean_test_recall_macro',
    'std_test_recall_macro',
    'mean_test_f1_macro',
    'std_test_f1_macro',
]].sort_values(by='mean_test_f1_macro',ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,param_C,param_kernel,param_degree,param_gamma,mean_test_accuracy,std_test_accuracy,mean_test_recall_macro,std_test_recall_macro,mean_test_f1_macro,std_test_f1_macro
0,23.362,4.695,6,poly,4,scale,0.599,0.007,0.641,0.01,0.62,0.006


The best hyperparameters are the following: 'C': 6, 'kernel': 'poly', 'gamma': 'scale', 'degree': 4

In [None]:
# Mean cross-validated macro F1 (the refit metric) of the selected SVM parameters.
model_5CV.best_score_

0.6204327501208546

In [None]:
# Evaluate the tuned SVM on the validation split.
y_pred = model_5CV.predict(X_val)

# Per-class precision, recall and F1 on the validation split.
print(classification_report(y_val,y_pred))

                 precision    recall  f1-score   support

      Dark Trap       0.60      0.40      0.48      1491
            Emo       0.53      0.73      0.61       528
         Hiphop       0.44      0.41      0.42       975
            Rap       0.29      0.56      0.38       597
            RnB       0.36      0.47      0.41       673
     Trap Metal       0.33      0.49      0.39       626
Underground Rap       0.50      0.29      0.37      1905
            dnb       0.94      0.93      0.94       794
      hardstyle       0.76      0.81      0.79       752
      psytrance       0.90      0.89      0.89       861
      techhouse       0.77      0.85      0.81       723
         techno       0.83      0.83      0.83       869
         trance       0.78      0.79      0.79       902
           trap       0.78      0.73      0.75       735

       accuracy                           0.61     12431
      macro avg       0.63      0.66      0.63     12431
   weighted avg       0.63   

In [None]:
# Persist the whole fitted SVM search object in the models directory.
dump(model_5CV, os.path.join(models_dir, "SVM.joblib"))

['/content/drive/MyDrive/Machine Learning Deliverable/models/SVM.joblib']

In [None]:
# `y_pred` holds the SVM predictions for X_val, so it is scored against y_val
# (not y_test), like every other row of the results table.
results_df = compute_metrics(y_val, y_pred, labels, "SVM_best", results_df)
results_df.sort_values(by='F1_Avg', ascending=False,inplace=True)
results_df

Unnamed: 0,Algorithm,Accuracy,F1_Avg,F1_hardstyle,F1_Underground Rap,F1_Emo,F1_Dark Trap,F1_Hiphop,F1_techhouse,F1_Rap,F1_trance,F1_techno,F1_Trap Metal,F1_psytrance,F1_trap,F1_RnB,F1_dnb
0,RandomForest-best,0.634,0.65,0.796,0.455,0.678,0.497,0.429,0.86,0.439,0.812,0.831,0.326,0.898,0.699,0.413,0.961
1,ExtrTrees-best,0.635,0.648,0.837,0.461,0.693,0.494,0.43,0.854,0.415,0.821,0.828,0.274,0.906,0.709,0.39,0.962
2,ExtrTrees-best2,0.635,0.647,0.834,0.458,0.689,0.494,0.431,0.855,0.416,0.821,0.827,0.271,0.905,0.707,0.392,0.964
3,RandomForest-default,0.629,0.637,0.794,0.478,0.702,0.486,0.392,0.857,0.395,0.824,0.834,0.243,0.897,0.686,0.372,0.963
17,SVM_best,0.612,0.634,0.483,0.614,0.394,0.809,0.371,0.424,0.894,0.829,0.787,0.753,0.788,0.41,0.936,0.38
4,SVM,0.607,0.63,0.475,0.619,0.393,0.813,0.373,0.41,0.887,0.827,0.779,0.749,0.796,0.403,0.925,0.372
5,ExtraTrees-default,0.613,0.616,0.765,0.482,0.695,0.464,0.368,0.846,0.376,0.785,0.824,0.214,0.879,0.641,0.361,0.928
6,QDA,0.598,0.607,0.765,0.475,0.562,0.465,0.387,0.827,0.377,0.73,0.81,0.302,0.859,0.616,0.37,0.949
7,QDA,0.598,0.607,0.765,0.475,0.562,0.465,0.387,0.827,0.377,0.73,0.81,0.302,0.859,0.616,0.37,0.949
8,DecisionTrees-best,0.596,0.594,0.413,0.545,0.244,0.81,0.502,0.392,0.866,0.756,0.787,0.664,0.769,0.261,0.91,0.401


In [None]:
# Write the updated results table back to the CSV it was loaded from. The index
# is not saved, so rows are renumbered from 0 the next time the file is read.
results_df.to_csv('/content/drive/MyDrive/Machine Learning Deliverable/data/ML_results.csv', index=False)