In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

from notebooks_classification.classification import compute_classifier
from preprocessing import preprocess_data

## Data import

In [6]:
# Name of the target column, kept as a one-element list because it is passed
# in that form to preprocess_data in the cells below.
label = ['genre']

# Columns used as model inputs (13 in total).
features_columns = [
    'danceability',
    'energy',
    'key',
    'loudness',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

# Training set, read from a path relative to the notebook's working directory.
df = pd.read_csv('../data/Spotify_train_dataset.csv')

## Preprocess

### Without Standardization

In [7]:
# Feature matrix with both optional steps (z_score, standardize) switched off,
# returned together with the matching labels.
df_normal, labels = preprocess_data(
    df,
    features_columns,
    label,
    z_score=False,
    standardize=False,
)

# Map each genre label to an integer class id; `le.classes_` keeps the
# original names so they can be recovered later.
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

------------------------------------------
            Preprocessing data            
------------------------------------------
Get dataset
Shape of the data to process : (31728, 20)
------------------------------------------
Extract labels ...
Extract inputs ...
------------------------------------------
Data shape after preprocessing : (31728, 13)
Labels shape : (31728,)
Return dataset(s) ...
Preprocessing finished
------------------------------------------


### With Standardization

In [8]:
# Same extraction as the non-standardized variant, but with standardize=True
# (the helper logs "Center and reduce inputs ..." for this setting).
df_stand, labels = preprocess_data(
    df,
    features_columns,
    label,
    z_score=False,
    standardize=True,
)

# Re-fit the encoder on the labels returned by this call.
# NOTE(review): this overwrites `le` / `labels_encoded` from the previous cell;
# presumably the labels are identical in both runs — confirm.
le = LabelEncoder()
labels_encoded = le.fit_transform(labels)

------------------------------------------
            Preprocessing data            
------------------------------------------
Get dataset
Shape of the data to process : (31728, 20)
------------------------------------------
Extract labels ...
Extract inputs ...
Center and reduce inputs ...
------------------------------------------
Data shape after preprocessing : (31728, 13)
Labels shape : (31728,)
Return dataset(s) ...
Preprocessing finished
------------------------------------------


### Basic Model

In [5]:
# Baseline AdaBoost: 100 boosting rounds, fixed seed so runs are repeatable.
print("Adaboost classifier basic param")
ABC = AdaBoostClassifier(n_estimators=100, random_state=0)

# NOTE(review): compute_classifier is a project helper; it appears to print an
# accuracy and return the predictions — confirm against its definition.
# The same estimator object is handed to both evaluations.
print("Without Standardization : ")
y_pred_normal = compute_classifier(df_normal, labels_encoded, ABC)

print("\nWith Standardization : ")
y_pred_stand = compute_classifier(df_stand, labels_encoded, ABC)

# Per-genre precision / recall / F1, reported for the standardized run only.
print(
    classification_report(
        labels_encoded,
        y_pred_stand,
        target_names=le.classes_,
    )
)


Adaboost classifier basic param
Without Standardization : 
Accuracy : 0.3533787191124559

With Standardization : 
Accuracy : 0.3533787191124559
                 precision    recall  f1-score   support

      Dark Trap       0.29      0.18      0.22      3378
            Emo       0.19      0.04      0.07      1262
         Hiphop       0.27      0.12      0.16      2255
            Pop       0.07      0.00      0.01       336
            Rap       0.16      0.30      0.21      1420
            RnB       0.09      0.02      0.03      1605
     Trap Metal       0.07      0.03      0.04      1474
Underground Rap       0.28      0.51      0.36      4378
            dnb       0.57      0.49      0.53      2213
      hardstyle       0.39      0.40      0.40      2178
      psytrance       0.75      0.70      0.72      2214
      techhouse       0.28      0.11      0.16      2254
         techno       0.43      0.67      0.52      2226
         trance       0.35      0.56      0.43      2275


In [None]:
# Confusion matrix for the predictions on the standardized features:
# rows are the true genres, columns the predicted genres.
# Explicit `labels` pins the row/column order to the encoder's class order so
# the tick labels below are guaranteed to line up with the matrix.
class_ids = list(range(len(le.classes_)))
cf_matrix = confusion_matrix(labels_encoded, y_pred_stand, labels=class_ids)

fig, ax = plt.subplots(figsize=(15, 15))

# fmt="d" prints the raw integer counts; seaborn's default annotation format
# (".2g") would render four-digit counts in scientific notation.
sns.heatmap(
    cf_matrix,
    annot=True,
    fmt="d",
    cmap='hot_r',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)

# Label the figure so it can be read without the surrounding code.
ax.set_xlabel("Predicted genre")
ax.set_ylabel("True genre")
ax.set_title("AdaBoost confusion matrix (standardized features)")
plt.show()
