In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import seaborn as sns

In [2]:
dataOrig = pd.read_csv("feature_extraction/dataOrig.csv",usecols=lambda column: column != 'filename')

In [3]:
dataOrig.head()

Unnamed: 0,chroma_C,chroma_Csharp,chroma_D,chroma_Dsharp,chroma_E,chroma_F,chroma_Fsharp,chroma_G,chroma_Gsharp,chroma_A,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,genre
0,0.391512,0.429697,0.524729,0.385698,0.36634,0.433856,0.445597,0.593311,0.438685,0.413015,...,-0.841709,2.132198,1.083206,-0.077104,0.648073,-1.484534,3.725629,4.553656,4.274244,pop
1,0.544175,0.420254,0.424198,0.399732,0.300949,0.312941,0.363381,0.445987,0.342582,0.325424,...,2.602134,4.048971,2.384604,1.780818,-2.392593,-1.346176,-2.879179,-0.849864,-2.598134,pop
2,0.344232,0.477342,0.461315,0.383294,0.430534,0.360271,0.595679,0.406584,0.354102,0.464613,...,0.874806,-1.936897,-2.78313,-2.666644,-3.311479,1.708261,-0.152766,4.84682,2.906235,pop
3,0.392071,0.428158,0.523789,0.385555,0.366227,0.433113,0.444056,0.591267,0.437267,0.413407,...,-0.844541,2.09968,1.122588,0.006671,0.74826,-1.474304,3.672843,4.469373,4.198666,pop
4,0.396111,0.442692,0.408399,0.486187,0.353345,0.326342,0.372,0.300399,0.334868,0.334352,...,-0.441958,-1.08406,-3.450706,-2.475977,-3.302751,-2.228499,-2.649557,-2.744086,0.305748,pop


In [4]:
mappingGenresToIntegers = {
    'pop': 0,
    'metal': 1,
    'disco': 2,
    'blues': 3,
    'reggae': 4,
    'classical': 5,
    'rock': 6,
    'hiphop': 7,
    'country': 8,
    'jazz': 9
}


dataOrig['genre'] = dataOrig['genre'].map(mappingGenresToIntegers)
dataOrig.head()

Unnamed: 0,chroma_C,chroma_Csharp,chroma_D,chroma_Dsharp,chroma_E,chroma_F,chroma_Fsharp,chroma_G,chroma_Gsharp,chroma_A,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,genre
0,0.391512,0.429697,0.524729,0.385698,0.36634,0.433856,0.445597,0.593311,0.438685,0.413015,...,-0.841709,2.132198,1.083206,-0.077104,0.648073,-1.484534,3.725629,4.553656,4.274244,0
1,0.544175,0.420254,0.424198,0.399732,0.300949,0.312941,0.363381,0.445987,0.342582,0.325424,...,2.602134,4.048971,2.384604,1.780818,-2.392593,-1.346176,-2.879179,-0.849864,-2.598134,0
2,0.344232,0.477342,0.461315,0.383294,0.430534,0.360271,0.595679,0.406584,0.354102,0.464613,...,0.874806,-1.936897,-2.78313,-2.666644,-3.311479,1.708261,-0.152766,4.84682,2.906235,0
3,0.392071,0.428158,0.523789,0.385555,0.366227,0.433113,0.444056,0.591267,0.437267,0.413407,...,-0.844541,2.09968,1.122588,0.006671,0.74826,-1.474304,3.672843,4.469373,4.198666,0
4,0.396111,0.442692,0.408399,0.486187,0.353345,0.326342,0.372,0.300399,0.334868,0.334352,...,-0.441958,-1.08406,-3.450706,-2.475977,-3.302751,-2.228499,-2.649557,-2.744086,0.305748,0


In [5]:
X = dataOrig.iloc[:, :-1]
y = dataOrig.iloc[:, -1]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.15, random_state = 42)

In [7]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# GridSearchCV

In [8]:
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dtc = DecisionTreeClassifier()

DTCGridSearch = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=5, scoring='accuracy', verbose=10)

DTCGridSearch.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5; 1/72] START criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2
[CV 1/5; 1/72] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.435 total time=   0.1s
[CV 2/5; 1/72] START criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2
[CV 2/5; 1/72] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.453 total time=   0.0s
[CV 3/5; 1/72] START criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2
[CV 3/5; 1/72] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.506 total time=   0.0s
[CV 4/5; 1/72] START criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2
[CV 4/5; 1/72] END criterion=gini, max_depth=None, min_samples_leaf=1, min_samples_split=2;, score=0.441 total time=   0.0s
[CV 5/5; 1/72] START criterion=gini, max_depth=None, min_samples_leaf=

[CV 3/5; 9/72] END criterion=gini, max_depth=None, min_samples_leaf=4, min_samples_split=10;, score=0.459 total time=   0.1s
[CV 4/5; 9/72] START criterion=gini, max_depth=None, min_samples_leaf=4, min_samples_split=10
[CV 4/5; 9/72] END criterion=gini, max_depth=None, min_samples_leaf=4, min_samples_split=10;, score=0.482 total time=   0.0s
[CV 5/5; 9/72] START criterion=gini, max_depth=None, min_samples_leaf=4, min_samples_split=10
[CV 5/5; 9/72] END criterion=gini, max_depth=None, min_samples_leaf=4, min_samples_split=10;, score=0.435 total time=   0.0s
[CV 1/5; 10/72] START criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=2
[CV 1/5; 10/72] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=2;, score=0.482 total time=   0.0s
[CV 2/5; 10/72] START criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=2
[CV 2/5; 10/72] END criterion=gini, max_depth=10, min_samples_leaf=1, min_samples_split=2;, score=0.482 total time=   0.0s
[CV 3/5

[CV 4/5; 17/72] END criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=5;, score=0.482 total time=   0.0s
[CV 5/5; 17/72] START criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=5
[CV 5/5; 17/72] END criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=5;, score=0.418 total time=   0.0s
[CV 1/5; 18/72] START criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=10
[CV 1/5; 18/72] END criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=10;, score=0.465 total time=   0.0s
[CV 2/5; 18/72] START criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=10
[CV 2/5; 18/72] END criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=10;, score=0.476 total time=   0.0s
[CV 3/5; 18/72] START criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=10
[CV 3/5; 18/72] END criterion=gini, max_depth=10, min_samples_leaf=4, min_samples_split=10;, score=0.441 total time=   0.0s
[CV 4/5; 18

[CV 4/5; 25/72] END criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=2;, score=0.465 total time=   0.0s
[CV 5/5; 25/72] START criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=2
[CV 5/5; 25/72] END criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=2;, score=0.441 total time=   0.0s
[CV 1/5; 26/72] START criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=5
[CV 1/5; 26/72] END criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=5;, score=0.465 total time=   0.0s
[CV 2/5; 26/72] START criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=5
[CV 2/5; 26/72] END criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=5;, score=0.471 total time=   0.0s
[CV 3/5; 26/72] START criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=5
[CV 3/5; 26/72] END criterion=gini, max_depth=20, min_samples_leaf=4, min_samples_split=5;, score=0.441 total time=   0.0s
[CV 4/5; 26/72] S

[CV 4/5; 33/72] END criterion=gini, max_depth=30, min_samples_leaf=2, min_samples_split=10;, score=0.459 total time=   0.0s
[CV 5/5; 33/72] START criterion=gini, max_depth=30, min_samples_leaf=2, min_samples_split=10
[CV 5/5; 33/72] END criterion=gini, max_depth=30, min_samples_leaf=2, min_samples_split=10;, score=0.435 total time=   0.0s
[CV 1/5; 34/72] START criterion=gini, max_depth=30, min_samples_leaf=4, min_samples_split=2
[CV 1/5; 34/72] END criterion=gini, max_depth=30, min_samples_leaf=4, min_samples_split=2;, score=0.447 total time=   0.0s
[CV 2/5; 34/72] START criterion=gini, max_depth=30, min_samples_leaf=4, min_samples_split=2
[CV 2/5; 34/72] END criterion=gini, max_depth=30, min_samples_leaf=4, min_samples_split=2;, score=0.465 total time=   0.0s
[CV 3/5; 34/72] START criterion=gini, max_depth=30, min_samples_leaf=4, min_samples_split=2
[CV 3/5; 34/72] END criterion=gini, max_depth=30, min_samples_leaf=4, min_samples_split=2;, score=0.418 total time=   0.0s
[CV 4/5; 34/72

[CV 4/5; 41/72] END criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=5;, score=0.435 total time=   0.0s
[CV 5/5; 41/72] START criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=5
[CV 5/5; 41/72] END criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=5;, score=0.500 total time=   0.0s
[CV 1/5; 42/72] START criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10
[CV 1/5; 42/72] END criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10;, score=0.371 total time=   0.0s
[CV 2/5; 42/72] START criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10
[CV 2/5; 42/72] END criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10;, score=0.465 total time=   0.0s
[CV 3/5; 42/72] START criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10
[CV 3/5; 42/72] END criterion=entropy, max_depth=None, min_samples_leaf=2, min_samples_split=10

[CV 4/5; 49/72] END criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=2;, score=0.418 total time=   0.0s
[CV 5/5; 49/72] START criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=2
[CV 5/5; 49/72] END criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=2;, score=0.476 total time=   0.0s
[CV 1/5; 50/72] START criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5
[CV 1/5; 50/72] END criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5;, score=0.394 total time=   0.0s
[CV 2/5; 50/72] START criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5
[CV 2/5; 50/72] END criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5;, score=0.435 total time=   0.0s
[CV 3/5; 50/72] START criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5
[CV 3/5; 50/72] END criterion=entropy, max_depth=10, min_samples_leaf=2, min_samples_split=5;, score=0.447 total tim

[CV 2/5; 57/72] END criterion=entropy, max_depth=20, min_samples_leaf=1, min_samples_split=10;, score=0.476 total time=   0.0s
[CV 3/5; 57/72] START criterion=entropy, max_depth=20, min_samples_leaf=1, min_samples_split=10
[CV 3/5; 57/72] END criterion=entropy, max_depth=20, min_samples_leaf=1, min_samples_split=10;, score=0.435 total time=   0.1s
[CV 4/5; 57/72] START criterion=entropy, max_depth=20, min_samples_leaf=1, min_samples_split=10
[CV 4/5; 57/72] END criterion=entropy, max_depth=20, min_samples_leaf=1, min_samples_split=10;, score=0.394 total time=   0.0s
[CV 5/5; 57/72] START criterion=entropy, max_depth=20, min_samples_leaf=1, min_samples_split=10
[CV 5/5; 57/72] END criterion=entropy, max_depth=20, min_samples_leaf=1, min_samples_split=10;, score=0.453 total time=   0.0s
[CV 1/5; 58/72] START criterion=entropy, max_depth=20, min_samples_leaf=2, min_samples_split=2
[CV 1/5; 58/72] END criterion=entropy, max_depth=20, min_samples_leaf=2, min_samples_split=2;, score=0.406 to

[CV 2/5; 65/72] END criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=5;, score=0.435 total time=   0.0s
[CV 3/5; 65/72] START criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=5
[CV 3/5; 65/72] END criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=5;, score=0.441 total time=   0.0s
[CV 4/5; 65/72] START criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=5
[CV 4/5; 65/72] END criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=5;, score=0.429 total time=   0.0s
[CV 5/5; 65/72] START criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=5
[CV 5/5; 65/72] END criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=5;, score=0.494 total time=   0.0s
[CV 1/5; 66/72] START criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=10
[CV 1/5; 66/72] END criterion=entropy, max_depth=30, min_samples_leaf=1, min_samples_split=10;, score=0.371 total t

[CV 5/5; 72/72] END criterion=entropy, max_depth=30, min_samples_leaf=4, min_samples_split=10;, score=0.482 total time=   0.1s


In [9]:
pd_res = pd.concat([pd.DataFrame(DTCGridSearch.cv_results_["params"]),pd.DataFrame(DTCGridSearch.cv_results_["mean_test_score"], columns=["Accuracy"])],axis=1)
pd_res = pd_res.sort_values('Accuracy', ascending=False)
pd_res.head(5)

Unnamed: 0,criterion,max_depth,min_samples_leaf,min_samples_split,Accuracy
26,gini,20.0,4,10,0.463529
8,gini,,4,10,0.463529
27,gini,30.0,1,2,0.461176
69,entropy,30.0,4,2,0.461176
32,gini,30.0,2,10,0.461176


The accuracy in the table above was training accuracy. Below, I'll calculate test accuracy:

In [10]:
from sklearn.metrics import accuracy_score, f1_score, recall_score

print("Best params: ", DTCGridSearch.best_params_)

bestDTC = DTCGridSearch.best_estimator_

bestDTCPreds = bestDTC.predict(X_test)

print("Precision : ", accuracy_score(y_test, bestDTCPreds))

print("Recall : ", recall_score(y_test, bestDTCPreds, average = 'macro'))

print("F1-Score : ", f1_score(y_test, bestDTCPreds, average = 'macro'))

Best params:  {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 10}
Precision :  0.47333333333333333
Recall :  0.4747979797979798
F1-Score :  0.4629363801992561
