In [1]:
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv('C:\\Users\\Saad\\Desktop\\Project\\MGC\\Data\\features_3_sec.csv')
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [3]:
df.shape

(9990, 60)

In [4]:
df['label'].value_counts()

blues        1000
jazz         1000
metal        1000
pop          1000
reggae       1000
disco         999
classical     998
hiphop        998
rock          998
country       997
Name: label, dtype: int64

In [5]:
df['label'].dtype

dtype('O')

In [6]:
label_index = dict()
index_label = dict()

for i, x in enumerate(df['label'].unique()):
    label_index[x] = i
    index_label[i] = x
    
print(label_index)
print(index_label)

{'blues': 0, 'classical': 1, 'country': 2, 'disco': 3, 'hiphop': 4, 'jazz': 5, 'metal': 6, 'pop': 7, 'reggae': 8, 'rock': 9}
{0: 'blues', 1: 'classical', 2: 'country', 3: 'disco', 4: 'hiphop', 5: 'jazz', 6: 'metal', 7: 'pop', 8: 'reggae', 9: 'rock'}


In [7]:
df['label'] = df['label'].replace(label_index)
df['label']

0       0
1       0
2       0
3       0
4       0
       ..
9985    9
9986    9
9987    9
9988    9
9989    9
Name: label, Length: 9990, dtype: int64

In [8]:
X = df.drop(['label','filename'], axis=1)
y = df['label'] 

In [9]:
X.shape, y.shape

((9990, 58), (9990,))

In [10]:
cols           = X.columns
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled      = min_max_scaler.fit_transform(X)

X = pd.DataFrame(np_scaled, columns = cols)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6993, 58), (2997, 58), (6993,), (2997,))

In [12]:
def models(model, X_train, y_train, X_test, title = "Default"):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print('Accuracy', title, ':', round(accuracy_score(y_test, preds), 3), '\n')

In [13]:
tree = DecisionTreeClassifier(max_features='auto')
models(tree, X_train, y_train, X_test,"Decission trees")

lg = LogisticRegression(random_state=0, max_iter=100)
models(lg, X_train, y_train, X_test,"Logistic Regression")

knn = KNeighborsClassifier(n_neighbors=19)
models(knn, X_train, y_train, X_test,"KNN")

nb = GaussianNB()
models(nb, X_train, y_train, X_test,"Naive Bayes")

xgb = XGBClassifier(n_estimators=700)
models(xgb, X_train, y_train, X_test, title = "xgb")

Accuracy Decission trees : 0.6 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy Logistic Regression : 0.693 

Accuracy KNN : 0.794 

Accuracy Naive Bayes : 0.531 





Accuracy xgb : 0.9 



In [14]:
fi    = xgb.feature_importances_.tolist()
names = xgb.get_booster().feature_names

In [15]:
dict_ = {"Feature Name":names, "Importance":fi}
df    = pd.DataFrame(dict_)
df.sort_values("Importance", ascending=False, inplace=True)
df

Unnamed: 0,Feature Name,Importance
16,perceptr_var,0.084161
7,spectral_bandwidth_mean,0.064355
24,mfcc4_mean,0.043932
1,chroma_stft_mean,0.036792
2,chroma_stft_var,0.035507
10,rolloff_var,0.033952
4,rms_var,0.032132
19,mfcc1_var,0.03023
18,mfcc1_mean,0.025395
15,perceptr_mean,0.024153
