In [2]:
# IMPORTING LIBRARIES
# --------------------------------------------------------------------------------->>>

import pandas as pd
from pandas import DataFrame
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.mixture import GaussianMixture
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

In [3]:
# READING DATAFRAME THEN DROPPING COLOUMNS
# ---------------------------------------------------------------------------->>>

df=pd.read_csv("features_3_sec.csv")
label=df['label']
label_array=label.to_numpy()
DF=df.drop("filename",axis=1)
df0=DF.drop("length",axis=1)
dff=df0.drop("label",axis=1)

In [4]:
# Renaming coloumns
# --------------------------------------------------------------------------->>>

file=dff.copy()
title=[]
for i in range(0,57):
    title.append(i)
file.columns=title

In [5]:
# REMOVING OUTLIERS AND NORMALIZING DATA
# -------------------------------------------------------------------------->>>

# Calculating Quartile using Quantile function
Q1=file.quantile(0.25)
Q3=file.quantile(0.75)
# Finding IQR
IQR=Q3-Q1
# Finding Upper Limit and Lower Limit Outlier
UpperLimitForOultier=Q3+(1.5*IQR)
LowerLimitForOutlier=Q1-(1.5*IQR)
# Filtering outliers from the Data Frame 
UpperOutlier=file>UpperLimitForOultier
LowerOutlier=file<LowerLimitForOutlier
# replacing both upper and lower outliers with nan so that it can be replaced with median values
file[LowerOutlier]=np.nan
file[UpperOutlier]=np.nan
# Filling all the outliers with the median values
ProcessedDf=file.fillna(file.median())
# Finding Max value and Min value of the new Processed data Frame after Filling Outlier with median for normalization
max_value = ProcessedDf.max()
min_value = ProcessedDf.min()
# Normalizing data frame 
result = (ProcessedDf - min_value)/(max_value - min_value)
# ADDING LABELS BACK
Data=result.assign(label=label_array)
# CONVERTING CSV
Data.to_csv("Normalised.csv")

In [6]:
# REMOVING HIGHLY CORRELATED COLOUMNS 
# But Accuracy actualy decreases when we do so.
#---------------------------------------------------------------------------------------->>>

# Creating correlation matrix
#corr_matrix=Data.corr(method='pearson').abs()

# Selecting uppr triangle Matrix
#upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool8))

# Find features with correlation greater than 0.8
#to_drop=[column for column in upper.columns if any(upper[column]>0.8)]
# coloumns that were dropped were ='spectral_bandwidth_mean', 'rolloff_mean', 'zero_crossing_rate_mean', 'mfcc2_mean'

# drop features
#Data.drop(to_drop,axis=1,inplace=True)

In [7]:
# DIVIDING DATASET ON THE BASIS OF CLASS
# ---------------------------------------------->>>

df1=Data[Data['label']=='blues']
print(df1.shape)
df2=Data[Data['label']=='classical']
print(df2.shape)
df3=Data[Data['label']=='country']
print(df3.shape)
df4=Data[Data['label']=='disco']
print(df4.shape)
df5=Data[Data['label']=='hiphop']
print(df5.shape)
df6=Data[Data['label']=='jazz']
print(df6.shape)
df7=Data[Data['label']=='metal']
print(df7.shape)
df8=Data[Data['label']=='pop']
print(df8.shape)
df9=Data[Data['label']=='reggae']
print(df9.shape)
df10=Data[Data['label']=='rock']
print(df10.shape)

(1000, 58)
(998, 58)
(997, 58)
(999, 58)
(998, 58)
(1000, 58)
(1000, 58)
(1000, 58)
(1000, 58)
(998, 58)


In [8]:
# SPLITTING INTO TRAIN AND TEST DATA
# ------------------------------------------------------------------------------------------------->>>

train1,test1 = train_test_split(df1, test_size=0.3)
train2,test2 = train_test_split(df2, test_size=0.3)
train3,test3 = train_test_split(df3, test_size=0.3)
train4,test4 = train_test_split(df4, test_size=0.3)
train5,test5 = train_test_split(df5, test_size=0.3)
train6,test6 = train_test_split(df6, test_size=0.3)
train7,test7 = train_test_split(df7, test_size=0.3)
train8,test8 = train_test_split(df8, test_size=0.3)
train9,test9 = train_test_split(df9, test_size=0.3)
train10,test10 = train_test_split(df10, test_size=0.3)

# CONCATENATING AND CONVERTING INTO CSV
# ---------------------------------------------------------------------------------------------->>>
train = pd.concat([train1,train2,train3,train4,train5,train6,train7,train8,train9,train10])
train.to_csv("Data-train.csv")
test = pd.concat([test1,test2,test3,test4,test5,test6,test7,test8,test9,test10])
test.to_csv("Data-test.csv")

In [9]:
# EXTRACTING DEPENDENT AND INDEPENDENT VARIABLE
# ---------------------------------------------------------->>>

X_train = train.iloc[:, :-1].values
Y_train = train.iloc[:, 57].values
X_test = test.iloc[:, :-1].values
Y_test = test.iloc[:, 57].values

In [10]:
# APPLYING KNN CLASSIFIER for K=(1,3,5)
#-------------------------------------------------------------------------------->>>

# FOR K=1
classifier = KNeighborsClassifier(n_neighbors=1)
classifier.fit(X_train,Y_train)
# PREDICTING THE TEST RESULTS
Y_pred =classifier.predict(X_test)
# CREATING CONFUSION MATRIX
result =confusion_matrix(Y_test,Y_pred)
print("Confusion Matrix for k=1: ")
print(result)
# FINDING ACCURACY SCORE
result2 = accuracy_score(Y_test,Y_pred)
print("Accuracy for k=1: ",result2)
print()

# FOR K=3
classifier1 = KNeighborsClassifier(n_neighbors=3)
classifier1.fit(X_train,Y_train)
# PREDICTING THE TEST RESULTS
Y_pred1 =classifier1.predict(X_test)
# CREATING CONFUSION MATRIX
result3 =confusion_matrix(Y_test,Y_pred1)
print("Confusion Matrix for k=3: ")
print(result3)
# FINDING ACCURACY SCORE
result4 = accuracy_score(Y_test,Y_pred1)
print("Accuracy for k=3: ",result4)
print()

# FOR K=5
classifier2 = KNeighborsClassifier(n_neighbors=5)
classifier2.fit(X_train,Y_train)
# PREDICTING THE TEST RESULTS
Y_pred2 =classifier2.predict(X_test)
# CREATING CONFUSION MATRIX
result5 =confusion_matrix(Y_test,Y_pred2)
print("Confusion Matrix for k=1: ")
print(result5)
# FINDING ACCURACY SCORE
result6 = accuracy_score(Y_test,Y_pred2)
print("Accuracy for k=5: ",result6)

Confusion Matrix for k=1: 
[[251   2  10   8   0   3   1   0  17   8]
 [  0 275   6   2   0  10   1   1   3   2]
 [ 13   2 216  18   4  13   2   5  10  17]
 [  6   0   7 244  10   2   5   3   7  16]
 [  4   0   5  11 243   0   5   8  20   4]
 [ 10  14  15   3   0 246   0   3   3   6]
 [  3   0   0  12   1   1 276   0   1   6]
 [  0   2   7  23   5   2   0 246  10   5]
 [  2   1   6   7  14   1   0   9 255   5]
 [ 11   1  10  28   3   7   7   8   8 217]]
Accuracy for k=1:  0.823

Confusion Matrix for k=3: 
[[259   0  13   6   1   3   1   0  12   5]
 [  4 279   6   1   0   6   1   0   1   2]
 [ 18   2 233  15   2   9   1   3  10   7]
 [ 11   0   6 246   7   1   7   2   6  14]
 [  5   0  10  21 234   0   1  10  16   3]
 [ 19  23  13   6   1 230   0   1   4   3]
 [  5   1   0  13   4   1 265   0   1  10]
 [  2   2  15  24  11   2   0 237   6   1]
 [  4   1   6  16  13   1   0   8 246   5]
 [ 18   1  23  35   5   7   5   3   7 196]]
Accuracy for k=3:  0.8083333333333333

Confusion Matrix fo

In [11]:
# APPLYING BAYES CLASSIFIER ON UNIMODAL GAUSSIAN DENSITY
#--------------------------------------------------------------------------------------->>>

nb = GaussianNB()
nb.fit(X_train, Y_train)
score=nb.score(X_test, Y_test)
print("Naive Bayes score:",score)

Naive Bayes score: 0.521


In [12]:
# APPLYING BAYES CLASSIFIER USING MULTIMODAL GAUSSIAN MIXTURE MODEL
# ------------------------------------------------------------------------------->>>\

GMM=GaussianMixture(n_components=10)
GMM.fit(X_train)
label=GMM.predict(X_test)

# Changing labels to integer value to fit to label

test=test.replace('blues',0)
test=test.replace('classical',1)
test=test.replace('country',2)
test=test.replace('disco',3)
test=test.replace('hiphop',4)
test=test.replace('jazz',5)
test=test.replace('metal',6)
test=test.replace('pop',7)
test=test.replace('reggae',8)
test=test.replace('rock',9)

# Defining new Y_Test
Y_testnew = test.iloc[:, 57].values

# Calculating score
acc= accuracy_score(Y_testnew,label)

print(Y_testnew)
print(label)
print(confusion_matrix(Y_testnew,label))
print('Accuracy score is: ',acc)

[0 0 0 ... 9 9 9]
[5 1 6 ... 1 6 6]
[[  2  44   0  25   0 125  64   1  11  28]
 [213   0   2   2   5   7   0   4  67   0]
 [  1   4  45   7  61 101  63   5   7   6]
 [  0  43  90  34  38   2  85   6   0   2]
 [  0   4  26  54  37   9  73  86   1  10]
 [ 61  14  29   6  35  85   4  12  51   3]
 [  0 146   2  19   3   1  29   0   0 100]
 [  0   0  97   6  36   8   0 152   1   0]
 [  0   1  11  37  83  50  71  46   0   1]
 [  4  67  44  20  51  27  70   1   6  10]]
Accuracy score is:  0.13133333333333333


In [13]:
# USING DECISION TREE TO PREDICT GENRE
# ---------------------------------------------------------------->>>

classifier = DecisionTreeClassifier()
classifier = classifier.fit(X_train,Y_train)
genre = classifier.predict(X_test)
dec_res=accuracy_score(Y_test,genre)
print('Accuracy Score:',dec_res)

Accuracy Score: 0.6183333333333333


In [14]:
# USING RANDOM FOREST TO PREDICT GENRE
#--------------------------------------------------------------------->>>

clf=RandomForestClassifier(n_estimators=100)
clf.fit(X_train,Y_train)
rf_pred=clf.predict(X_test)
rf_score=accuracy_score(Y_test,rf_pred)
print('Accuracy Score:',rf_score)

Accuracy Score: 0.841


In [15]:
# Using XGBoost
# ------------------------------------------------------------->>>

# Changing labels to integer value to fit to label

train=train.replace('blues',0)
train=train.replace('classical',1)
train=train.replace('country',2)
train=train.replace('disco',3)
train=train.replace('hiphop',4)
train=train.replace('jazz',5)
train=train.replace('metal',6)
train=train.replace('pop',7)
train=train.replace('reggae',8)
train=train.replace('rock',9)

#Defining new trains
X_trainnew = train.iloc[:, :-1].values
Y_trainnew = train.iloc[:, 57].values

# Applying XGBoost
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.fit(X_trainnew,Y_trainnew)
preds = xgb_classifier.predict(X_test)
xgb_score=accuracy_score(Y_testnew,preds)
print('Accuracy Score : ',xgb_score)

Accuracy Score :  0.865


In [16]:
# COMPARING ACCURACY OF ALL CLASSIFIERS
#-------------------------------------------------------------------->>>

print('Accuracy of KNN is for k=1 : ',result2)
print('Accuracy of KNN is for k=3 : ',result4)
print('Accuracy of KNN is for k=5 : ',result6)
print('Accuracy of Unimodal Bayes Classifier is : ',score)
print('Accuracy of GMM Bayes classifier is : ',acc)
print('Accuracy Score of Decision Tree is : ',dec_res)
print('Accuracy Score of Random Forest is : ',rf_score)
print('Accuracy Score of XGBoost is : ',xgb_score)

Accuracy of KNN is for k=1 :  0.823
Accuracy of KNN is for k=3 :  0.8083333333333333
Accuracy of KNN is for k=5 :  0.806
Accuracy of Unimodal Bayes Classifier is :  0.521
Accuracy of GMM Bayes classifier is :  0.13133333333333333
Accuracy Score of Decision Tree is :  0.6183333333333333
Accuracy Score of Random Forest is :  0.841
Accuracy Score of XGBoost is :  0.865
