# Music Audio Genre Classification using Deep Learning

### Team Members
**Aditya Soni  
Aritra Chowdhury  
Chandler Wann  
Rhiannon Pytlak  
Yen Wen Ting**

## Dataset

The GTZAN dataset is used and stored in a folder named Audio Files. The dataset can be downloaded [here](https://www.kaggle.com/andradaolteanu/gtzan-dataset-music-genre-classification).

## Methodology

Classification using extracted Features from Audio Waveforms ([Reference Link](https://towardsdatascience.com/music-genre-classification-with-python-c714d032f0d8))

In [1]:
import sys
import os
import pickle
import random 
import operator
import math
import numpy as np
import pandas as pd
from collections import defaultdict
import sunau
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import imageio
import cv2
import seaborn as sns
import pickle

from python_speech_features import mfcc
import scipy.io.wavfile as wav
from tempfile import TemporaryFile
from imutils import paths
from pathlib import Path

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.datasets import make_multilabel_classification
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics
from sklearn import svm
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

import tensorflow as tf
from tensorflow import keras
from tensorflow_docs.vis import embed
from keras.models import Sequential
from keras.layers import Dense, Conv2D , MaxPool2D , Flatten , Dropout 
from keras.preprocessing.image import ImageDataGenerator
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings("ignore")

The following features are extracted for classification.

- Zero Crossing Rate
- Spectral Centroid
- Spectral Bandwidth
- Spectral Rolloff
- MFCC
- Chroma Frequencies
- RMSE

### Feature Extraction

In [2]:
def zero_crossing_rate(rate):
    zero_crossings = librosa.zero_crossings(rate, pad=False)
    return (sum(zero_crossings) / len(rate))

def spectral_centroid(rate, sig):
    spectral_centroids = librosa.feature.spectral_centroid(rate, sr=sig)[0]
    return np.mean(spectral_centroids)

def spectral_bandwidth(rate, sig):
    spectral_bandwidths = librosa.feature.spectral_bandwidth(rate, sr = sig)
    return np.mean(spectral_bandwidths)

def spectral_rolloff(rate, sig):
    spectral_rolloffs = librosa.feature.spectral_rolloff(rate, sr=sig)[0]
    return np.mean(spectral_rolloffs)

def mfcc(rate, sig):
    mfccs = librosa.feature.mfcc(rate, sr=sig)
    return list(mfccs.mean(axis=1))

def chroma_frequencies(rate, sig):
    hop_length = 512
    chromagram = librosa.feature.chroma_stft(rate, sr=sig, hop_length=hop_length)
    return np.mean(chromagram)

def rmse(rate):
    return np.sqrt(np.mean(rate**2))

In [3]:
labels = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']

if Path('Audio Features.csv').exists():
    df_features = pd.read_csv('Audio Features.csv').iloc[:,1:]
else:
    current_wd = os.getcwd()
    df_features = pd.DataFrame(columns = ['file_name', 'zero_crossing_rate', 'spectral_centroid', 'spectral_bandwidth', 
                                          'spectral_rolloff', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 
                                          'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 
                                          'mfcc11', 'mfcc12', 'mfcc13', 'mfcc14', 'mfcc15', 'mfcc16', 
                                          'mfcc17', 'mfcc18', 'mfcc19', 'mfcc20', 'chroma_frequency', 
                                          'rmse', 'label'])

    for label in labels:
        path = os.path.join(os.getcwd(), 'Audio Files', label)
        os.chdir(path)
        for file in os.listdir(path):
            row = []
            x , sr = librosa.load(file)
            row.append(file)
            row.append(zero_crossing_rate(x))
            row.append(spectral_centroid(x, sr))
            row.append(spectral_bandwidth(x, sr))
            row.append(spectral_rolloff(x, sr))
            row.extend(mfcc(x, sr))
            row.append(chroma_frequencies(x, sr))
            row.append(rmse(x))
            row.append(label)
            df_features.loc[len(df_features)] = row

    os.chdir(current_wd)
    df_features.to_csv('Audio Features.csv')

df_features.head()

Unnamed: 0,file_name,zero_crossing_rate,spectral_centroid,spectral_bandwidth,spectral_rolloff,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,...,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,chroma_frequency,rmse,label
0,blues.00000.wav,0.083154,1784.16585,2002.44906,3805.839606,-113.570648,121.571793,-19.168142,42.366421,-6.364664,...,5.747995,-5.162882,0.75274,-1.690215,-0.408979,-2.303523,1.22129,0.350088,0.140688,blues
1,blues.00001.wav,0.056119,1530.176679,2039.036516,3550.522098,-207.501694,123.991264,8.955127,35.877647,2.90732,...,4.22014,-6.012148,0.927997,-0.731125,0.295073,-0.283518,0.531216,0.340914,0.107619,blues
2,blues.00002.wav,0.076403,1552.811865,1747.702312,3042.260232,-90.722595,140.446304,-29.093889,31.684334,-13.984505,...,-1.077,-9.229274,2.451689,-7.729093,-1.816407,-3.43972,-2.231259,0.363637,0.183227,blues
3,blues.00003.wav,0.033359,1070.106615,1596.412872,2184.745799,-199.544205,150.090897,5.662678,26.859079,1.771399,...,-1.079305,-2.870789,0.780873,-3.319597,0.636965,-0.619121,-3.407449,0.404785,0.162029,blues
4,blues.00004.wav,0.10158,1835.004266,1748.172116,3579.757627,-160.337708,126.219635,-35.587811,22.148071,-32.478603,...,-7.552725,-9.164666,-4.520576,-5.454034,-0.916874,-4.404827,-11.703234,0.308526,0.103356,blues


In [4]:
X = df_features.iloc[:,1:-1]
y = df_features.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.01, random_state = 11, stratify = y)

### Classification

The following models are used. 
- Neural Networks
- Random Forest
- Support Vector Machines
- Logistic Regression

#### Neural Networks

In [5]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y_train)
encoded_y = encoder.transform(y_train)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y_train = np_utils.to_categorical(encoded_y)

In [6]:
# define baseline model
def baseline_model(inputs, outputs):
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=inputs, activation='relu'))
    model.add(Dense(outputs, activation='softmax'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [7]:
# test results

n_inputs, n_outputs = X_train.shape[1], dummy_y_train.shape[1]
# define model
model = baseline_model(n_inputs, n_outputs)
# fit model
model.fit(X_train.to_numpy(), dummy_y_train, verbose=0, epochs=100)
# predict
yhat = model.predict(X_test.to_numpy())

In [8]:
y_pred = []
predictions1 = np.argsort(yhat, axis = 1)
x=[]
for i in range(len(predictions1)):
    x.append(predictions1[i][[-1,-2]])
    if (y_test.iloc[i] == labels[predictions1[i][-1]]) or (y_test.iloc[i] == labels[predictions1[i][-2]]):
        y_pred.append(y_test.iloc[i])
    else:
        y_pred.append(labels[predictions1[i][-1]])
print(f'Top 2 Test Accuracy: {round(metrics.accuracy_score(y_pred, y_test)*100,2)}%')

Top 2 Test Accuracy: 40.0%


#### Random Forest

In [9]:
X = df_features.iloc[:,1:-1]
y = df_features.iloc[:,-1]
factor = pd.factorize(y)
y = factor[0]
definitions = factor[1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)

In [10]:
predictions1 = np.argsort(y_pred_proba, axis = 1)
x=[]
for i in range(len(predictions1)):
    x.append(predictions1[i][[-1,-2]])
    if (y_test[i] == predictions1[i][-1]) or (y_test[i] == predictions1[i][-2]):
        y_pred[i] = y_test[i]
    else:
        y_pred[i] = predictions1[i][-1]
print(f'Top 2 Test Accuracy: {round(metrics.accuracy_score(y_pred, y_test)*100,2)}%')

Top 2 Test Accuracy: 70.0%


#### Support Vector Machines

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 21)
rbf = svm.SVC(kernel='rbf', gamma=0.5, C=0.1, probability=True).fit(X_train, y_train)
poly = svm.SVC(kernel='poly', degree=3, C=1, probability=True).fit(X_train, y_train)
poly_pred = poly.predict(X_test)
poly_pred_proba = poly.predict_proba(X_test)
rbf_pred = rbf.predict(X_test)
rbf_pred_proba = rbf.predict_proba(X_test)

In [12]:
predictions1 = np.argsort(poly_pred_proba, axis = 1)
x=[]
for i in range(len(predictions1)):
    x.append(predictions1[i][[-1,-2]])
    if (y_test[i] == predictions1[i][-1]) or (y_test[i] == predictions1[i][-2]):
        poly_pred[i] = y_test[i]
    else:
        poly_pred[i] = predictions1[i][-1]
print(f'Top 2 Test Accuracy (Polynomial Kernel): {round(metrics.accuracy_score(poly_pred, y_test)*100,2)}%')

predictions1 = np.argsort(rbf_pred_proba, axis = 1)
x=[]
for i in range(len(predictions1)):
    x.append(predictions1[i][[-1,-2]])
    if (y_test[i] == predictions1[i][-1]) or (y_test[i] == predictions1[i][-2]):
        rbf_pred[i] = y_test[i]
    else:
        rbf_pred[i] = predictions1[i][-1]
print(f'Top 2 Test Accuracy (RBF Kernel): {round(metrics.accuracy_score(rbf_pred, y_test)*100,2)}%')

Top 2 Test Accuracy (Polynomial Kernel): 45.6%
Top 2 Test Accuracy (RBF Kernel): 18.4%


#### Logistic Regression

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.01, random_state = 11, stratify = y)
# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
# define the model evaluation procedure
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate the model and collect the scores
n_scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)

In [14]:
predictions1 = np.argsort(y_pred_proba, axis = 1)
x=[]
for i in range(len(predictions1)):
    x.append(predictions1[i][[-1,-2]])
    if (y_test[i] == predictions1[i][-1]) or (y_test[i] == predictions1[i][-2]):
        y_pred[i] = y_test[i]
    else:
        y_pred[i] = predictions1[i][-1]
print(f'Top 2 Test Accuracy: {round(metrics.accuracy_score(y_pred, y_test)*100,2)}%')

Top 2 Test Accuracy: 50.0%
