In [1]:
# feature extraction and data preprocessing
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL import Image
import pathlib
import csv

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Keras
import keras

import warnings
warnings.filterwarnings('ignore')

# Acquire and Prepare Dataset
- Read in audio features from the GTZAN dataset. (download from: https://www.kaggle.com/andradaolteanu/gtzan-dataset-music-genre-classification)

In [2]:
data = pd.read_csv('DATA/GTZAN/features_30_sec.csv')
data.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [3]:
data.shape

(1000, 60)

In [4]:
# Dropping unneccesary columns
data = data.drop(['filename', 'length'],axis=1)
data.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,3805.839606,901505.4,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,2977893.0,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,784034.5,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,1493194.0,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,1572978.0,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


### Get the genre list. We'll use it later to calculate the error

In [5]:
genre_list = data.iloc[:, -1]
encoder = LabelEncoder()
y = encoder.fit_transform(genre_list)

In [6]:
scaler = StandardScaler()
X = scaler.fit_transform(np.array(data.iloc[:, :-1], dtype = float))

# Setup Training vs Testing Data
- Dividing data into training and Testing set

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [8]:
len(y_train)

800

In [9]:
len(y_test)

200

In [10]:
X_train[10]

array([ 0.84943444, -0.58636491,  1.15751582, -0.04518112,  0.7280065 ,
       -0.09287929,  0.6212282 , -0.50238781,  0.57202371, -0.11900131,
        0.98124157,  0.27485989,  0.15615163,  0.74966623, -0.84625725,
        0.78207147, -0.41909135,  1.19464467, -0.79491179, -0.8256214 ,
       -0.29911508, -0.1305751 , -0.59425952, -0.47941743, -0.0898657 ,
        0.60861671, -0.49987315, -1.05134801, -0.26108764,  1.3636764 ,
       -0.14655296,  0.19910886, -0.33872346,  0.35475451, -0.5312916 ,
        0.72807479, -0.75364039,  1.15759393, -0.92530212, -0.8043186 ,
       -0.35553455,  0.86352379, -0.95369533,  0.13710404, -0.56750009,
        0.01293479, -0.72867285, -0.44959361, -0.54311949,  0.65416105,
        0.13296974, -1.02679642,  0.42153187,  0.364866  ,  0.00670302,
        0.28807068,  0.45143089])

# Setup Network Arhitecture
- We setup a feed forward deep learning neural network.
- It has 5 densely connected layers. The 1st 4 layers have relu activation, and the final layer is softmax.
- We have one layer for each genre

In [11]:
from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)))

model.add(layers.Dense(256, activation='relu'))

model.add(layers.Dense(128, activation='relu'))

model.add(layers.Dense(64, activation='relu'))

model.add(layers.Dense(10, activation='softmax'))  # Last layer, finds the most probable genre

# Define Network Loss and Optimization

In [12]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the network
- We set a batch size of 256 and train for 30 epochs

In [13]:
history = model.fit(X_train,
                    y_train,
                    epochs=30,
                    batch_size=256)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Evaluate the model with our testing data

In [14]:
test_loss, test_acc = model.evaluate(X_test,y_test)



In [15]:
print('test_acc: ',test_acc)

test_acc:  0.7749999761581421


# Model Inference (TODO- calculate spectral features, and put into data structure that gets put into model)
- We can now use our model to make predictions. In this case, we use the test data to do so

In [16]:
predictions = model.predict(X_test)

In [17]:
predictions[0]

array([8.7058806e-06, 2.2571969e-05, 9.9588794e-01, 7.4310374e-04,
       2.7045140e-05, 2.2565426e-05, 1.6101396e-05, 1.8692220e-03,
       9.3007326e-04, 4.7268049e-04], dtype=float32)

In [18]:
np.sum(predictions[0])

1.0

In [19]:
np.argmax(predictions[0])

2