In [644]:
# feature extraction and data preprocessing
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL import Image
import pathlib
import csv
import numpy as np
import IPython.display as ipd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Keras
import keras

import warnings
warnings.filterwarnings('ignore')

# Acquire and Prepare Dataset
- Read in audio features from the GTZAN dataset. (download from: https://www.kaggle.com/andradaolteanu/gtzan-dataset-music-genre-classification)

In [645]:
data = pd.read_csv('DATA/GTZAN/features_30_sec.csv')
data.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [646]:
data.shape

(1000, 60)

In [647]:
# Dropping unneccesary columns
genre_list = data.iloc[:, -1]
data = data.drop(['filename', 'length',  'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var', 'tempo'],axis=1)
data = data.drop(['label'],axis=1)
data.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,3805.839606,901505.4,...,0.75274,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035
1,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,2977893.0,...,0.927998,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282
2,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,784034.5,...,2.45169,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025
3,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,1493194.0,...,0.780874,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339
4,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,1572978.0,...,-4.520576,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516


### Get the genre list. We'll use it later to calculate the error

In [648]:
genres_ordered = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
encoder = LabelEncoder()
y = encoder.fit_transform(genre_list)

In [649]:
scaler = StandardScaler()
X_arr = np.array(data.iloc[:, :-1], dtype = float)
X = scaler.fit_transform(X_arr)
X

array([[-0.35013678,  0.31258717, -0.01068969, ...,  0.00672291,
        -0.30059734,  0.60406407],
       [-0.46248155,  1.11757233, -0.53285232, ...,  0.54480563,
        -0.40708699,  0.42412706],
       [-0.18422456, -0.13770124,  0.67997762, ..., -0.29593404,
        -0.52729705, -0.29618888],
       ...,
       [ 0.65463736, -1.43198917, -0.75062494, ..., -2.73284378,
        -0.63865065, -0.26361549],
       [-0.19833855,  0.66814351, -0.71697762, ..., -0.72271696,
        -0.5114848 , -0.65064889],
       [-0.2483391 , -0.05894495, -1.1648952 , ...,  0.08070645,
         0.16033426,  0.5868411 ]])

# Setup Training vs Testing Data
- Dividing data into training and Testing set

In [650]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [651]:
len(y_train)

800

In [652]:
len(y_test)

200

In [653]:
X_train[10]

array([-1.32082059,  0.06934227,  0.69142866,  0.1655156 , -1.08547772,
       -0.50310525, -1.04854825, -0.18088302, -1.01106727, -0.45489234,
       -1.00215904, -0.67066568,  0.04905248, -0.60549178,  0.93416162,
       -0.26498787, -0.52131941, -0.34704224,  0.78809948, -0.08375989,
       -0.08320015,  0.04823067,  0.19752964,  0.35925542, -0.32120702,
       -0.21043034, -0.02944131,  0.05769589, -1.58612323, -0.22767106,
       -0.66025325,  1.50561237, -0.76537572,  0.20030494, -0.1046389 ,
       -0.02585947, -0.25975583,  0.71470206, -1.02360328,  1.42099952,
        0.19971847,  0.69624315,  0.18208864,  0.28137764, -0.04801231,
        0.47799011,  0.22717418,  0.17433341,  1.26945879,  0.10916874,
       -0.65711302])

# Setup Network Arhitecture
- We setup a feed forward deep learning neural network.
- It has 5 densely connected layers. The 1st 4 layers have relu activation, and the final layer is softmax.
- We have one layer for each genre

In [654]:
from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)))

model.add(layers.Dense(256, activation='relu'))

model.add(layers.Dense(128, activation='relu'))

model.add(layers.Dense(64, activation='relu'))

model.add(layers.Dense(10, activation='softmax'))  # Last layer, finds the most probable genre

# Define Network Loss and Optimization

In [655]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the network
- We set a batch size of 256 and train for 80 epochs

In [656]:
history = model.fit(X_train,
                    y_train,
                    epochs=80,
                    batch_size=256)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


# Evaluate the model with our testing data

In [657]:
test_loss, test_acc = model.evaluate(X_test,y_test)



In [658]:
print('test accuracy: ',test_acc)

test accuracy:  0.7649999856948853


# Model Inference
- We can now use our model to make predictions for other songs. First, we need to extract audio features from those songs, and put them in a .csv

In [659]:
def create_csv(track_name, csv_name):
    input_freq_data, sr = librosa.load('./audio/'+track_name, mono=True, duration=30)
    # to oshow songs


    chroma_stft = librosa.feature.chroma_stft(y=input_freq_data, sr=sr)
    rmse = librosa.feature.rms(y=input_freq_data)
    spec_cent = librosa.feature.spectral_centroid(y=input_freq_data, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=input_freq_data, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=input_freq_data, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(input_freq_data)
    mfcc = librosa.feature.mfcc(y=input_freq_data, sr=sr)


    to_append = f'{track_name[:-4]} {np.mean(chroma_stft)} {np.var(chroma_stft)} {np.mean(rmse)} {np.var(rmse)} {np.mean(spec_cent)} {np.var(spec_cent)} {np.mean(spec_bw)} {np.var(spec_bw)} {np.mean(rolloff)} {np.var(rolloff)} {np.mean(zcr)} {np.var(zcr)}'
    for e in mfcc:
        to_append += f' {np.mean(e)}'
        to_append += f' {np.var(e)}'

    file = open(csv_name, 'a', newline='')
    with file:
        writer = csv.writer(file)
        writer.writerow(to_append.split())

def create_header(csv_name):
    header = 'filename chroma_stft_mean chroma_stft_var rms_mean rms_var spectral_centroid_mean spectral_centroid_var spectral_bandwidth_mean spectral_bandwidth_var rolloff_mean rolloff_var zero_crossing_rate_mean zero_crossing_rate_var'
    for i in range(1, 21):
        header += f' mfcc{i}_mean'
        header += f' mfcc{i}_var'
    header += ' label'
    header = header.split()

    file = open(csv_name, 'w', newline='')
    with file:
        writer = csv.writer(file)
        writer.writerow(header)


In [660]:
csv_name = 'data.csv'
tracks = os.listdir('./audio')
# track_name = tracks[2]
print(tracks)
# Create CSV
create_header(csv_name)
for track in tracks:
    create_csv(track, csv_name)

['stayin_alive_the_begees.wav', 'feels_like_home.wav', 'cliffs_of_dover_eric_johnson.wav', 'gangplank_galleon.wav', 'accoustic_guitar.wav', 'moonlight_sonata.wav', '1812_overture.wav', 'subnautica_below_zero.wav', 'bluegrass_riff.wav', 'in_a_sentimental_mood.wav']


### Now let's read our input data.

In [661]:
# now let's
in_data = None
for track in tracks:
    in_data = pd.read_csv(csv_name)
    in_data = in_data.drop(['filename', 'label'],axis=1)

In [662]:
print("Here's what our input data looks like")
in_data.head()

Here's what our input data looks like


Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.42754,0.080148,0.104428,0.001269,2581.373094,574020.384324,2701.174231,193066.189264,5718.865419,2858199.0,...,3.403892,44.253399,-2.210808,49.015297,-0.424155,40.163124,-3.985377,58.086464,0.577901,46.912777
1,0.208752,0.086042,0.094637,0.002692,976.691592,261171.317088,1263.216057,211062.187858,1534.515721,857256.1,...,-2.265531,104.791649,1.500498,138.381638,1.371981,138.501846,-0.68032,108.932625,0.270372,117.022636
2,0.215425,0.082682,0.107642,0.00059,1918.587656,243321.709205,1852.27543,156375.306821,3845.101749,1048404.0,...,-0.91106,70.341888,-14.665382,75.236107,-1.14388,79.274567,4.029745,77.978836,-1.453487,79.916092
3,0.338958,0.087362,0.116228,0.002324,1789.672791,344021.655175,2041.964027,130313.19843,3665.066528,1588140.0,...,3.19045,119.261002,-1.293239,111.640648,1.197762,71.990517,-3.442649,110.102531,5.27369,118.395699
4,0.308963,0.086878,0.033036,0.000128,2169.362507,627848.593395,2412.743061,178148.157028,4576.697325,3301635.0,...,-4.022608,36.650227,-8.984497,47.335041,0.512731,60.669468,-7.834329,60.399014,4.032901,65.190453


In [663]:
in_arr = np.array(in_data.iloc[:, :-1], dtype = float)
scaler = StandardScaler()
arr = scaler.fit_transform(in_arr)  # THIS NEEDS TO HAVE MORE THAN 1 SONG----- WHAT

# arr

In [664]:
predictions = model.predict(arr)



In [665]:
print("Here's the prediction strengths of each genre for song 1")
predictions[0]

Here's the prediction strengths of each genre for song 1


array([1.7471024e-05, 1.9341944e-05, 3.2186499e-04, 9.8478252e-01,
       8.8735595e-03, 2.7978160e-05, 9.7119441e-04, 6.4714928e-05,
       2.2298724e-03, 2.6915418e-03], dtype=float32)

In [666]:
np.sum(predictions[0]) # checks that add to ~1

1.0000001

## The Result!

In [667]:
for i in range(len(tracks)):
    print("The predicted genre for " + str(tracks[i]) + " is " +str(genres_ordered[ np.argmax(predictions[i]) ])+ "!\n")

The predicted genre for stayin_alive_the_begees.wav is disco!

The predicted genre for feels_like_home.wav is blues!

The predicted genre for cliffs_of_dover_eric_johnson.wav is rock!

The predicted genre for gangplank_galleon.wav is jazz!

The predicted genre for accoustic_guitar.wav is disco!

The predicted genre for moonlight_sonata.wav is jazz!

The predicted genre for 1812_overture.wav is disco!

The predicted genre for subnautica_below_zero.wav is hiphop!

The predicted genre for bluegrass_riff.wav is pop!

The predicted genre for in_a_sentimental_mood.wav is blues!



In [668]:
print("Here's the song whose genre wasn't in the NN's catalog")
ipd.Audio('./audio/subnautica_below_zero.wav')


Here's the song whose genre wasn't in the NN's catalog
