In [200]:
# feature extraction and data preprocessing
import librosa
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import os
from PIL import Image
import pathlib
import csv
import numpy as np
import IPython.display as ipd

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

#Keras
import keras

import warnings
warnings.filterwarnings('ignore')

# Acquire and Prepare Dataset
- Read in audio features from the GTZAN dataset. (download from: https://www.kaggle.com/andradaolteanu/gtzan-dataset-music-genre-classification)

In [201]:
data = pd.read_csv('DATA/GTZAN/features_30_sec.csv')
data.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [202]:
data.shape

(1000, 60)

In [203]:
# Dropping unneccesary columns
genre_list = data.iloc[:, -1]
data = data.drop(['filename', 'length',  'harmony_mean', 'harmony_var', 'perceptr_mean', 'perceptr_var', 'tempo'],axis=1)
data = data.drop(['label'],axis=1)
data.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,3805.839606,901505.4,...,0.75274,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035
1,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,2977893.0,...,0.927998,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282
2,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,784034.5,...,2.45169,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025
3,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,1493194.0,...,0.780874,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339
4,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,1572978.0,...,-4.520576,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516


### Get the genre list. We'll use it later to calculate the error

In [204]:
genres_ordered = ['blues', 'classical', 'country', 'disco', 'hiphop', 'jazz', 'metal', 'pop', 'reggae', 'rock']
encoder = LabelEncoder()
y = encoder.fit_transform(genre_list)

In [205]:
scaler = StandardScaler()
X_arr = np.array(data.iloc[:, :-1], dtype = float)
X = scaler.fit_transform(X_arr)
X

array([[-0.35013678,  0.31258717, -0.01068969, ...,  0.00672291,
        -0.30059734,  0.60406407],
       [-0.46248155,  1.11757233, -0.53285232, ...,  0.54480563,
        -0.40708699,  0.42412706],
       [-0.18422456, -0.13770124,  0.67997762, ..., -0.29593404,
        -0.52729705, -0.29618888],
       ...,
       [ 0.65463736, -1.43198917, -0.75062494, ..., -2.73284378,
        -0.63865065, -0.26361549],
       [-0.19833855,  0.66814351, -0.71697762, ..., -0.72271696,
        -0.5114848 , -0.65064889],
       [-0.2483391 , -0.05894495, -1.1648952 , ...,  0.08070645,
         0.16033426,  0.5868411 ]])

# Setup Training vs Testing Data
- Dividing data into training and Testing set

In [206]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [207]:
len(y_train)

800

In [208]:
len(y_test)

200

In [209]:
X_train[10]

array([ 0.64468403,  0.31965862,  1.58319774,  3.8178156 ,  1.8286    ,
        2.13140695,  1.72196131,  0.40201976,  1.70515201,  1.36852215,
        1.63504985,  2.70011217,  1.01467115,  1.54023377, -1.42633869,
        0.92518843,  1.19751093, -0.08218568, -0.9905032 ,  0.99567326,
        0.65557728,  0.21145343, -1.002249  ,  1.90743523,  1.70634453,
        0.69594474, -0.85196625,  0.85545223,  1.80417942,  0.13755236,
       -0.83176804,  0.41597102,  0.28496323, -0.12699305, -0.64772396,
       -0.17268251,  0.09011084,  0.06928128, -1.2456496 ,  0.12985688,
        0.27844974, -0.14929141, -0.88292683, -0.30101456,  1.00570128,
        0.2381269 ,  0.09783079, -0.09698539,  1.23642637,  0.28056403,
        0.78828072])

# Setup Network Arhitecture
- We setup a feed forward deep learning neural network.
- It has 5 densely connected layers. The 1st 4 layers have relu activation, and the final layer is softmax.
- We have one layer for each genre

In [210]:
from keras import models
from keras import layers

model = models.Sequential()
model.add(layers.Dense(512, activation='relu', input_shape=(X_train.shape[1],)))

model.add(layers.Dense(256, activation='relu'))

model.add(layers.Dense(128, activation='relu'))

model.add(layers.Dense(64, activation='relu'))

model.add(layers.Dense(10, activation='softmax'))  # Last layer, finds the most probable genre

# Define Network Loss and Optimization

In [211]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the network
- We set a batch size of 256 and train for 80 epochs

In [212]:
history = model.fit(X_train,
                    y_train,
                    epochs=80,
                    batch_size=256)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


# Evaluate the model with our testing data

In [213]:
test_loss, test_acc = model.evaluate(X_test,y_test)



In [214]:
print('test accuracy: ',test_acc)

test accuracy:  0.7049999833106995


# Model Inference (TODO- calculate spectral features, and put into data structure that gets put into model. add your own audio tracks)
- We can now use our model to make predictions. In this case, we use the test data to do so
- need to extract the audio features that the model uses from ur song

### Create a CSV file to store audio features

In [215]:
def create_csv(track_name, csv_name):
    input_freq_data, sr = librosa.load('./audio/'+track_name, mono=True, duration=30)


    chroma_stft = librosa.feature.chroma_stft(y=input_freq_data, sr=sr)
    rmse = librosa.feature.rms(y=input_freq_data)
    spec_cent = librosa.feature.spectral_centroid(y=input_freq_data, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=input_freq_data, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=input_freq_data, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(input_freq_data)
    mfcc = librosa.feature.mfcc(y=input_freq_data, sr=sr)


    to_append = f'{track_name[:-4]} {np.mean(chroma_stft)} {np.var(chroma_stft)} {np.mean(rmse)} {np.var(rmse)} {np.mean(spec_cent)} {np.var(spec_cent)} {np.mean(spec_bw)} {np.var(spec_bw)} {np.mean(rolloff)} {np.var(rolloff)} {np.mean(zcr)} {np.var(zcr)}'
    for e in mfcc:
        to_append += f' {np.mean(e)}'
        to_append += f' {np.var(e)}'

    file = open(csv_name, 'a', newline='')
    with file:
        writer = csv.writer(file)
        writer.writerow(to_append.split())

def create_header(csv_name):
    header = 'filename chroma_stft_mean chroma_stft_var rms_mean rms_var spectral_centroid_mean spectral_centroid_var spectral_bandwidth_mean spectral_bandwidth_var rolloff_mean rolloff_var zero_crossing_rate_mean zero_crossing_rate_var'
    for i in range(1, 21):
        header += f' mfcc{i}_mean'
        header += f' mfcc{i}_var'
    header += ' label'
    header = header.split()

    file = open(csv_name, 'w', newline='')
    with file:
        writer = csv.writer(file)
        writer.writerow(header)


In [216]:
csv_name = 'data.csv'
tracks = os.listdir('./audio')
# track_name = tracks[2]
print(tracks)
create_header(csv_name)
for track in tracks:
    create_csv(track, csv_name)

['hiphop.00000.wav', 'metal.00019.wav', 'country.00000.wav', 'forward.wav', 'gangplank_galleon.wav', 'blues.00000.wav', 'classical.00000.wav']


### Now let's read our input data.

In [217]:
# now let's
in_data = None
for track in tracks:
    in_data = pd.read_csv(csv_name)
    in_data = in_data.drop(['filename', 'label'],axis=1)

In [218]:
in_data.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.460648,0.085179,0.125286,0.002467,2667.598264,418696.624101,2350.982431,107871.586945,5474.52523,1564568.0,...,7.326542,55.438866,-5.37604,69.462151,7.688968,62.261711,-5.545129,63.314182,1.526966,40.949604
1,0.555233,0.062266,0.100896,0.000967,2699.573852,214777.169714,2324.221639,56802.723919,5385.359102,764867.1,...,5.403974,34.778641,-11.11848,23.151911,2.6868,33.295479,-5.69047,24.402334,3.097618,25.05122
2,0.383708,0.081253,0.248499,0.000705,2797.736904,355754.332506,2847.642919,105961.88814,6220.395719,1801483.0,...,0.361101,68.738335,0.585364,54.161255,4.028057,74.087738,0.928209,68.705605,4.090144,73.815948
3,0.208752,0.086042,0.094637,0.002692,976.691592,261171.317088,1263.216057,211062.187858,1534.515721,857256.1,...,-2.265531,104.791649,1.500498,138.381638,1.371981,138.501846,-0.68032,108.932625,0.270372,117.022636
4,0.338958,0.087362,0.116228,0.002324,1789.672791,344021.655175,2041.964027,130313.19843,3665.066528,1588140.0,...,3.19045,119.261002,-1.293239,111.640648,1.197762,71.990517,-3.442649,110.102531,5.27369,118.395699


In [219]:
data.head()

Unnamed: 0,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,...,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var
0,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,3805.839606,901505.4,...,0.75274,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035
1,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,2977893.0,...,0.927998,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282
2,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,784034.5,...,2.45169,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025
3,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,1493194.0,...,0.780874,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339
4,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,1572978.0,...,-4.520576,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516


In [220]:
in_arr = np.array(in_data.iloc[:, :-1], dtype = float)
scaler = StandardScaler()
arr = scaler.fit_transform(in_arr)  # THIS NEEDS TO HAVE MORE THAN 1 SONG----- WHAT
arr

# arr

array([[ 8.81809158e-01,  3.60605868e-01,  6.01135976e-02,
         7.26317223e-01,  9.80229458e-01,  1.34608545e+00,
         6.05665250e-01,  3.76296095e-02,  8.77572159e-01,
         8.95072059e-01,  1.20080321e+00,  1.41313336e+00,
         6.97736582e-01,  5.57581679e-01, -8.20819230e-01,
         8.86508886e-01, -1.05949459e+00,  1.22457511e+00,
        -6.23540238e-03,  2.31801561e+00, -1.37266162e+00,
         1.27367201e+00,  1.18015418e+00,  1.96949115e+00,
        -1.38727223e+00,  5.91851054e-01,  8.16526675e-01,
         1.07582321e+00, -9.17918352e-01,  1.40836445e+00,
         1.10429363e+00, -6.70618930e-01, -1.25255581e+00,
        -2.33242652e-01,  9.46683889e-01, -3.92747494e-01,
        -9.61825035e-01, -4.24396760e-02,  1.41637303e+00,
        -1.65032925e-01, -1.04770196e+00, -5.69863247e-01,
         1.54432887e+00, -6.78729079e-01, -7.02603385e-01,
        -1.74576093e-01,  2.09052108e+00, -3.56137205e-01,
        -1.27286485e+00, -4.12320075e-01, -2.81741440e-0

In [221]:
print('in_arr:')
in_arr

in_arr:


array([[ 4.60647821e-01,  8.51791427e-02,  1.25285760e-01,
         2.46655359e-03,  2.66759826e+03,  4.18696624e+05,
         2.35098243e+03,  1.07871587e+05,  5.47452523e+03,
         1.56456796e+06,  1.47340757e-01,  2.89199111e-03,
        -8.09067764e+01,  4.88270068e+03,  8.47926636e+01,
         7.50995667e+02, -3.15501862e+01,  5.49292664e+02,
         3.39019356e+01,  4.07607819e+02, -1.98550491e+01,
         1.58834610e+02,  2.35616112e+01,  1.35544418e+02,
        -2.21317692e+01,  9.35224228e+01,  2.00208302e+01,
         9.66761017e+01, -1.67695675e+01,  9.35621643e+01,
         1.92089195e+01,  5.57985191e+01, -1.31147356e+01,
         7.51671066e+01,  1.15034895e+01,  5.24700546e+01,
        -1.07319489e+01,  7.40129776e+01,  1.05702105e+01,
         7.19252090e+01, -8.42681408e+00,  6.49149857e+01,
         7.32654238e+00,  5.54388657e+01, -5.37603998e+00,
         6.94621506e+01,  7.68896770e+00,  6.22617111e+01,
        -5.54512882e+00,  6.33141823e+01,  1.52696586e+0

In [224]:
print(in_arr.shape)
print(X_test.shape)


(7, 51)
(200, 51)


In [225]:
predictions = model.predict(arr)



In [226]:
print("Here's the prediction strengths of each genre")
predictions[0]

Here's the prediction strengths of each genre


array([1.2254246e-12, 2.7869546e-10, 4.1412273e-12, 3.6798634e-03,
       9.9631482e-01, 6.2377772e-12, 2.4131002e-06, 6.1520407e-09,
       4.9883408e-07, 2.4884396e-06], dtype=float32)

In [227]:
np.sum(predictions[0]) # chesk to add to 1

1.0000001

In [228]:
# predictions holds each song
np.argmax(predictions[1])  # the genre

6

In [230]:

for i in range(len(tracks)):
    print("The predicted genre for " + str(tracks[i]) + " is " +str(genre_list[ np.argmax(predictions[i]) ])+ "!\n")

The predicted genre for hiphop.00000.wav is blues!

The predicted genre for metal.00019.wav is blues!

The predicted genre for country.00000.wav is blues!

The predicted genre for forward.wav is blues!

The predicted genre for gangplank_galleon.wav is blues!

The predicted genre for blues.00000.wav is blues!

The predicted genre for classical.00000.wav is blues!

