In [None]:
import pandas as pd
import numpy as np
import os
import pathlib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import RMSprop
import tensorflow.compat.v1 as tfv1
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive so the project files on Drive are
# reachable from this Colab runtime; force_remount=True is passed so the mount
# is redone even when a previous mount exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Work from the shared project folder so the relative './Data/...' paths used
# by the loading cells below resolve.
%cd '/content/drive/My Drive/Colab Notebooks/OSU/CS467_shared'

/content/drive/.shortcut-targets-by-id/1Y2mRkkRB6wyTJ1-1zFboT6i8X4wUbIq6/CS467_shared


In [None]:
# --- training hyperparameters ---
BATCH_SIZE, NUM_EPOCHS = 32, 20

# --- k-fold settings and per-fold result containers ---
num_folds = 10
fold_no = 1  # 1-based fold counter, advanced by the cross-validation loop
acc_per_fold, loss_per_fold = [], []

# --- image geometry ---
img_height, img_width = 200, 335
IMG_PIXELS = img_height * img_width  # 67000 values per flattened image

In [None]:
# Genre names. A genre's position in this list is used as its integer class id
# when the CSV labels are encoded.
label_strings = [
    'blues',
    'classical',
    'country',
    'disco',
    'hiphop',
    'jazz',
    'metal',
    'pop',
    'reggae',
    'rock',
]

In [None]:
# Load GTZAN directly as a TensorFlow dataset instead of importing every image
# and converting it afterwards, which saves memory. Each element is declared as
# a flat int32 vector of IMG_PIXELS values paired with a scalar int32 label.
GTZAN_dataset = tf.data.experimental.load(
    path='./Data/GTZAN_dataset',
    element_spec=(
        tf.TensorSpec(shape=(IMG_PIXELS,), dtype=tf.int32),
        tf.TensorSpec(shape=(), dtype=tf.int32),
    ),
)

In [None]:
# Materialise the TensorFlow dataset as two parallel NumPy arrays: one row of
# pixel values per image, and the matching integer label.
image_rows, label_rows = [], []
for pixels, genre_id in GTZAN_dataset:
  image_rows.append(pixels.numpy())
  label_rows.append(genre_id.numpy())

images = np.asarray(image_rows)
labels = np.asarray(label_rows)

In [None]:
# Divide by 255 so pixel values lie between 0 and 1 (assumes 8-bit intensities),
# then turn each flat row back into a single-channel (height, width, 1) image.
images = (images / 255.0).reshape(len(images), img_height, img_width, 1)

# One-hot encode the integer labels over the 10 genres.
labels = tf.keras.utils.to_categorical(labels, num_classes=10)

In [None]:
# Read the tabular feature dataset and preview its first rows.
features_csv = './Data/features_30_sec.csv'
data = pd.read_csv(features_csv)
data.head(n=5)

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,rolloff_mean,rolloff_var,zero_crossing_rate_mean,zero_crossing_rate_var,harmony_mean,harmony_var,perceptr_mean,perceptr_var,tempo,mfcc1_mean,mfcc1_var,mfcc2_mean,mfcc2_var,mfcc3_mean,mfcc3_var,mfcc4_mean,mfcc4_var,mfcc5_mean,mfcc5_var,mfcc6_mean,mfcc6_var,mfcc7_mean,mfcc7_var,mfcc8_mean,mfcc8_var,mfcc9_mean,mfcc9_var,mfcc10_mean,mfcc10_var,mfcc11_mean,mfcc11_var,mfcc12_mean,mfcc12_var,mfcc13_mean,mfcc13_var,mfcc14_mean,mfcc14_var,mfcc15_mean,mfcc15_var,mfcc16_mean,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,3805.839606,901505.4,0.083045,0.000767,-4.529724e-05,0.008172,8e-06,0.005698,123.046875,-113.570648,2564.20752,121.571793,295.913818,-19.168142,235.574432,42.366421,151.106873,-6.364664,167.934799,18.623499,89.18084,-13.704891,67.660492,15.34315,68.932579,-12.27411,82.204201,10.976572,63.386311,-8.326573,61.773094,8.803792,51.244125,-3.6723,41.217415,5.747995,40.554478,-5.162882,49.775421,0.75274,52.42091,-1.690215,36.524071,-0.408979,41.597103,-2.303523,55.062923,1.221291,46.936035,blues
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,3550.522098,2977893.0,0.05604,0.001448,0.0001395807,0.005099,-0.000178,0.003063,67.999589,-207.501694,7764.555176,123.991264,560.259949,8.955127,572.810913,35.877647,264.506104,2.90732,279.932922,21.510466,156.477097,-8.560436,200.849182,23.370686,142.555954,-10.099661,166.108521,11.900497,104.358612,-5.555639,105.17363,5.376327,96.197212,-2.23176,64.914291,4.22014,73.152534,-6.012148,52.422142,0.927998,55.356403,-0.731125,60.314529,0.295073,48.120598,-0.283518,51.10619,0.531217,45.786282,blues
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,3042.260232,784034.5,0.076291,0.001007,2.105576e-06,0.016342,-1.9e-05,0.007458,161.499023,-90.722595,3319.044922,140.446304,508.765045,-29.093889,411.781219,31.684334,144.090317,-13.984504,155.493759,25.764742,74.548401,-13.664875,106.981827,11.639934,106.574875,-11.783643,65.447945,9.71876,67.908859,-13.133803,57.781425,5.791199,64.480209,-8.907628,60.385151,-1.077,57.711136,-9.229274,36.580986,2.45169,40.598766,-7.729093,47.639427,-1.816407,52.382141,-3.43972,46.63966,-2.231258,30.573025,blues
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,2184.745799,1493194.0,0.033309,0.000423,4.583644e-07,0.019054,-1.4e-05,0.002712,63.024009,-199.544205,5507.51709,150.090897,456.505402,5.662678,257.161163,26.859079,158.267303,1.771399,268.034393,14.234031,126.794128,-4.832006,155.912079,9.286494,81.273743,-0.759186,92.11409,8.137607,71.314079,-3.200653,110.236687,6.079319,48.251999,-2.480174,56.7994,-1.079305,62.289902,-2.870789,51.651592,0.780874,44.427753,-3.319597,50.206673,0.636965,37.31913,-0.619121,37.259739,-3.407448,31.949339,blues
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,3579.757627,1572978.0,0.101461,0.001954,-1.756129e-05,0.004814,-1e-05,0.003094,135.999178,-160.337708,5195.291992,126.219635,853.784729,-35.587811,333.792938,22.148071,193.4561,-32.4786,336.276825,10.852294,134.831573,-23.352329,93.257095,0.498434,124.672127,-11.793437,130.073349,1.207256,99.675575,-13.088418,80.254066,-2.813867,86.430626,-6.933385,89.555443,-7.552725,70.943336,-9.164666,75.793404,-4.520576,86.099236,-5.454034,75.269707,-0.916874,53.613918,-4.404827,62.910812,-11.703234,55.19516,blues


In [None]:
# Translate each row's genre name (the last CSV column) into its integer class
# id, i.e. its position in label_strings.
genre_list = data.iloc[:, -1]
feat_labels = np.array([label_strings.index(genre) for genre in genre_list])

In [None]:
# Drop the non-numeric filename column, then standardise every remaining column
# except the trailing label column.
data = data.drop(columns=['filename'])
scaler = StandardScaler()
feat_values = scaler.fit_transform(data.iloc[:, :-1].to_numpy(dtype=float))

# One-hot encode the feature labels over the 10 genres.
feat_labels = tf.keras.utils.to_categorical(feat_labels, num_classes=10)

## Now we get to the K-fold bits.
Our k-fold harness only understands 2-dimensional arrays. The features dataframe fits this stipulation, but our images array does not. Moreover, we can only pass a single data object to the split function that parses the data into randomly allocated train and test sets. Therefore we need to flatten each image into a one-dimensional row, then append those rows onto our features, but only temporarily.

In [None]:
# Show the shapes of the two inputs before they are merged.
print(feat_values.shape)
print(images.shape)

# Flatten every image to one row so it can sit next to its feature vector.
# The row count comes from the array itself and the row length is inferred
# (-1), so nothing here is tied to a particular dataset size or image size.
# reshape alone is enough; a separate flatten() would only add a full copy.
flat_images = images.reshape(images.shape[0], -1)
print(flat_images.shape)

# Column layout of data_master: [scaled features | flattened image pixels].
data_master = np.concatenate((feat_values, flat_images), axis=1)

print(data_master.shape)

(999, 58)
(999, 200, 335, 1)
(999, 67000)
(999, 67058)


In [None]:
#k-fold cross validation test harness

def _build_hybrid_model(num_feats):
  """Return a freshly compiled two-input model (feature vector + image).

  num_feats: number of columns in the tabular feature input.
  The image input is fixed at (img_height, img_width, 1) and the output is a
  10-way softmax.
  """
  # feature layers
  feat_input = keras.Input(shape=(num_feats,), name='feat_input')
  x = keras.layers.Dense(256, activation='relu')(feat_input)
  x = keras.layers.Dense(128, activation='relu')(x)
  feat_layers = keras.layers.Dense(64, activation='relu')(x)

  # image convolutional layers
  img_input = keras.Input(shape=(img_height, img_width, 1), name="img_input")
  x = keras.layers.Conv2D(32, kernel_size=(5, 5), activation='relu')(img_input)
  x = keras.layers.MaxPooling2D(pool_size=(4, 4), strides=(4,4))(x)
  x = keras.layers.Dropout(0.25)(x)
  x = keras.layers.Flatten()(x)
  img_layers = keras.layers.Dropout(0.5)(x)

  # concatenate img layers with feature layers and define output layer
  combined = keras.layers.concatenate([img_layers, feat_layers])
  out_layer = keras.layers.Dense(10, activation='softmax')(combined)

  # define model with both image and feature inputs, then compile it
  hybrid = keras.Model(inputs=[feat_input, img_input], outputs=out_layer)
  hybrid.compile(optimizer='adam',
    loss=keras.losses.categorical_crossentropy,
    metrics=['accuracy'])
  return hybrid


# Start from a clean slate so that re-running this cell does not append to the
# results of an earlier run (which would corrupt the per-fold summary).
fold_no = 1
acc_per_fold = []
loss_per_fold = []

# data_master columns are [features | image pixels]; the split point is the
# number of feature columns.
num_feats = feat_values.shape[1]

# A fixed random_state makes the fold assignment repeatable between runs.
# Model weight initialisation is still unseeded, so scores can vary slightly.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# K-fold Cross Validation model evaluation
for train, test in kf.split(data_master, labels):
  # Drop graph state left over from the previous fold's model.
  keras.backend.clear_session()

  # Gather each fold's rows once; every fancy-indexing call copies the rows.
  train_rows = data_master[train]
  test_rows = data_master[test]

  #reshape our newly split data back to their original shapes
  k_train_images = train_rows[:, num_feats:].reshape(
      len(train), img_height, img_width, 1)
  k_test_images = test_rows[:, num_feats:].reshape(
      len(test), img_height, img_width, 1)

  # The feature columns were standardised using all rows, which lets test-fold
  # statistics leak into training. Re-standardise with statistics from the
  # training rows only; because standardisation is unaffected by an earlier
  # per-column shift/scale, this matches fitting the scaler on the raw
  # training rows.
  fold_scaler = StandardScaler().fit(train_rows[:, :num_feats])
  k_train_feats_values = fold_scaler.transform(train_rows[:, :num_feats])
  k_test_feats_values = fold_scaler.transform(test_rows[:, :num_feats])

  # The gathered rows are no longer needed; free them before training.
  del train_rows, test_rows

  #the rest looks mostly like the hybrid model Josh built for us.
  model = _build_hybrid_model(k_train_feats_values.shape[1])

  # Generate a print
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # fit model
  history = model.fit(x=[k_train_feats_values, k_train_images], y=labels[train],
                    batch_size=BATCH_SIZE, epochs=NUM_EPOCHS)

  # Generate generalization metrics and append to respective containers
  scores = model.evaluate([k_test_feats_values, k_test_images],
                          y=labels[test], verbose=2)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]};'
        f'   {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

(100, 10)
(899, 10)
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
4/4 - 0s - loss: 0.4755 - accuracy: 0.8300
Score for fold 1: loss of 0.4755220413208008;   accuracy of 82.99999833106995%
(100, 10)
(899, 10)
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
4/4 - 0s - loss: 0.6848 - accuracy: 0.8600
Score for fold 2: loss of 0.684782862663269;   accuracy of 86.00000143051147%
(100, 10)
(899, 10)
-----------------------------------------

In [None]:
# == Provide average scores ==
# Print one line per fold (loss and accuracy), then the mean accuracy with its
# standard deviation and the mean loss across all recorded folds.
divider = '------------------------------------------------------------------------'

print(divider)
print('Score per fold')
for idx, fold_acc in enumerate(acc_per_fold):
  print(divider)
  print(f'> Fold {idx+1} - Loss: {loss_per_fold[idx]} - Accuracy: {fold_acc}%')
print(divider)
print('Average scores for all folds:')
mean_acc, std_acc = np.mean(acc_per_fold), np.std(acc_per_fold)
print(f'> Accuracy: {mean_acc} (+- {std_acc})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print(divider)

------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
> Fold 1 - Loss: 0.49381014704704285 - Accuracy: 82.99999833106995%
------------------------------------------------------------------------
> Fold 2 - Loss: 0.5414174795150757 - Accuracy: 80.0000011920929%
------------------------------------------------------------------------
> Fold 3 - Loss: 0.689606785774231 - Accuracy: 83.99999737739563%
------------------------------------------------------------------------
> Fold 4 - Loss: 0.5572614073753357 - Accuracy: 81.00000023841858%
------------------------------------------------------------------------
> Fold 5 - Loss: 0.5173542499542236 - Accuracy: 81.00000023841858%
------------------------------------------------------------------------
> Fold 6 - Loss: 0.6691000461578369 - Accuracy: 81.99999928474426%
------------------------------------------------------------------------


An accuracy of 76% - 84% is pretty dang good! 