In [4]:
# Imports
import os

import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.python.keras import layers, models
from tensorflow.python.keras import utils
from tensorflow.python.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.python.keras.models import Sequential
from keras.utils import to_categorical

from ape_paths import wav_path


In [27]:
def extract_from_genres(root_dir, genre):
    '''
    Collect the mel spectrograms and labels for one genre sub-directory.

    Every directory located directly under root_dir whose name equals genre
    is handed to extract_mel_spectrogram, using the directory name as label.

    Parameters:
    root_dir (path-like): directory whose entries are scanned
    genre (str): name of the sub-directory to process

    Returns:
    mel_specs (list): spectrograms returned by extract_mel_spectrogram
    full_labels (list): the matching labels, in the same order
    Both lists are empty when no sub-directory of root_dir is named genre.
    '''
    # Keep only the directory entries whose name is exactly the requested genre
    matching_dirs = [
        entry for entry in os.scandir(root_dir)
        if entry.is_dir() and entry.name == genre
    ]

    mel_specs, full_labels = [], []
    for genre_entry in matching_dirs:
        spects, labels = extract_mel_spectrogram(genre_entry, genre_entry.name)
        mel_specs.extend(spects)
        full_labels.extend(labels)
    return mel_specs, full_labels

In [25]:
def extract_mel_spectrogram(genre_dir, label, n_frames=660):
    '''
    Compute a fixed-width, dB-scaled mel spectrogram for every .wav file that
    sits directly inside one directory.

    Parameters:
    genre_dir (path-like): directory containing the .wav files; entries whose
    name does not end in '.wav' are ignored
    label (str): label stored once for every spectrogram that is extracted
    n_frames (int): number of time frames (columns) each spectrogram is cropped
    or padded to; defaults to 660

    Returns:
    mel_specs (list): one 2-D numpy array with n_frames columns per .wav file
    (the row count is whatever librosa.feature.melspectrogram produces with its
    default number of mel bands)
    labels (list): label repeated once per spectrogram, in the same order
    '''
    labels = []
    mel_specs = []

    # os.scandir yields entries in arbitrary order; sorting by name makes the
    # order of the returned spectrograms reproducible between runs.
    for file in sorted(os.scandir(genre_dir), key=lambda entry: entry.name):
        # Only .wav files are processed
        if not file.name.endswith('.wav'):
            continue

        # Loading in the audio file
        y, sr = librosa.load(file.path)

        # Computing the mel spectrogram and converting power to decibels
        spect = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=2048, hop_length=1024)
        spect = librosa.power_to_db(spect, ref=np.max)

        # Force exactly n_frames columns. ndarray.resize() must not be used
        # here: it reflows the flat memory buffer, which scrambles the
        # frequency/time layout whenever the column count changes.
        n_cols = spect.shape[1]
        if n_cols > n_frames:
            # Too long: keep the first n_frames time frames
            spect = spect[:, :n_frames]
        elif n_cols < n_frames:
            # Too short: pad the end with the quietest value of this clip
            fill_value = spect.min() if spect.size else 0.0
            spect = np.pad(
                spect,
                ((0, 0), (0, n_frames - n_cols)),
                mode='constant',
                constant_values=fill_value,
            )

        mel_specs.append(spect)
        labels.append(label)
    return mel_specs, labels

In [1]:
# Lookup table from genre name to integer class index. The names are listed
# alphabetically and numbered 0-17 in that order.
genre_dict = {
    name: index
    for index, name in enumerate((
        'Ambient Electronic',
        'Chiptune',
        'Classical',
        'Country',
        'Electronic',
        'Folk',
        'Hip-Hop',
        'Indie-Rock',
        'Jazz',
        'Metal',
        'Pop',
        'Post-Rock',
        'Psych-Rock',
        'Punk',
        'Reggae',
        'Rock',
        'Techno',
        'Trip-Hop',
    ))
}

KeyError: 0

In [28]:
# Extract the mel spectrograms one genre at a time with a single loop instead
# of one copy-pasted cell per genre. Genres are visited in class-index order
# (0-17), which is the order the per-genre results used to be concatenated in.
features_by_genre = {}  # genre name -> (spectrograms, labels), kept for inspection
X = []
y = []
for genre in sorted(genre_dict, key=genre_dict.get):
    genre_specs, genre_labels = extract_from_genres(wav_path, genre)
    features_by_genre[genre] = (genre_specs, genre_labels)
    X += genre_specs
    y += genre_labels

In [59]:
# Sanity check: number of items in X, then the lengths of the first two axes
# of the third item.
third_item = X[2]
print(len(X), len(third_item), len(third_item[0]))

18 470 128


In [55]:
# Number of labels collected; expected to equal len(X) because X and y are
# built in parallel from the same extraction results.
print(len(y))

18


In [60]:
# Convert the list of spectrograms into one numpy array.
# NOTE(review): this only produces a regular numeric array when every item in
# X has the same shape - confirm before feeding it to a model.
X = np.array(X)

  X = np.array(X)


In [None]:
# Remove the 'filename' column; every other column of data_raw is kept.
data_no_filenames = data_raw.drop(columns=['filename'])

In [None]:
# Display (rows, columns) of the frame after the filename column was dropped.
data_no_filenames.shape
#data_no_filenames.head()

In [None]:
# First of the three feature sets used for classification.
# data_mfccs_plus = data_no_filenames without the higher-order MFCC columns
# (mfcc14 through mfcc20); every other column, including genre, is retained.
mfcc = 'mfcc'
mfcc_list = ['%s%d' % (mfcc, number) for number in range(14, 21)]
data_mfccs_plus = data_no_filenames.drop(columns=mfcc_list)
print(data_mfccs_plus)

In [None]:
# Turn the genre names (last column of data_mfccs_plus) into integer class ids.
# encoder.inverse_transform(genres_y) maps the ids back to the genre names.
encoder = LabelEncoder()
genres_list = data_mfccs_plus.iloc[:, -1]
genres_y = encoder.fit(genres_list).transform(genres_list)
genres_y.shape

In [None]:
# Keep every column from 'mfcc1' through the end of data_mfccs_plus.
mfcc1_position = data_mfccs_plus.columns.get_loc('mfcc1')
data_mfccs_only = data_mfccs_plus.iloc[:, mfcc1_position:]
data_mfccs_only.shape

In [None]:
# Third feature set: the non-MFCC feature columns plus mfcc2 only.
# mfcc3 through mfcc13 and mfcc1 are removed from data_mfccs_plus.
mfcc2_drop = ['%s%d' % (mfcc, number) for number in range(3, 14)]
data_feat_mfcc2 = data_mfccs_plus.drop(columns=mfcc2_drop + ['mfcc1'])
data_feat_mfcc2.shape

In [None]:
# Standardize the feature columns of each data set.
# a. All features: every column of data_no_filenames except the last one.
scaler = StandardScaler()
all_feature_columns = data_no_filenames.iloc[:, :-1]
features_X_a = scaler.fit_transform(all_feature_columns.astype(float).values)
features_X_a.shape

In [None]:
# b. MFCCs 1-13 only: every column of data_mfccs_only except the last one.
scaler = StandardScaler()
mfcc_feature_columns = data_mfccs_only.iloc[:, :-1]
features_X_b = scaler.fit_transform(mfcc_feature_columns.astype(float).values)
print(features_X_b)

In [None]:
# c. Non-MFCC features plus mfcc2: every column of data_feat_mfcc2 except the
# last one.
scaler = StandardScaler()
reduced_feature_columns = data_feat_mfcc2.iloc[:, :-1]
features_X_c = scaler.fit_transform(reduced_feature_columns.astype(float).values)
features_X_c.shape

In [None]:
# CLASSIFICATION - dividing the data into training and testing sets.
# All three feature sets are split in ONE seeded call so that
#   * re-running the notebook reproduces the same split (random_state), and
#   * models A, B and C are trained and tested on the same rows, which makes
#     their scores directly comparable.
# NOTE(review): the features were standardized on the full data set before this
# split, so test-set statistics leak into the scaling - consider fitting the
# scalers on the training rows only.
(X_train_a, X_test_a,
 X_train_b, X_test_b,
 X_train_c, X_test_c,
 y_train_a, y_test_a) = train_test_split(
    features_X_a, features_X_b, features_X_c, genres_y,
    test_size=0.2, random_state=42)

# The rows are shared, so the label arrays are identical for every feature set.
y_train_b = y_train_c = y_train_a
y_test_b = y_test_c = y_test_a

In [None]:
# Print the sizes of the A/B/C variants of each split, one line per split,
# followed by the total number of labels.
split_groups = (
    (X_train_a, X_train_b, X_train_c),
    (X_test_a, X_test_b, X_test_c),
    (y_train_a, y_train_b, y_train_c),
    (y_test_a, y_test_b, y_test_c),
)
for group in split_groups:
    print(*[len(part) for part in group])
print(len(genres_y))

In [None]:
# Classification with Keras
# ReLU - rectified linear unit activation; with default arguments it returns
# max(x, 0), the element-wise maximum of 0 and the input tensor.
# https://keras.io/api/layers/activations/
# Softmax - the normalized exponential function, a generalization of the
# logistic function to multiple dimensions. It is used as the last activation
# of a network to normalize its output to a probability distribution over the
# predicted classes.
# https://en.wikipedia.org/wiki/Softmax_function

def build_dense_classifier(n_features, n_classes=20, hidden_units=(512, 256, 128, 64)):
    """Build an uncompiled, fully connected softmax classifier.

    Parameters:
    n_features (int): number of input columns each sample has
    n_classes (int): number of units in the softmax output layer
    hidden_units (sequence of int): width of each ReLU hidden layer, in order

    Returns:
    A Sequential model: one Dense ReLU layer per entry of hidden_units followed
    by a Dense softmax layer with n_classes units.
    """
    model = models.Sequential()
    # Only the first layer that is added declares the input size
    first_layer_options = {'input_shape': (n_features,)}
    for units in hidden_units:
        model.add(layers.Dense(units, activation='relu', **first_layer_options))
        first_layer_options = {}
    model.add(layers.Dense(n_classes, activation='softmax', **first_layer_options))
    return model


# Building our networks: one identical architecture per feature set.
# NOTE(review): the default of 20 output units is kept from the original code;
# sparse categorical cross-entropy needs every label id to be below n_classes -
# confirm against the number of encoded genres.
modelA = build_dense_classifier(X_train_a.shape[1])
modelB = build_dense_classifier(X_train_b.shape[1])
modelC = build_dense_classifier(X_train_c.shape[1])

In [None]:
# Configure all three models with the same optimizer, loss and metric.
# Adam optimizer - https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adam
# Sparse categorical cross-entropy compares integer class labels with the
# predicted class probabilities.
# Accuracy reports how often the prediction equals the label.
for network in (modelA, modelB, modelC):
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )



In [None]:
# Number of training epochs, passed as epochs=epochs to every model.fit call
# in this notebook.
epochs = 20

In [None]:
# Train model A on its full training split.
#   first argument  - input data (numpy array)
#   second argument - target data (numpy array aligned with the inputs)
#   epochs          - number of passes over the whole training data
#   batch_size      - number of samples per gradient update
historyA = modelA.fit(
    X_train_a,
    y_train_a,
    batch_size=128,
    epochs=epochs,
)

In [None]:
# Train model B on its full training split with the same settings as model A.
historyB = modelB.fit(
    X_train_b,
    y_train_b,
    batch_size=128,
    epochs=epochs,
)

In [None]:
# Train model C on its full training split with the same settings as model A.
historyC = modelC.fit(
    X_train_c,
    y_train_c,
    batch_size=128,
    epochs=epochs,
)

In [None]:
# Score each trained model on its test split. evaluate() returns the loss
# followed by the compiled metrics, so index 0 is the loss and index 1 is the
# accuracy. verbose=0 silences the progress output.
testA = modelA.evaluate(X_test_a, y_test_a, verbose=0)
testB = modelB.evaluate(X_test_b, y_test_b, verbose=0)
testC = modelC.evaluate(X_test_c, y_test_c, verbose=0)

labelled_scores = (('A', testA), ('B', testB), ('C', testC))

# All losses first, then all accuracies
for tag, scores in labelled_scores:
    print('Test loss %s: ' % tag, scores[0])
for tag, scores in labelled_scores:
    print('Test accuracy %s: ' % tag, scores[1])

In [None]:
# Validation approach: hold out the first 200 training samples of every
# feature set as a validation set and keep the remainder for fitting.
x_val_a, partial_x_train_a = X_train_a[:200], X_train_a[200:]
x_val_b, partial_x_train_b = X_train_b[:200], X_train_b[200:]
x_val_c, partial_x_train_c = X_train_c[:200], X_train_c[200:]

# The labels are cut at the same position so they stay aligned with the inputs
y_val_a, partial_y_train_a = y_train_a[:200], y_train_a[200:]
y_val_b, partial_y_train_b = y_train_b[:200], y_train_b[200:]
y_val_c, partial_y_train_c = y_train_c[:200], y_train_c[200:]

In [None]:
# Rebuild and recompile the three networks so the validation run starts from
# newly created models rather than the ones trained above.
# A
modelA = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(X_train_a.shape[1],)),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(20, activation='softmax'),
])

# B
modelB = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(X_train_b.shape[1],)),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(20, activation='softmax'),
])

# C
modelC = models.Sequential([
    layers.Dense(512, activation='relu', input_shape=(X_train_c.shape[1],)),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(20, activation='softmax'),
])

# Same optimizer, loss and metric for every model
for fresh_network in (modelA, modelB, modelC):
    fresh_network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )

In [None]:
# Fit model A on the reduced training set while monitoring the held-out
# validation samples, then score it on the test split.
historyA = modelA.fit(
    partial_x_train_a,
    partial_y_train_a,
    validation_data=(x_val_a, y_val_a),
    batch_size=512,
    epochs=epochs,
)
resultsA = modelA.evaluate(X_test_a, y_test_a)
print(resultsA)

In [None]:
# Fit model B on the reduced training set while monitoring the held-out
# validation samples, then score it on the test split.
historyB = modelB.fit(
    partial_x_train_b,
    partial_y_train_b,
    validation_data=(x_val_b, y_val_b),
    batch_size=512,
    epochs=epochs,
)
resultsB = modelB.evaluate(X_test_b, y_test_b)
print(resultsB)

In [None]:
# Fit model C on the reduced training set while monitoring the held-out
# validation samples, then score it on the test split.
historyC = modelC.fit(
    partial_x_train_c,
    partial_y_train_c,
    validation_data=(x_val_c, y_val_c),
    batch_size=512,
    epochs=epochs,
)
resultsC = modelC.evaluate(X_test_c, y_test_c)
print(resultsC)