In [22]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow.keras as keras # type: ignore
import matplotlib.pyplot as plt

# path to json file that stores MFCCs and genre labels for each processed segment
DATA_PATH = "data_10.json"

In [23]:
def load_data(data_path):
    """Loads training dataset from json file.

    :param data_path (str): Path to json file containing data
    :return X (ndarray): Inputs
    :return y (ndarray): Targets
    """

    with open(data_path, "r") as fp:
        data = json.load(fp)

    # convert lists to numpy arrays
    X = np.array(data["mfcc"])
    y = np.array(data["labels"])

    print("Data successfully loaded!")

    return X, y


In [24]:
def prepare_datasets(test_size, validation_size):
    
    # load data
    X,y = load_data(DATA_PATH)
    
    # create train/test split
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=test_size)
    # create train/validation split
    X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=validation_size)
    
    #3d array -> (130, 13, 1)
    X_train = X_train[..., np.newaxis]
    X_validation = X_validation[..., np.newaxis]
    X_test = X_test[..., np.newaxis]
    
    return X_train, X_validation, X_test, y_train, y_validation, y_test


# create train, validation and test set
X_train, X_validation, X_test, y_train, y_validation, y_test = prepare_datasets(0.25,0.2)



Data successfully loaded!


In [25]:
#build the cnn net
def build_model(input_shape):
    model = keras.Sequential()
    #1 conv layer
    model.add(keras.layers.Conv2D(32, (3,3), activation='relu', input_shape = input_shape))
    model.add(keras.layers.MaxPool2D((3,3), strides = (2,2), padding='same'))
    model.add(keras.layers.BatchNormalization()) 

    #2 conv layer
    model.add(keras.layers.Conv2D(32, (3,3), activation='relu', input_shape = input_shape))
    model.add(keras.layers.MaxPool2D((3,3), strides = (2,2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    #3 conv layer
    model.add(keras.layers.Conv2D(32, (2,2), activation='relu', input_shape = input_shape))
    model.add(keras.layers.MaxPool2D((2,2), strides = (2,2), padding='same'))
    model.add(keras.layers.BatchNormalization())

    #flatten the output and feed it into dense layer
    model.add(keras.layers.Flatten())
    model.add(keras.layers.Dense(64, activation = "relu"))
    model.add(keras.layers.Dropout(0.3))

    #output layer(softmax)
    model.add(keras.layers.Dense(10, activation = 'softmax'))

    return model
    

In [26]:

input_shape = (X_train.shape[1], X_train.shape[2], 1) 
model = build_model(input_shape)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [27]:
#compile
optimizer = keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer = optimizer, loss = "sparse_categorical_crossentropy", metrics = ['accuracy'])

In [28]:
#train
model.fit(X_train,y_train, validation_data=(X_validation,y_validation), batch_size=32, epochs=30)

Epoch 1/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.1507 - loss: 2.7239 - val_accuracy: 0.3500 - val_loss: 1.8330
Epoch 2/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.3414 - loss: 1.8784 - val_accuracy: 0.4473 - val_loss: 1.5484
Epoch 3/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.4031 - loss: 1.6751 - val_accuracy: 0.4827 - val_loss: 1.4225
Epoch 4/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.4467 - loss: 1.5514 - val_accuracy: 0.5100 - val_loss: 1.3435
Epoch 5/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.4730 - loss: 1.4639 - val_accuracy: 0.5307 - val_loss: 1.2888
Epoch 6/30
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step - accuracy: 0.5102 - loss: 1.3556 - val_accuracy: 0.5413 - val_loss: 1.2413
Epoch 7/30
[1m188/188

<keras.src.callbacks.history.History at 0x1bb832b96c0>

In [29]:
#eval
test_error, test_accuracy = model.evaluate(X_test, y_test, verbose = 1)
print("accuracy on test set it: {}".format(test_accuracy))

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7174 - loss: 0.8134
accuracy on test set it: 0.7114846110343933


In [36]:
# prediction on a sample
genres = ["blues", "classical", "country", "disco", "hip-pop", "jazz", "metal", "pop", "reggae", "rock"]

def predict(model, X, y):
    X = X[np.newaxis]  # augumented 3d array X to a 4d array
    # prediction is a 2d array [[0.1,0.2,...]]
    prediction = model.predict(X)  # X -> (1, 130, 13, 1)
    # extract index with max values in prediction
    prediction_index = np.argmax(prediction, axis = 1) # get [index], then map it to the genre label
    print("expected index: {}, Predicted index: {}".format(y, prediction_index))
    print(prediction_index)
    predicted_genre = genres[prediction_index[0]]
    y_genre = genres[y]
    print("expected genre: {}, Predicted genre: {}".format(y_genre, predicted_genre))
    

X = X_test[100]
y = y_test[100]
predict(model, X, y)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
expected index: 9, Predicted index: [9]
[9]
expected genre: rock, Predicted genre: rock
