# Description

Training a CNN to recognize music genres. CNNs work best with images, but it turns out they can also work very well for audio signals when we treat spectrograms or MFCCs as greyscale images with depth = 1 and take the amplitudes as pixel values.

# Packages

In [1]:
from pathlib import Path

import numpy as np
import tensorflow as tf

from utils import load_data, split_dataset

# Loading data

In [2]:
# Relative path of the JSON file that load_data reads the features and labels from.
DATA_PATH = "preprocessed_raw_audio_data.json"

In [3]:
# Read the feature array and the label vector from DATA_PATH.
# load_data comes from the project-local `utils` module; its parsing logic is not shown here.
features, target = load_data(DATA_PATH)

In [None]:
# Sanity check of the loaded features.
# Presumably (examples, time frames, MFCC coefficients) — confirm against the preprocessing step.
features.shape

(9986, 130, 13)

In [7]:
# Sanity check: one label per example, so the length should match features.shape[0].
target.shape

(9986,)

In [4]:
# Partition the data into train / validation / test subsets.
# NOTE(review): judging by the recorded shapes, t_size is the test fraction of the
# whole set and v_size is the validation fraction of what remains, and the helper
# also appends a trailing channel axis — confirm in utils.split_dataset.
X_train, X_val, X_test, y_train, y_val, y_test = split_dataset(
    X=features,
    y=target,
    t_size=0.25,
    v_size=0.2,
)

In [5]:
# Sanity check of the training split; axes 1..3 of this shape are what the model uses as its input shape.
X_train.shape

(5991, 130, 13, 1)

# Model Definition

In [6]:
# Shape of a single example (everything after the batch axis of X_train).
input_shape = X_train.shape[1:]

model = tf.keras.Sequential([
    # Declare the input shape exactly once; every later layer infers its own
    # input shape from the layer before it, so none of them takes `input_shape`.
    tf.keras.Input(shape=input_shape),

    # first conv block: convolution -> overlapping max-pool (3x3 window, stride 2) -> batch norm
    tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding="same"),
    tf.keras.layers.BatchNormalization(),

    # second conv block: same configuration as the first
    tf.keras.layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    tf.keras.layers.MaxPool2D(pool_size=(3, 3), strides=(2, 2), padding="same"),
    tf.keras.layers.BatchNormalization(),

    # third conv block: smaller 2x2 kernel and 2x2 pooling window
    tf.keras.layers.Conv2D(filters=32, kernel_size=(2, 2), activation='relu'),
    tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2), padding="same"),
    tf.keras.layers.BatchNormalization(),

    # classifier head: flatten the feature maps, one hidden layer, dropout for regularisation
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=64, activation='relu'),
    tf.keras.layers.Dropout(0.3),

    # output: softmax over 10 units — presumably one per genre; confirm against the label set
    tf.keras.layers.Dense(10, activation='softmax'),
])




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [8]:
# Adam with a small step size. The sparse cross-entropy loss takes the labels
# as class indices rather than one-hot vectors.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Print the layer-by-layer overview of the compiled network.
model.summary()

# Training

In [9]:
# Keep the returned History object: it holds the per-epoch loss/accuracy values
# (history.history) for later inspection, and assigning it stops its repr from
# being echoed as the cell output.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=50,
)

Epoch 1/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.1826 - loss: 2.5696 - val_accuracy: 0.3558 - val_loss: 1.8172
Epoch 2/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.3626 - loss: 1.8531 - val_accuracy: 0.4593 - val_loss: 1.4800
Epoch 3/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.4231 - loss: 1.6310 - val_accuracy: 0.4920 - val_loss: 1.3569
Epoch 4/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4649 - loss: 1.4948 - val_accuracy: 0.5447 - val_loss: 1.2805
Epoch 5/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.4921 - loss: 1.4274 - val_accuracy: 0.5487 - val_loss: 1.2535
Epoch 6/50
[1m188/188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.5149 - loss: 1.3405 - val_accuracy: 0.5754 - val_loss: 1.2013
Epoch 7/50
[1m188/188[

<keras.src.callbacks.history.History at 0x27d4820f320>

# Evaluation and prediction

In [10]:
# evaluate() reports the loss first, followed by the compiled metric (accuracy).
evaluation = model.evaluate(x=X_test, y=y_test, verbose=1)
test_error, test_accuracy = evaluation

print(f"Accuracy on test set: {test_accuracy}")

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7401 - loss: 0.7558
Accuracy on test set: 0.7272727489471436


In [23]:
# Spot-check one held-out example: compare the predicted class with its label.
sample_idx = 1

# The model was built for batches, so give the single example a leading batch axis.
X = X_test[sample_idx][np.newaxis, ...]
y = y_test[sample_idx]

# predict() returns one row of class probabilities per example; take the most likely class.
class_probabilities = model.predict(X)
y_cap = tf.argmax(class_probabilities, axis=1)[0]

print(f"Predicted genre: {y_cap} \n Actual genre: {y}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Predicted genre: 6 
 Actual genre: 6


In [27]:
# Persist the trained model in the native .keras format.
MODEL_PATH = './models/cnn_genre_classifier.keras'

# Create the target directory first so that saving also works on a fresh
# checkout where ./models does not exist yet.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
model.save(MODEL_PATH)