## This notebook uses a pre-trained model to classify music genres based on image-text pairs

In [None]:
# Data preprocessing
import pandas as pd
import numpy as np
import os, librosa
from tqdm import tqdm
from sklearn.metrics import confusion_matrix


# Visualization
import IPython.display as ipd
import matplotlib.pyplot as plt
import seaborn as sns

#tf
import keras
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras import layers, Model, Input
import tensorflow_hub as hub
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import layers, Sequential
from keras.callbacks import EarlyStopping

In [None]:
# Preview one example clip: audio player, waveform and spectrogram
audioFile = 'data/genres_original/blues/blues.00000.wav'
waveform, sampleRate = librosa.load(audioFile)
print('Class : Blues\n')
ipd.display(ipd.Audio(waveform, rate=sampleRate))

# One wide figure holding two side-by-side panels
fig, (wave_ax, spec_ax) = plt.subplots(1, 2, figsize=(15, 4))

# Left panel: amplitude of every sample in the clip
wave_ax.plot(waveform)
wave_ax.set_title('Waveform', fontsize=16)
wave_ax.set_xlabel('Sample Index', fontsize=12)
wave_ax.set_ylabel('Amplitude', fontsize=12)

# Right panel: spectrogram, with the time axis scaled by the sample rate
spec_ax.specgram(waveform, Fs=sampleRate)
spec_ax.set_title('Spectrogram', fontsize=16)
spec_ax.set_xlabel('Time (s)', fontsize=12)
spec_ax.set_ylabel('Frequency (Hz)', fontsize=12)

plt.show()

In [None]:
def get_melspectrogram(audioFile, sampleRate=22050, n_mels=140, n_frames=1293,
                       fmax=9000, n_fft=2048, hop_length=512):
    """Return a fixed-size log-mel spectrogram (in dB) for one audio file.

    Args:
        audioFile: Path of the audio file to load.
        sampleRate: Sampling rate the audio is loaded at and analysed with.
        n_mels: Number of mel bands (rows of the result).
        n_frames: Number of time frames (columns of the result). With the
            defaults, 1293 frames * 512 hop / 22050 Hz is about 30 seconds.
        fmax: Highest frequency covered by the mel filter bank.
        n_fft: FFT window length in samples.
        hop_length: Number of samples between successive frames.

    Returns:
        A 2-D array of shape ``(n_mels, n_frames)``. Shorter clips are padded
        on the time axis by repeating the last frame; longer clips are cut.
    """
    # Only the samples are needed; the rate is already known from `sampleRate`.
    samples, _ = librosa.load(audioFile, sr=sampleRate)
    mel_power = librosa.feature.melspectrogram(
        y=samples,
        sr=sampleRate,
        n_mels=n_mels,
        fmax=fmax,
        n_fft=n_fft,
        hop_length=hop_length,
    )
    # dB scale relative to the loudest bin of this clip
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    # Force every clip to the same width so the results can be stacked
    return librosa.util.fix_length(mel_db, size=n_frames, mode='edge', axis=1)

# Root directory containing the audio files

def load_data(path_to_root, split: str):
    """Load mel-spectrogram features and one-hot labels for one data split.

    The directory ``<path_to_root>/<split>`` is expected to contain one
    sub-folder per genre, each holding the audio files of that genre.

    Args:
        path_to_root: Directory that contains the split folders.
        split: Name of the split sub-folder, e.g. ``'train'``.

    Returns:
        A tuple ``(X, y)`` of two equally long lists: ``X`` holds the result
        of ``get_melspectrogram`` for every file and ``y`` the matching
        one-hot label vector.

    Raises:
        KeyError: If a (non-hidden) sub-folder is not a known genre name.
    """
    # Alphabetical codes, identical to the shared genre_codes / id2label
    # tables that are used to label the confusion matrix axes.
    label_mapping = {
        "blues": 0, "classical": 1, "country": 2, "disco": 3, "hiphop": 4,
        "jazz": 5, "metal": 6, "pop": 7, "reggae": 8, "rock": 9,
    }
    root = os.path.join(path_to_root, split)
    X = []
    y = []
    # sorted(): os.listdir order is arbitrary, sorting keeps runs reproducible
    for folder in sorted(os.listdir(root)):
        folderPath = os.path.join(root, folder)
        # Ignore stray files and hidden entries (.DS_Store, .ipynb_checkpoints)
        if folder.startswith('.') or not os.path.isdir(folderPath):
            continue
        label = label_mapping[folder]
        for file in tqdm(sorted(os.listdir(folderPath)), desc = f'Processing folder {folder}'):
            filePath = os.path.join(folderPath, file)
            features = get_melspectrogram(filePath)
            # Guard kept so an extractor that returns None for a file is skipped
            if features is not None:
                X.append(features)
                y.append(to_categorical(label, num_classes = len(label_mapping)))
    return X, y
    
# Extract features and labels for the training split, then the validation split
(X_train, y_train), (X_val, y_val) = [
    load_data('data_train_val_test/', split_name)
    for split_name in ('train', 'val')
]

In [None]:
# ImageNet-pretrained ResNet50 used as a feature extractor: the top
# (classification) layer is left out and the input is a 3-channel
# 140 x 1293 image.
resnet50 = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(140, 1293, 3),
)
# Mark every backbone layer as non-trainable so its weights stay fixed
for backbone_layer in resnet50.layers:
    backbone_layer.trainable = False

In [None]:
# Spectrogram geometry: HEIGHT mel bands x WIDTH time frames
WIDTH = 1293
HEIGHT = 140

# Define the model: single-channel spectrogram in, 10 genre probabilities out
# NOTE(review): batch_size=32 fixes the batch dimension of the input tensor;
# confirm this is wanted, since predict/evaluate are called without a batch size.
input_layer = Input(shape=(HEIGHT,WIDTH,1),batch_size=32)
# Print the input tensor itself; `x` does not exist yet at this point
print('input shape',input_layer.shape)
# 1x1 convolution that expands the single spectrogram channel to the
# 3 channels the ResNet50 backbone was built for
x = layers.Conv2D(3,(1, 1), activation="relu")(input_layer)
print("2d to 3d", x.shape)
# Frozen ResNet50 backbone used as feature extractor
x = resnet50(x)
print("after resnet", x.shape)

# Classification head on top of the backbone features
x = layers.Flatten()(x)
print("flatten dimensions",x.shape)
x = layers.Dense(256, activation='relu')(x)
print("MLP DEnse",x.shape)

# One softmax output unit per genre (10 classes)
predictions = layers.Dense(10, activation='softmax')(x)
print('predictions',predictions.shape)
# Create the model
model = Model(inputs=input_layer, outputs=predictions)

# Compile the model; categorical cross-entropy matches the one-hot labels
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3),
              loss='categorical_crossentropy',
              metrics=['accuracy'])



In [None]:

# Train for at most 10 epochs; the early-stopping callback ends training
# after 2 epochs without improvement of its monitored metric.
early_stopping = EarlyStopping(patience=2)
model.fit(
    x=np.array(X_train),
    y=np.array(y_train),
    validation_data=(np.array(X_val), np.array(y_val)),
    epochs=10,
    batch_size=32,
    callbacks=[early_stopping],
)

# Evaluate the model

In [None]:
# Shared lookup tables between genre names and their integer codes

# Genre names in alphabetical order; the position of a name is its code
genre_names = [
    "blues", "classical", "country", "disco", "hiphop",
    "jazz", "metal", "pop", "reggae", "rock",
]
genre_codes = list(range(len(genre_names)))

# Import these into your working script to make sure that we all have the same codes
id2label = dict(zip(genre_codes, genre_names))
label2id = dict(zip(genre_names, genre_codes))
# Genre names in code order, e.g. for axis labels
genres = list(label2id)

In [None]:
# Extract features and one-hot labels for the held-out test split
X_test, y_test = load_data(path_to_root='data_train_val_test/', split='test')

In [None]:
# Model inference on the test split.
# Convert the test set to arrays once and reuse them for both calls instead
# of copying all spectrograms a second time.
X_test_arr = np.array(X_test)
y_test_arr = np.array(y_test)
# Per-class probabilities for every test clip
y_pred = model.predict(X_test_arr)
# Loss and accuracy (the compiled metric) on the test split
model_eval = model.evaluate(X_test_arr, y_test_arr)

In [None]:
## Inference -> predict -> confusion matrix

# Collapse probabilities / one-hot vectors to class indices. The ndim guard
# makes the cell safe to re-run after y_pred / y_test were already collapsed.
if np.ndim(y_pred) > 1:
    y_pred = np.argmax(y_pred, axis = 1)
if np.ndim(y_test) > 1:
    y_test = np.argmax(y_test, axis=1)

# labels=genre_codes pins the matrix to one row/column per genre code, in the
# same order as `genres`, even if a class is absent from the test data.
# normalize='pred' scales every column (predicted class) to sum to 1.
# NOTE(review): the axis names are only right if the labels were encoded with
# these same codes -- verify against label_mapping in load_data.
grid_cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=genre_codes, normalize='pred'),
                       index=genres,
                       columns=genres)


plt.figure(figsize=(8,6))
plt.title("Confusion matrix")
sns.heatmap(grid_cm, annot=True, cmap="viridis")
# Rows are the true classes, columns the predicted classes
plt.ylabel('True')
plt.xlabel('Predicted')
plt.show()