In [6]:
import os
import numpy as np
import librosa
import pandas as pd
from natsort import natsorted 
import librosa.display
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

%matplotlib inline
import matplotlib.pyplot as plt

In [7]:
!pip install natsort



In [8]:
# Map each folder name (string) to an integer label
labels_map = {
    "Baroque": 0,
    "Classical": 1,
    "Romantic": 2
}
num_classes = len(labels_map)

In [9]:
import librosa.display

data_dir = "resampled_dataset"  # Adjust to your dataset path

X = []
y = []

for label_str, label_idx in labels_map.items():
    folder_path = os.path.join(data_dir, label_str)
    
    # Iterate over all audio files in the current folder
    for filename in os.listdir(folder_path):
        if filename.endswith(".wav"):
            file_path = os.path.join(folder_path, filename)
            
            # 1. Load audio
            signal, sr = librosa.load(file_path, sr=22050)
            
            # 2. Compute mel-spectrogram
            S = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=128, fmax=sr/2)
            S_dB = librosa.power_to_db(S, ref=np.max)
            
            # 4. Append to X, y
            X.append(S_dB)
            y.append(label_idx)

# Convert X and y to numpy arrays
X = np.array(X)
y = np.array(y)

# If you want a channel dimension for Keras Conv2D, reshape X to (samples, freq, time, 1)
X = X[..., np.newaxis]  # adds a channel dimension

# One-hot encode labels for categorical crossentropy
y = to_categorical(y, num_classes=num_classes)

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (7674, 128, 216, 1)
y shape: (7674, 3)


In [10]:
def build_cnn_model(input_shape, num_classes):
    model = models.Sequential()
    
    model.add(layers.Conv2D(16, (3, 3), activation='relu', input_shape=input_shape))
    model.add(layers.MaxPooling2D((2, 2)))
    
    model.add(layers.Conv2D(32, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    
    model.add(layers.Conv2D(64, (3, 3), activation='relu'))
    model.add(layers.MaxPooling2D((2, 2)))
    
    model.add(layers.Flatten())
    model.add(layers.Dense(128, activation='relu'))
    model.add(layers.Dropout(0.3))
    model.add(layers.Dense(num_classes, activation='softmax'))
    
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [11]:
from sklearn.model_selection import train_test_split

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

print("Train size:", X_train.shape, y_train.shape)
print("Val size:", X_val.shape, y_val.shape)
print("Test size:", X_test.shape, y_test.shape)

Train size: (5371, 128, 216, 1) (5371, 3)
Val size: (1151, 128, 216, 1) (1151, 3)
Test size: (1152, 128, 216, 1) (1152, 3)


In [12]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

In [13]:
# Build the model
input_shape = (128, X_train.shape[2], 1)  # (freq_bins, time_frames, channels)
model = build_cnn_model(input_shape, num_classes)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=16
)

# Evaluate on the test set
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test Accuracy:", test_acc)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 268ms/step - accuracy: 0.3869 - loss: 4.5214 - val_accuracy: 0.3814 - val_loss: 1.0903
Epoch 2/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 322ms/step - accuracy: 0.3681 - loss: 1.0940 - val_accuracy: 0.3814 - val_loss: 1.0906
Epoch 3/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m102s[0m 304ms/step - accuracy: 0.3975 - loss: 1.0858 - val_accuracy: 0.3814 - val_loss: 1.0907
Epoch 4/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 319ms/step - accuracy: 0.3859 - loss: 1.0899 - val_accuracy: 0.3814 - val_loss: 1.0904
Epoch 5/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 271ms/step - accuracy: 0.3696 - loss: 1.0939 - val_accuracy: 0.3814 - val_loss: 1.0905
Epoch 6/20
[1m336/336[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 61ms/step - accuracy: 0.3767 - loss: 1.0935 - val_accuracy: 0.3814 - val_loss: 1.0906
Epoch 7