In [1]:
import os
import numpy as np
import librosa
import pandas as pd
from natsort import natsorted 
import librosa.display
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

%matplotlib inline
import matplotlib.pyplot as plt

In [1]:
#all composers with more than 10 recordings in the dataset(from excel analaysis)
composer_list=['Beethoven', 'Bach', 'Schubert', 'Handel', 'Brahms', 'Schumann', 'Mozart', 'Dvorak', 'Vivaldi']

In [2]:
import os
import numpy as np
import librosa
import config1  # This module should define your configuration parameters
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.utils import to_categorical

def get_subdirectories(a_dir):
    """
    Returns a list of subfolder names inside the given directory.
    Each subfolder typically represents a class (e.g., 'Baroque', 'Classical', 'Romantic').
    """
    return [name for name in os.listdir(a_dir) if os.path.isdir(os.path.join(a_dir, name))]

def get_audios_path(dataset_dir, folder_name):
    """
    Returns a list of audio file paths from a given subfolder.
    
    Parameters:
    - dataset_dir: The base dataset directory.
    - folder_name: The name of the subfolder.
    
    Returns:
    - List of full paths to audio files (e.g., wav or mp3) within that folder.
    """
    folder_path = os.path.join(dataset_dir, folder_name)
    audio_paths = librosa.util.find_files(folder_path, ext=['wav', 'mp3'])
    return audio_paths

def extract_features(audio_path, samp_rate, frame_size, hop_size, n_mels=128, fixed_frames=646):
    """
    Loads an audio file and computes its log-mel spectrogram, then pads or crops
    it to ensure the time dimension is fixed.
    
    Parameters:
    - audio_path: Full path to the audio file.
    - samp_rate: Sampling rate to use when loading audio.
    - frame_size: FFT window size.
    - hop_size: Hop (stride) size for the FFT.
    - n_mels: Number of mel bins (default 128).
    - fixed_frames: The desired fixed number of time frames.
    
    Returns:
    - log_S: A NumPy array (shape: (n_mels, fixed_frames)) containing the log-mel spectrogram.
    """
    try:
        # Load the audio file (entire file; you could specify duration if needed)
        y, sr = librosa.load(audio_path, sr=samp_rate)
    except Exception as e:
        print(f"Error loading {audio_path}: {e}")
        return None

    # Compute the mel-spectrogram
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=frame_size, hop_length=hop_size, n_mels=n_mels)
    # Convert to log scale (dB)
    log_S = librosa.power_to_db(S, ref=np.max)
    
    # Ensure a fixed time dimension by padding or cropping
    if log_S.shape[1] < fixed_frames:
        pad_width = fixed_frames - log_S.shape[1]
        log_S = np.pad(log_S, ((0, 0), (0, pad_width)), mode='constant', constant_values=log_S.min())
    else:
        log_S = log_S[:, :fixed_frames]
    
    return log_S

def main():
    # Retrieve configuration parameters from config1.CreateDataset
    samp_rate = config1.CreateDataset.SAMPLING_RATE
    frame_size = config1.CreateDataset.FRAME_SIZE
    hop_size = config1.CreateDataset.HOP_SIZE
    # You can define FIXED_FRAMES in your config or set it here
    fixed_frames = getattr(config1.CreateDataset, 'FIXED_FRAMES', 646)
    dataset_dir = 'resampled_dataset'

    print("Dataset directory:", dataset_dir)
    
    # Get a list of subfolders (each corresponding to a class like 'Baroque', etc.)
    sub_folders = get_subdirectories(dataset_dir)
    print("Found subfolders:", sub_folders)

    # Lists to hold features and labels
    features_list = []
    labels_list = []

    print("Extracting features from audio files...")
    # Process each subfolder (class) in the dataset
    for sub_folder in sub_folders:
        print(f"Processing folder: {sub_folder}")
        # Get all audio file paths from this subfolder
        audio_paths = get_audios_path(dataset_dir, sub_folder)
        for audio_path in audio_paths:
            feat = extract_features(audio_path, samp_rate, frame_size, hop_size, n_mels=128, fixed_frames=fixed_frames)
            if feat is not None:
                features_list.append(feat)
                labels_list.append(sub_folder)
    
    # Convert lists to numpy arrays; now each feature will have the same shape (128, fixed_frames)
    features_array = np.array(features_list)
    labels_array = np.array(labels_list)
    
    # Expand dims to add a channel dimension for the CNN (expected shape: (n_samples, freq_bins, fixed_frames, 1))
    X = np.expand_dims(features_array, axis=-1)
    y = labels_array  # Still as strings for now

    print("Extracted features from", len(features_list), "audio files.")
    print("Features array shape:", X.shape)
    print("Labels array shape:", y.shape)
    
    # Split the dataset (80% train, 20% test)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    # Further split training set into training and validation (e.g., 10% of training for validation)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
    )
    
    print("Training set shape:", X_train.shape)
    print("Validation set shape:", X_val.shape)
    print("Test set shape:", X_test.shape)
    
    # Determine number of classes and map string labels to integer indices
    class_names = np.unique(y)
    num_classes = len(class_names)
    class_to_int = {name: i for i, name in enumerate(class_names)}
    
    y_train_int = np.array([class_to_int[label] for label in y_train])
    y_val_int = np.array([class_to_int[label] for label in y_val])
    y_test_int = np.array([class_to_int[label] for label in y_test])
    
    # One-hot encode the labels
    y_train_cat = to_categorical(y_train_int, num_classes=num_classes)
    y_val_cat = to_categorical(y_val_int, num_classes=num_classes)
    y_test_cat = to_categorical(y_test_int, num_classes=num_classes)
    
    def build_cnn_model(input_shape, num_classes):
        model = models.Sequential()
    
        model.add(layers.Conv2D(16, (3, 3), activation='relu', input_shape=input_shape))
        model.add(layers.MaxPooling2D((2, 2)))
    
        model.add(layers.Conv2D(32, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
    
        model.add(layers.Conv2D(64, (3, 3), activation='relu'))
        model.add(layers.MaxPooling2D((2, 2)))
    
        model.add(layers.Flatten())
        model.add(layers.Dense(128, activation='relu'))
        model.add(layers.Dropout(0.3))
        model.add(layers.Dense(num_classes, activation='softmax'))
    
        model.compile(optimizer='adam',
                      loss='categorical_crossentropy',
                      metrics=['accuracy'])
        return model
    
    # Define input shape based on your features.
    # Here, we have 128 mel bins, fixed_frames time steps, and 1 channel.
    input_shape = (128, fixed_frames, 1)
    model = build_cnn_model(input_shape, num_classes)
    
    # Train the model
    history = model.fit(
        X_train, y_train_cat,
        validation_data=(X_val, y_val_cat),
        epochs=20,
        batch_size=16
    )
    
    # Evaluate on the test set
    test_loss, test_acc = model.evaluate(X_test, y_test_cat)
    print("Test Accuracy:", test_acc)
    
if __name__ == '__main__':
    main()


Dataset directory: resampled_dataset
Found subfolders: ['Classical', 'Baroque', 'Modern', 'Romantic']
Extracting features from audio files...
Processing folder: Classical
Processing folder: Baroque
Processing folder: Modern
Processing folder: Romantic
Extracted features from 9530 audio files.
Features array shape: (9530, 128, 646, 1)
Labels array shape: (9530,)
Training set shape: (6861, 128, 646, 1)
Validation set shape: (763, 128, 646, 1)
Test set shape: (1906, 128, 646, 1)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 176ms/step - accuracy: 0.3292 - loss: 21.0691 - val_accuracy: 0.3866 - val_loss: 1.3139
Epoch 2/20
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 168ms/step - accuracy: 0.3455 - loss: 1.3538 - val_accuracy: 0.3984 - val_loss: 1.2908
Epoch 3/20
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 168ms/step - accuracy: 0.3833 - loss: 1.3120 - val_accuracy: 0.4128 - val_loss: 1.2586
Epoch 4/20
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 180ms/step - accuracy: 0.4137 - loss: 1.2748 - val_accuracy: 0.4102 - val_loss: 1.2875
Epoch 5/20
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 173ms/step - accuracy: 0.4218 - loss: 1.2477 - val_accuracy: 0.4194 - val_loss: 1.3010
Epoch 6/20
[1m429/429[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 177ms/step - accuracy: 0.4492 - loss: 1.2199 - val_accuracy: 0.4522 - val_loss: 1.2207
Epoch 7/2