In [91]:
import pandas as pd
import utility as utils
import importlib
import numpy as np
import librosa
from tqdm import tqdm
import numpy as np
import mir_eval

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.layers import Input

importlib.reload(utils)

train_dataset = '/Users/borosabel/Doc/Uni/4/ASP_2024/data/train'
test_dataset = '/Users/borosabel/Doc/Uni/4/ASP_2024/data/test'

In [82]:
def evaluate_loop(submission, target):
    f, _, _ = mir_eval.onset.f_measure(
        np.array(target),
        np.array(submission),
        window=0.05  # 50 [ms]
    )
    return f

In [2]:
df = utils.get_audio_and_onsets_in_dataframe(train_dataset)

In [143]:
def preprocess_audio(audio_path, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000):
    y, _ = librosa.load(audio_path, sr=sr)  # Load audio file with the specified sampling rate
    hop_length = int(sr * 0.01)  # 10 ms hop time
    melspecs = []
    
    for window_size in [int(sr * 0.023), int(sr * 0.046), int(sr * 0.093)]:  # Window sizes: 23 ms, 46 ms, 93 ms
        melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=window_size,
                                                 hop_length=hop_length, n_mels=n_mels,
                                                 fmin=fmin, fmax=fmax)
        melspec = librosa.power_to_db(melspec, ref=np.max)  # Convert power spectrogram to decibel (dB) units
        # Normalization - assume you have precomputed the mean and std for your dataset
        melspec = (melspec - melspec.mean()) / melspec.std()  # Normalize using local mean and std
        melspecs.append(melspec)

    return melspecs

def onsets_to_frames(onset_times, sr=utils.SAMPLING_RATE):
    # Calculate frame duration in seconds
    hop_length = int(sr * 0.01)
    frame_duration = hop_length / sr
    # Calculate frame indices for each onset time
    frame_indices = [int(time / frame_duration) for time in onset_times]

    # Assume maximum frame index to create the binary array
    if frame_indices:
        max_index = max(frame_indices)
        onsets_binary = np.zeros(max_index + 1, dtype=int)  # +1 because indexing starts at 0
        onsets_binary[frame_indices] = 1
    else:
        onsets_binary = np.array([])  # No onsets given
    return onsets_binary

def frames_to_onset(onset_binary, sr=utils.SAMPLING_RATE):
    hop_length = int(sr * 0.01)  # Calculate hop length from sampling rate
    frame_duration = hop_length / sr  # Calculate the duration of each frame in seconds

    # Find indices where there is an onset
    onset_indices = np.where(onset_binary == 1)[0]

    # Convert frame indices to times
    onset_times = onset_indices * frame_duration

    return onset_times.tolist()  # Convert to list for convenience

def prepare_data(audio_path, onset_times, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000):
    # Generate Mel spectrograms
    melspecs = preprocess_audio(audio_path, sr, n_mels, fmin, fmax)

    # Convert onset times to frame indices
    hop_length = int(sr * 0.01)
    frame_indices = [int(time * sr / hop_length) for time in onset_times]
    max_index = max(frame_indices, default=0)

    # Prepare labels for each frame in the spectrogram
    labels = np.zeros((max_index + 1,), dtype=int)
    for index in frame_indices:
        labels[index] = 1

    return melspecs, labels


def process_data(audio_paths, onset_times_list, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000):
    all_features = []
    all_labels = []

    # Iterate through the lists with a tqdm progress bar
    for audio_path, onset_times in tqdm(zip(audio_paths, onset_times_list), total=len(audio_paths), desc="Processing audio files"):
        # Prepare data from this file
        features, labels = prepare_data(audio_path, onset_times, sr, n_mels, fmin, fmax)

        # Aggregate the data
        all_features.append(features)
        all_labels.append(labels)

    # Concatenate all data into arrays
    all_features = np.concatenate(all_features, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)

    return all_features, all_labels

In [108]:
def create_specific_cnn(input_shape):
    model = Sequential([
        Input(shape=input_shape),  # Use an Input layer to specify the input shape
        Conv2D(10, kernel_size=(7, 3), activation='tanh', padding='same'),
        MaxPooling2D(pool_size=(1, 3), strides=(1, 3), padding='valid'),
        Conv2D(20, kernel_size=(3, 3), activation='tanh', padding='same'),
        MaxPooling2D(pool_size=(1, 3), strides=(1, 3), padding='valid'),
        Flatten(),
        Dense(256, activation='sigmoid'),
        Dense(1, activation='sigmoid')
    ])

    optimizer = SGD(learning_rate=0.05, momentum=0.45)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

class MomentumScheduler(Callback):
    def __init__(self, initial_momentum, ramp_up_epochs, max_momentum):
        super().__init__()
        self.initial_momentum = initial_momentum
        self.ramp_up_epochs = ramp_up_epochs
        self.max_momentum = max_momentum
        self.steps_per_epoch = 0

    def on_train_begin(self, logs=None):
        self.steps_per_epoch = self.params['steps']  # Number of steps per epoch

    def on_epoch_end(self, epoch, logs=None):
        if epoch < self.ramp_up_epochs:
            new_momentum = self.initial_momentum + (self.max_momentum - self.initial_momentum) * (epoch / self.ramp_up_epochs)
            tf.keras.backend.set_value(self.model.optimizer.momentum, new_momentum)
            print(f'Epoch {epoch+1}: Momentum updated to {new_momentum}')
        else:
            tf.keras.backend.set_value(self.model.optimizer.momentum, self.max_momentum)

# Using the MomentumScheduler
momentum_scheduler = MomentumScheduler(initial_momentum=0.45, ramp_up_epochs=10, max_momentum=0.9)

# Input shape is 15 frames, 80 Mel bands, 3 channels
input_shape = (15, 80, 3)
model = create_specific_cnn(input_shape)
model.summary()

In [141]:
features = df['File Path']
labels = df['Onsets']

X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [144]:
features_train, labels_train = process_data(X_train, y_train)
features_test, labels_test = process_data(X_test, y_test)
features_val, labels_val = process_data(X_val, y_val)

Processing audio files: 100%|██████████| 148/148 [01:55<00:00,  1.28it/s]


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (3, 80) + inhomogeneous part.

In [None]:
history = model.fit(
    features_train,
    labels_train,
    epochs=100,  # Number of epochs to train for
    batch_size=256,  # Batch size for the training
    validation_data=(features_val, labels_val),  # Validation data to evaluate the model
    verbose=1  # Show detailed progress during training
)