In [163]:
import pandas as pd
import utility as utils
import importlib
import numpy as np
import librosa
from tqdm import tqdm
import numpy as np
import mir_eval

from sklearn.model_selection import train_test_split

# Re-execute the local `utility` module so edits made to it are picked up
# without restarting the kernel.
importlib.reload(utils)

# Root folders of the onset-detection data, relative to the notebook's working directory.
train_dataset = './data/onset/train'
test_dataset = './data/onset/test'

In [147]:
def evaluate_loop(submission, target):
    """Score predicted onset times against the ground truth.

    Parameters
    ----------
    submission : sequence of float
        Estimated onset times.
    target : sequence of float
        Reference onset times.

    Returns
    -------
    The first element (the F-measure) of ``mir_eval.onset.f_measure``,
    computed with a 0.05 s (50 ms) matching window.
    """
    reference_onsets = np.array(target)
    estimated_onsets = np.array(submission)
    # mir_eval expects the reference first and the estimate second.
    f_score, _precision, _recall = mir_eval.onset.f_measure(
        reference_onsets, estimated_onsets, window=0.05
    )
    return f_score

In [148]:
# Load the training set through the project helper; later cells read the
# 'File Path' and 'Onsets' columns of the returned frame.
df = utils.get_audio_and_onsets_in_dataframe(train_dataset)

In [149]:
# import numpy as np
# import librosa
# 
# def preprocess_audio_to_cnn_input(audio_path, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000):
#     # Loading & defining the stuff
#     hop_length = int(sr * 0.01)
#     context_frames = 7
#     y, sr = librosa.load(audio_path, sr=sr)
# 
#     # Prepare spectrograms
#     melspecs = []
#     for window_size in [int(sr * 0.023), int(sr * 0.046), int(sr * 0.093)]:
#         melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=window_size, hop_length=hop_length,
#                                                  n_mels=n_mels, fmin=fmin, fmax=fmax)
#         melspec = librosa.power_to_db(melspec)
#         melspecs.append(melspec)
# 
#     # Pad in case one of the spectrograms doesn't align with the others.
#     max_length = max(mel.shape[1] for mel in melspecs)
#     melspecs = [np.pad(mel, ((0, 0), (0, max_length - mel.shape[1])), mode='constant') for mel in melspecs]
# 
#     melspecs = np.stack(melspecs, axis=-1)  # Stacking along the new axis to treat them as channels
#     mean = np.mean(melspecs, axis=(0, 2), keepdims=True)
#     std = np.std(melspecs, axis=(0, 2), keepdims=True)
#     melspecs = (melspecs - mean) / std
# 
#     # Combine and prepare the context window data for each time frame
#     num_frames = melspecs.shape[1]  # Number of frames should now be consistent
#     cnn_inputs = []
#     for t in range(context_frames, num_frames - context_frames):
#         # Extract the context window for all channels
#         context_window = melspecs[:, t-context_frames:t+context_frames+1, :]
#         cnn_inputs.append(context_window)
#     cnn_inputs = np.array(cnn_inputs)
#     cnn_inputs = cnn_inputs.reshape(cnn_inputs.shape[0], 15, 80, 3)
# 
#     return cnn_inputs
# 
# def onsets_to_frames(onset_times, sr=utils.SAMPLING_RATE):
#     # Calculate frame duration in seconds
#     hop_length = int(sr * 0.01)
#     frame_duration = hop_length / sr
#     # Calculate frame indices for each onset time
#     frame_indices = [int(time / frame_duration) for time in onset_times]
# 
#     # Assume maximum frame index to create the binary array
#     if frame_indices:
#         max_index = max(frame_indices)
#         onsets_binary = np.zeros(max_index + 1, dtype=int)  # +1 because indexing starts at 0
#         onsets_binary[frame_indices] = 1
#     else:
#         onsets_binary = np.array([])
#     return np.array(onsets_binary)
# 
# def frames_to_onset(onset_binary, sr=utils.SAMPLING_RATE):
#     hop_length = int(sr * 0.01)  # Calculate hop length from sampling rate
#     frame_duration = hop_length / sr  # Calculate the duration of each frame in seconds
# 
#     # Find indices where there is an onset
#     onset_indices = np.where(onset_binary == 1)[0]
# 
#     # Convert frame indices to times
#     onset_times = onset_indices * frame_duration
# 
#     return onset_times.tolist()  # Convert to list for convenience
# 
# def prepare_data(audio_path, onset_times, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000):
#     # Generate Mel spectrograms
#     melspecs = preprocess_audio(audio_path, sr, n_mels, fmin, fmax)
# 
#     # Convert onset times to frame indices
#     hop_length = int(sr * 0.01)
#     frame_indices = [int(time * sr / hop_length) for time in onset_times]
#     max_index = max(frame_indices, default=0)
# 
#     # Prepare labels for each frame in the spectrogram
#     labels = np.zeros((max_index + 1,), dtype=int)
#     for index in frame_indices:
#         labels[index] = 1
# 
#     return melspecs, labels
# 
# def process_data(audio_paths, onset_times_list, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000):
#     all_features = []
#     all_labels = []
# 
#     # Iterate through the lists with a tqdm progress bar
#     for audio_path, onset_times in tqdm(zip(audio_paths, onset_times_list), total=len(audio_paths), desc="Processing audio files"):
#         # Prepare data from this file
#         features, labels = prepare_data(audio_path, onset_times, sr, n_mels, fmin, fmax)
#         print(np.array(features).shape)
#         # Aggregate the data
#         all_features.append(features)
#         all_labels.append(labels)
# 
#     # Concatenate all data into arrays
#     all_features = np.concatenate(all_features, axis=0)
#     all_labels = np.concatenate(all_labels, axis=0)
# 
#     return all_features, all_labels

In [188]:
def frames_to_onset(onset_binary, sr=utils.SAMPLING_RATE):
    """Convert a 0/1 activation array into onset times in seconds.

    Parameters
    ----------
    onset_binary : np.ndarray
        Array in which entries equal to 1 mark an onset; the index along the
        first axis is interpreted as the frame number.
    sr : int
        Sampling rate used to derive the hop length (1% of ``sr`` samples,
        truncated to an integer).

    Returns
    -------
    list of float
        One time stamp (frame index times frame duration) per active entry.
    """
    # Duration of one frame in seconds, based on the integer hop length.
    seconds_per_frame = int(sr * 0.01) / sr
    active_frames = np.where(onset_binary == 1)[0]
    return (active_frames * seconds_per_frame).tolist()

def preprocess_test_data(audio_path, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000, frame_step=None):
    """Turn one audio file into normalised CNN input windows for inference.

    Three dB-scaled mel spectrograms are computed (FFT sizes of 23, 46 and
    93 ms worth of samples, hop of 1% of ``sr`` samples). Context windows of
    15 frames are cut out every ``frame_step`` frames, the three spectrograms
    are stacked as channels, and each channel is standardised over all windows
    of this file.

    Parameters
    ----------
    audio_path : str
        Path of the audio file, loaded with ``librosa.load``.
    sr : int
        Target sampling rate.
    n_mels, fmin, fmax :
        Mel filterbank settings forwarded to ``librosa.feature.melspectrogram``.
    frame_step : int or None
        Distance in frames between consecutive window centres. ``None`` (the
        default) keeps the historical behaviour of stepping by the context
        size (7 frames).

    Returns
    -------
    np.ndarray
        Array of shape ``(num_windows, 15, n_mels, 3)``. Provided no window is
        skipped, window ``i`` is centred on spectrogram frame
        ``7 + i * frame_step``. ``num_windows`` is 0 when the audio is too
        short to hold a single full context window.
    """
    hop_length = int(sr * 0.01)
    context_frames = 7  # ±70 ms implies 7 frames on each side
    window_frames = 2 * context_frames + 1
    if frame_step is None:
        frame_step = context_frames
    y, sr = librosa.load(audio_path, sr=sr)

    # Prepare spectrograms
    melspecs = []
    for window_size in [int(sr * 0.023), int(sr * 0.046), int(sr * 0.093)]:
        melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=window_size,
                                                 hop_length=hop_length, n_mels=n_mels,
                                                 fmin=fmin, fmax=fmax)
        melspecs.append(librosa.power_to_db(melspec))

    # Collect context windows, one every `frame_step` frames
    cnn_inputs = []
    num_frames = melspecs[0].shape[1]
    for idx in range(context_frames, num_frames - context_frames, frame_step):
        context_windows = [melspec[:, idx-context_frames:idx+context_frames+1] for melspec in melspecs]
        if not all(window.shape == context_windows[0].shape for window in context_windows):
            continue
        cnn_inputs.append(np.stack(context_windows, axis=-1))

    # Too-short audio yields no window at all; return a correctly shaped empty
    # batch instead of failing in the axis-wise statistics below.
    if not cnn_inputs:
        return np.empty((0, window_frames, n_mels, 3), dtype=melspecs[0].dtype)

    # Normalize the inputs (per channel, over every window of this file)
    cnn_inputs = np.array(cnn_inputs)
    mean = np.mean(cnn_inputs, axis=(0, 1, 2), keepdims=True)
    std = np.std(cnn_inputs, axis=(0, 1, 2), keepdims=True)
    std[std == 0] = 1  # constant channel (e.g. digital silence): avoid 0/0 -> NaN
    cnn_inputs_normalized = (cnn_inputs - mean) / std

    # NOTE(review): the windows are stacked as (n_mels, frames, channels), so
    # this reshape reinterprets the first two axes rather than transposing
    # them. The training preprocessing uses the identical reshape, so it is
    # kept here to stay consistent with the trained model; switching both to
    # np.transpose would require retraining.
    return cnn_inputs_normalized.reshape(cnn_inputs_normalized.shape[0], window_frames, n_mels, 3)

def process_test_data(audio_paths, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000):
    """Preprocess several audio files and pool their CNN input windows.

    Each path is run through ``preprocess_test_data`` with the given
    spectrogram settings; the per-file windows are appended in input order.

    Returns
    -------
    np.ndarray
        All windows of all files, stacked along the first axis.
    """
    mel_options = dict(sr=sr, n_mels=n_mels, fmin=fmin, fmax=fmax)
    pooled_windows = [
        window
        for clip_path in audio_paths
        for window in preprocess_test_data(clip_path, **mel_options)
    ]
    return np.array(pooled_windows)

In [154]:
def preprocess_onset_contexts_and_labels_balanced(audio_path, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000, onset_times=None):
    """Build a class-balanced set of CNN training windows for one audio file.

    Three dB-scaled mel spectrograms are computed (FFT sizes of 23, 46 and
    93 ms worth of samples, hop of 1% of ``sr`` samples). A 15-frame context
    window is cut around every annotated onset that lies far enough from the
    clip edges (label 1), and the same number of windows is drawn at random
    from frames more than 7 frames away from every annotated onset (label 0).
    If fewer such frames exist, all of them are used.

    Sampling and shuffling use the global ``np.random`` state, so results are
    only repeatable if the caller seeds it.

    Parameters
    ----------
    audio_path : str
        Path of the audio file, loaded with ``librosa.load``.
    sr : int
        Target sampling rate.
    n_mels, fmin, fmax :
        Mel filterbank settings forwarded to ``librosa.feature.melspectrogram``.
    onset_times : sequence of float or None
        Annotated onset times in seconds. ``None`` means no onsets.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Features of shape ``(num_windows, 15, n_mels, 3)``, standardised per
        channel over this file's windows, and the matching 0/1 labels of shape
        ``(num_windows,)``. Both are empty when no window could be extracted.
    """
    hop_length = int(sr * 0.01)
    context_frames = 7  # ±70 ms implies 7 frames on each side
    window_frames = 2 * context_frames + 1
    if onset_times is None:
        onset_times = []
    y, sr = librosa.load(audio_path, sr=sr)

    # Prepare spectrograms
    melspecs = []
    for window_size in [int(sr * 0.023), int(sr * 0.046), int(sr * 0.093)]:
        melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=window_size,
                                                 hop_length=hop_length, n_mels=n_mels,
                                                 fmin=fmin, fmax=fmax)
        melspecs.append(librosa.power_to_db(melspec))
    num_frames = melspecs[0].shape[1]

    def _context_window(idx):
        """Return the stacked (n_mels, 15, 3) window centred on idx, or None on a shape mismatch."""
        pieces = [melspec[:, idx-context_frames:idx+context_frames+1] for melspec in melspecs]
        if not all(piece.shape == pieces[0].shape for piece in pieces):
            return None
        return np.stack(pieces, axis=-1)

    # Calculate frame indices for onsets and filter for valid ones
    frame_indices = [int(time / (hop_length / sr)) for time in onset_times]
    valid_frames = [idx for idx in frame_indices
                    if context_frames <= idx < num_frames - context_frames]

    cnn_inputs = []
    labels = []
    for idx in valid_frames:
        window = _context_window(idx)
        if window is not None:
            cnn_inputs.append(window)
            labels.append(1)

    # Mark every frame within ±context_frames of ANY annotated onset as
    # unusable for negatives. Onsets too close to the clip edge to serve as
    # positives are included too; otherwise frames right next to them could be
    # sampled with label 0.
    near_onset = np.zeros(num_frames, dtype=bool)
    for idx in frame_indices:
        lo = max(idx - context_frames, 0)
        hi = min(idx + context_frames + 1, num_frames)
        if lo < hi:
            near_onset[lo:hi] = True
    candidates = np.arange(context_frames, num_frames - context_frames)
    non_onset_frames = candidates[~near_onset[candidates]]

    # Select an equal number of non-onset frames (or all, if there are too few)
    if len(non_onset_frames) >= len(valid_frames):
        non_onset_samples = np.random.choice(non_onset_frames, len(valid_frames), replace=False)
    else:
        non_onset_samples = non_onset_frames

    for idx in non_onset_samples:
        window = _context_window(idx)
        if window is not None:
            cnn_inputs.append(window)
            labels.append(0)

    # No usable onset (or clip too short): return correctly shaped empty
    # arrays instead of failing in the axis-wise statistics below.
    if not cnn_inputs:
        return (np.empty((0, window_frames, n_mels, 3), dtype=melspecs[0].dtype),
                np.empty((0,), dtype=int))

    # Shuffle and normalize
    permutation = np.random.permutation(len(labels))
    cnn_inputs = np.array(cnn_inputs)[permutation]
    labels = np.array(labels)[permutation]
    mean = np.mean(cnn_inputs, axis=(0, 1, 2), keepdims=True)
    std = np.std(cnn_inputs, axis=(0, 1, 2), keepdims=True)
    std[std == 0] = 1  # constant channel (e.g. digital silence): avoid 0/0 -> NaN
    cnn_inputs = (cnn_inputs - mean) / std

    # NOTE(review): the windows are stacked as (n_mels, frames, channels), so
    # this reshape reinterprets the first two axes rather than transposing
    # them. preprocess_test_data uses the identical reshape, so it is kept
    # here to keep training and inference inputs consistent.
    return cnn_inputs.reshape(cnn_inputs.shape[0], window_frames, n_mels, 3), labels

def process_data(audio_paths, onset_times_list, sr=utils.SAMPLING_RATE, n_mels=80, fmin=27.5, fmax=16000):
    """Build the pooled, class-balanced training arrays for a set of files.

    ``audio_paths`` and ``onset_times_list`` are walked in parallel; every
    pair is handed to ``preprocess_onset_contexts_and_labels_balanced`` and
    the resulting windows and labels are appended in input order.

    Returns
    -------
    (np.ndarray, np.ndarray)
        Features and labels of all files, stacked along the first axis.
    """
    feature_pool, label_pool = [], []
    for clip_path, clip_onsets in zip(audio_paths, onset_times_list):
        windows, targets = preprocess_onset_contexts_and_labels_balanced(
            clip_path,
            sr=sr,
            n_mels=n_mels,
            fmin=fmin,
            fmax=fmax,
            onset_times=clip_onsets,
        )
        feature_pool += list(windows)
        label_pool += list(targets)

    return np.array(feature_pool), np.array(label_pool)

In [151]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, InputLayer
from tensorflow.keras.optimizers import SGD

# Small CNN over (15, 80, 3) inputs: two tanh convolution stages, each followed
# by pooling along the second axis only, then a 256-unit sigmoid layer and a
# single sigmoid output used as the onset probability.
model = Sequential()
model.add(InputLayer(shape=(15, 80, 3)))
model.add(Conv2D(10, kernel_size=(7, 3), activation='tanh'))
model.add(MaxPooling2D(pool_size=(1, 3), strides=(1, 3)))
model.add(Conv2D(20, kernel_size=(3, 3), activation='tanh'))
model.add(MaxPooling2D(pool_size=(1, 3), strides=(1, 3)))
model.add(Flatten())
model.add(Dense(256, activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

model.summary()

# Plain SGD with momentum; binary cross-entropy matches the single sigmoid output.
optimizer = SGD(learning_rate=0.05, momentum=0.45)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [155]:
# Split at file level: each row is one audio file path with its onset annotations.
features = df['File Path']
labels = df['Onsets']

# Hold out 33% of the files for testing, then 20% of the remainder for
# validation; the fixed random_state makes both splits repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.33, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [156]:
# Turn each split's files into balanced (window, label) arrays.
# process_data samples negatives with unseeded np.random, so the arrays differ between runs.
features_train, labels_train = process_data(X_train, y_train)
features_test, labels_test = process_data(X_test, y_test)
features_val, labels_val = process_data(X_val, y_val)

In [157]:
# Train on the balanced windows and monitor the validation split each epoch.
# NOTE(review): in the recorded (truncated) log below, val_loss stops improving
# after epoch 4 while training loss keeps falling; consider an early-stopping callback.
model.fit(features_train, labels_train,
          validation_data=(features_val, labels_val),
          epochs=100, batch_size=256)

Epoch 1/100
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 46ms/step - accuracy: 0.6852 - loss: 0.5703 - val_accuracy: 0.8967 - val_loss: 0.2881
Epoch 2/100
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 69ms/step - accuracy: 0.8475 - loss: 0.3499 - val_accuracy: 0.8927 - val_loss: 0.2671
Epoch 3/100
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 66ms/step - accuracy: 0.8638 - loss: 0.3203 - val_accuracy: 0.8922 - val_loss: 0.2769
Epoch 4/100
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 75ms/step - accuracy: 0.8736 - loss: 0.2950 - val_accuracy: 0.8857 - val_loss: 0.2628
Epoch 5/100
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 124ms/step - accuracy: 0.8853 - loss: 0.2744 - val_accuracy: 0.8896 - val_loss: 0.2645
Epoch 6/100
[1m107/107[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 124ms/step - accuracy: 0.8928 - loss: 0.2597 - val_accuracy: 0.8770 - val_loss: 0.2897
Epoch 7/100


<keras.src.callbacks.history.History at 0x297b1ee20>

In [158]:
# Window-level accuracy on the held-out, class-balanced windows. This is not
# the onset F-measure: evaluate_loop is not called anywhere in the visible cells.
test_loss, test_accuracy = model.evaluate(features_test, labels_test)
print("Test Accuracy:", test_accuracy)

[1m432/432[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9212 - loss: 0.2474
Test Accuracy: 0.9126656651496887


In [185]:
# Keep only the first returned value (iterated below as audio file paths);
# the three other return values are discarded.
test, _, _, _= utils.load_dataset_paths(test_dataset, is_train_dataset=False)

In [187]:
# Maps a test file's name (path stem without '.wav') to {'onsets': [times in seconds]};
# filled by the prediction loop below.
onsets = {}

In [190]:
# preprocess_test_data() emits one context window every 7 frames, the first one
# centred on frame 7, so prediction i describes spectrogram frame 7 + 7 * i and
# NOT frame i. These two values must mirror the ones used inside that function.
context_frames = 7
frame_step = 7

for path in test:
    file_name = path.split('/')[-1].replace('.wav', '')
    t_preprocessed = preprocess_test_data(path)
    predictions = model.predict(t_preprocessed)
    onset_predictions = (predictions > 0.5).astype(int)

    # Scatter the window-level decisions back onto the 10 ms frame grid before
    # converting to seconds; otherwise every onset time is compressed by the
    # window step and shifted by the initial offset.
    window_hits = np.where(onset_predictions == 1)[0]
    frame_activations = np.zeros(context_frames + frame_step * len(onset_predictions), dtype=int)
    frame_activations[context_frames + frame_step * window_hits] = 1

    onsets_in_sec = frames_to_onset(frame_activations)
    onsets[file_name] = {'onsets': list(onsets_in_sec)}

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step 
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step 
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m

In [191]:
import json

# Persist the predictions as {file_name: {"onsets": [seconds, ...]}}.
# An existing file with this name is overwritten.
json_filename = 'onsets_data_3.json'
with open(json_filename, 'w') as f:
    json.dump(onsets, f, indent=4)

(10844, 15, 80, 3)

In [176]:
# `test` holds audio file paths (it is iterated as paths in the prediction
# loop), so it must be converted to CNN input windows before prediction.
# Passing it to the model directly only worked with a stale `test` array left
# over from an earlier kernel state.
predictions = model.predict(process_test_data(test))

[1m339/339[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [179]:
# Threshold the sigmoid outputs at 0.5 to get a 0/1 decision per input window.
onset_predictions = (predictions > 0.5).astype(int)

In [180]:
# Display the thresholded decisions (one row per input window).
onset_predictions

array([[1],
       [0],
       [0],
       ...,
       [0],
       [1],
       [0]])