In [2]:
import utils as utils
import importlib
import numpy as np
import mir_eval
import data
import pickle

import torch as th
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

from scipy.signal import find_peaks
from sklearn.model_selection import train_test_split

# Re-execute the already-imported local `utils` and `data` modules so that
# edits made to them are picked up without restarting the kernel.
importlib.reload(utils)
importlib.reload(data)

# Locations of the training and test data, relative to the working directory.
train_dataset_path = './data/onset/train'
test_dataset_path = './data/onset/test'

In [3]:
# Model described in the paper, plus dropout
class OnsetDetectionCNN(nn.Module):
    """CNN that maps a 3-channel spectrogram excerpt to one onset probability.

    The network is two conv + max-pool stages followed by two fully connected
    layers; the final sigmoid keeps the output in (0, 1).

    ``fc1`` is sized for ``20 * 7 * 8`` features per sample, so the excerpt
    size is fixed by the layer arithmetic, e.g. an input of shape
    ``(batch, 3, 80, 15)`` yields a ``(batch, 1)`` output.
    """

    # Number of features per sample after the second pooling stage.
    _FLAT_FEATURES = 20 * 7 * 8

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=(3, 7))
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 1))
        self.conv2 = nn.Conv2d(10, 20, kernel_size=(3, 3))
        self.pool2 = nn.MaxPool2d(kernel_size=(3, 1))
        self.fc1 = nn.Linear(self._FLAT_FEATURES, 256)
        self.fc2 = nn.Linear(256, 1)
        self.dropout = nn.Dropout(0.5)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the onset probability for each excerpt in ``x``.

        Args:
            x: tensor of shape ``(batch, 3, H, W)``; a single unbatched
               ``(3, H, W)`` excerpt is treated as a batch of one.

        Returns:
            Tensor of shape ``(batch, 1)`` with values in (0, 1).

        Raises:
            RuntimeError: if ``H``/``W`` do not produce ``20 * 7 * 8`` features
                per sample (raised by ``fc1``).
        """
        if x.dim() == 3:
            x = x.unsqueeze(0)

        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        # Flatten per sample. `view(-1, N)` would instead fold a mis-sized
        # input into a different number of rows and silently return the wrong
        # number of predictions; keeping the batch axis makes fc1 fail loudly.
        x = x.flatten(start_dim=1)
        x = self.dropout(F.relu(self.fc1(x)))  # dropout is only active in train mode
        return self.sigmoid(self.fc2(x))

# Initialize the model
model = OnsetDetectionCNN()

In [6]:
class AudioOnsetDataset(th.utils.data.Dataset):
    """Normalised, class-balanced dataset of framed spectrogram excerpts.

    Each recording is cut into excerpts by ``utils.make_frames``; the excerpts
    are standardised with statistics computed over the whole set and the
    0-labelled excerpts are randomly subsampled relative to the 1-labelled ones.

    Attributes:
        X: tensor holding the kept, normalised excerpts (stacked on dim 0).
        y: tensor with the matching 0/1 labels.
        mean, std: normalisation statistics, reduced over dims 0 and 2 of the
            concatenated excerpts and unsqueezed to a trailing singleton axis
            (presumably one value per frequency bin - confirm against
            ``utils.make_frames``). Reuse them to normalise evaluation data.
    """

    def __init__(self, spectograms, sample_rates, targets, sample_onsets, zero_sample_ratio=1.0):
        """Frame, normalise and balance the given recordings.

        Args:
            spectograms, sample_rates, targets, sample_onsets: parallel
                sequences, one entry per recording, forwarded to
                ``utils.make_frames``.
            zero_sample_ratio: number of 0-labelled excerpts to keep per
                1-labelled excerpt (1.0 gives a balanced set).
        """
        self.X = []
        self.y = []

        for X, sample_rate, y, onsets in zip(spectograms, sample_rates, targets, sample_onsets):
            # At this point the excerpts are mostly labelled 0 (non-onset) with
            # only a small share labelled 1 (onset).
            X_frames, y_frames = utils.make_frames(X, y, onsets, sample_rate)
            self.X += X_frames
            self.y += y_frames

        stacked = th.cat(self.X)
        self.mean = th.mean(stacked, dim=(0, 2)).unsqueeze(1)
        std = th.std(stacked, dim=(0, 2)).unsqueeze(1)
        del stacked  # potentially large; only the statistics are needed
        # A constant row has std == 0 and would turn (x - mean) / std into NaN.
        # Dividing by 1 instead leaves such rows at exactly 0.
        self.std = th.where(std == 0, th.ones_like(std), std)

        self.X = [(x - self.mean) / self.std for x in self.X]

        # Subsample the 0-labelled excerpts to `zero_sample_ratio` per 1-labelled excerpt.
        self._balance_dataset(zero_sample_ratio)

    def _balance_dataset(self, zero_sample_ratio):
        """Keep every positive and a random subset of negatives, then shuffle.

        The number of negatives kept is ``int(num_pos * zero_sample_ratio)``,
        capped at the number of negatives that actually exist.

        Raises:
            ValueError: if ``zero_sample_ratio`` is negative.
        """
        if zero_sample_ratio < 0:
            raise ValueError("zero_sample_ratio must be non-negative.")

        self.X = th.stack(self.X)
        self.y = th.tensor(self.y)

        pos_indices = (self.y == 1).nonzero(as_tuple=True)[0]
        neg_indices = (self.y == 0).nonzero(as_tuple=True)[0]

        # Sampling without replacement cannot return more items than exist;
        # without the cap np.random.choice raises for small/imbalanced inputs.
        num_neg_samples = min(int(len(pos_indices) * zero_sample_ratio), len(neg_indices))
        sampled_neg_indices = np.random.choice(neg_indices.cpu().numpy(), num_neg_samples, replace=False)

        # Combine positive and sampled negative indices
        balanced_indices = th.cat((pos_indices, th.tensor(sampled_neg_indices, dtype=th.long)))

        # Shuffle the indices
        balanced_indices = balanced_indices[th.randperm(len(balanced_indices))]

        # Update X and y with balanced samples
        self.X = self.X[balanced_indices]
        self.y = self.y[balanced_indices]

    def __len__(self):
        """Number of excerpts kept after balancing."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(excerpt, label)`` pair at index ``i``."""
        return self.X[i], self.y[i]

In [32]:
# Load the audio / onset-annotation file paths of the training set; the second
# and fourth values returned by load_dataset_paths are not needed here.
wav_files_paths_train, _, onset_files_paths_train, _ = utils.load_dataset_paths(train_dataset_path, is_train_dataset=True)
# Hold out 20% of the files. The held-out part is named "test" below but serves
# as the validation set for now. random_state is fixed so the split is reproducible.
X_train_paths, X_test_paths, y_train_paths, y_test_paths = train_test_split(wav_files_paths_train, onset_files_paths_train, test_size=0.2, random_state=42)

In [33]:
# Prepare train data: one feature tensor, sample rate, onset list and target
# per training file.
features_train, sample_rates_train = utils.preprocess_audio(X_train_paths)
onsets_train = utils.load_onsets(y_train_paths)
# The target length is taken from the last axis of each feature tensor
# (presumably the number of time frames - confirm against utils.make_target).
labels_train = [utils.make_target(onsets_train[i], features_train[i].shape[-1], sample_rates_train[i]) for i in range(len(onsets_train))]

100%|██████████| 101/101 [00:03<00:00, 27.67it/s]


In [34]:
# Prepare test data (validation data in our case for now), using the same
# three steps as for the training files.
features_test, sample_rates_test = utils.preprocess_audio(X_test_paths)
onsets_test = utils.load_onsets(y_test_paths)
# The target length is taken from the last axis of each feature tensor
# (presumably the number of time frames - confirm against utils.make_target).
labels_test = [utils.make_target(onsets_test[i], features_test[i].shape[-1], sample_rates_test[i]) for i in range(len(onsets_test))]

100%|██████████| 26/26 [00:01<00:00, 22.42it/s]


In [35]:
# Build the training dataset (zero_sample_ratio is left at its default of 1.0)
# and serve it in shuffled mini-batches of 256 excerpts.
dataset = AudioOnsetDataset(features_train, sample_rates_train, labels_train, onsets_train)
dataloader = DataLoader(dataset, batch_size=256, shuffle=True)

# We save the mean and std so we can use them to normalize the validation data and the actual test data.
# The same statistics are written twice, in two formats: a pickle file here ...
mean_std = {'mean': dataset.mean, 'std': dataset.std}
with open('mean_std.pkl', 'wb') as f:
    pickle.dump(mean_std, f)

# ... and a torch checkpoint here.
th.save({'mean': dataset.mean, 'std': dataset.std}, 'mean_std.pth')

In [36]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
# `device` is read as a global by the evaluation function and the training loop.
device = 'cuda' if th.cuda.is_available() else 'cpu'
print(device)

cpu


In [37]:
def _frame_activations(model, feature, window, batch_size):
    """Run ``model`` on every length-``window`` excerpt of ``feature`` (stride 1).

    ``feature`` is indexed as ``feature[:, :, t]``, i.e. time is its third axis.
    Must be called with gradients disabled and the model in eval mode.

    Returns:
        1-D float64 NumPy array with one model output per excerpt, in time
        order; empty when ``feature`` has fewer than ``window`` frames.
    """
    if feature.shape[2] < window:
        return np.zeros(0, dtype=np.float64)

    # unfold appends the window axis last: (C, F, T) -> (C, F, N, window).
    # Permuting gives the (N, C, F, window) batch layout the model is fed.
    excerpts = feature.unfold(2, window, 1).permute(2, 0, 1, 3)
    outputs = [
        model(excerpts[start:start + batch_size].float()).reshape(-1)
        for start in range(0, excerpts.shape[0], batch_size)
    ]
    return th.cat(outputs).cpu().numpy().astype(np.float64)


def manual_evaluate(model, features, onsets, sample_rates, frame_size=15, thresholds=None, mean=None, std=None, batch_size=512):
    """Score ``model`` on whole recordings and pick the best peak threshold.

    For every recording the model is slid over the normalised features, the
    activations are smoothed with a length-10 Hamming window, local maxima are
    picked, and maxima at or above the threshold are reported as onsets. The
    onsets are scored with ``mir_eval.onset.f_measure`` and averaged over
    recordings for each threshold.

    Args:
        model: network mapping ``(N, C, F, window)`` excerpts to one value each.
        features: sequence of per-recording tensors with time on axis 2.
        onsets: sequence of reference onset times, parallel to ``features``.
        sample_rates: currently unused; frame times are derived from
            ``utils.HOP_LENGTH`` and ``utils.SAMPLING_RATE``.
        frame_size: excerpt length; the window actually used is
            ``2 * (frame_size // 2) + 1`` frames.
        thresholds: iterable of thresholds to try. Defaults to
            ``np.arange(0.1, 1.0, 0.05)``. In testing, 0.95 was a good choice.
        mean, std: normalisation statistics of the training set (required).
        batch_size: number of excerpts per forward pass.

    Returns:
        ``(best_threshold, best_f1)``: the threshold with the highest mean
        F-measure and that F-measure. If no threshold scores above 0 the first
        threshold is returned.

    Raises:
        ValueError: if ``mean``/``std`` are missing or ``thresholds`` is empty.
    """
    if thresholds is None:
        thresholds = np.arange(0.1, 1.0, 0.05)  # Define a range of thresholds to test
    thresholds = list(thresholds)
    if len(thresholds) == 0:
        raise ValueError("At least one threshold must be provided.")

    if mean is None or std is None:
        raise ValueError("Mean and std must be provided for normalization.")

    mean = mean.to(device)
    std = std.to(device)

    model = model.to(device)
    model.eval()

    window = 2 * (frame_size // 2) + 1

    # Inference, smoothing and peak picking do not depend on the threshold, so
    # they are done once per recording and reused for every threshold.
    peak_times = []
    peak_heights = []
    with th.no_grad():
        for feature in features:
            normalized = (feature.to(device) - mean) / std
            activations = _frame_activations(model, normalized, window, batch_size)

            if activations.size == 0:
                # Recording shorter than one window: nothing can be predicted.
                peak_times.append(np.zeros(0, dtype=np.float64))
                peak_heights.append(np.zeros(0, dtype=np.float64))
                continue

            # Smoothing with a length-10 Hamming window (found by trial and
            # error). The window is not normalised (it sums to roughly 4.9), so
            # smoothed values can exceed 1 and thresholds live on that scale.
            smoothed = np.convolve(activations, np.hamming(10))
            peak_idx = find_peaks(smoothed)[0]

            # NOTE(review): index 0 of `activations` belongs to the excerpt
            # centred on frame frame_size // 2, and full-mode convolution delays
            # peaks by a further 4.5 frames, yet no offset is applied here.
            # Confirm this matches how utils.make_frames aligns its labels.
            peak_times.append(peak_idx * utils.HOP_LENGTH / utils.SAMPLING_RATE)
            peak_heights.append(smoothed[peak_idx])

    best_threshold = None
    best_f1 = 0.0
    for threshold in thresholds:
        all_f1_scores = []
        for i, (times, heights) in enumerate(zip(peak_times, peak_heights)):
            estimated = times[heights >= threshold]
            f1, _, _ = mir_eval.onset.f_measure(onsets[i], estimated, window=0.05)  # 50 ms window
            all_f1_scores.append(f1)

        avg_f1 = np.mean(all_f1_scores)
        if best_threshold is None or avg_f1 > best_f1:
            best_f1 = avg_f1
            best_threshold = threshold

    # Return the score that belongs to best_threshold, not the score of
    # whichever threshold happened to be evaluated last.
    return best_threshold, best_f1

In [41]:
epochs = 100
# Learning rate handed to Adam. The optimizer used to hard-code 0.0001 while an
# unused `lr = 3e-4` sat here; the value that was really in effect is kept.
lr = 1e-4

# The model must live on `device` before the first batch: inputs are moved
# there below, and a CPU model fed CUDA tensors fails on the first forward pass.
model.to(device)

# Define loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

best_f1_test = 0.0  # Initialize best F1 score

for epoch in range(epochs):
    model.train()  # manual_evaluate leaves the model in eval mode
    running_loss = 0.0
    for inputs, labels in dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device).float()  # BCELoss needs float targets

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass. reshape(-1) keeps a 1-D output even when the final
        # batch holds a single sample; squeeze() would return a 0-d tensor
        # there, which BCELoss rejects against the (1,)-shaped label.
        outputs = model(inputs).reshape(-1)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate running loss
        running_loss += loss.item()

    print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(dataloader):.4f}")

    # Evaluate the model on the held-out (validation) files at the fixed 0.95
    # threshold. The misspelt name `best_treshold` is kept in case other cells read it.
    best_treshold, current_f1_test = manual_evaluate(model, features_test, onsets_test, sample_rates_test, thresholds=[0.95], mean=dataset.mean, std=dataset.std)

    # Check if the current F1 score is the best we have seen so far
    if current_f1_test > best_f1_test:
        best_f1_test = current_f1_test
        # Save the model
        th.save(model.state_dict(), 'best_model.pth')
        print(f"New best F1 score: {best_f1_test:.4f}, model saved.")

    print(f"TEST best_f1: {current_f1_test} with best threshold: {best_treshold}")

print("Training finished.")

Epoch [1/100], Loss: 0.3390
New best F1 score: 0.7082, model saved.
TEST best_f1: 0.7082399294741945 with best threshold: 0.95
Epoch [2/100], Loss: 0.3249
TEST best_f1: 0.6961354523099221 with best threshold: 0.95
Epoch [3/100], Loss: 0.3160
New best F1 score: 0.7191, model saved.
TEST best_f1: 0.7191097497023254 with best threshold: 0.95
Epoch [4/100], Loss: 0.3086
TEST best_f1: 0.7008580153248823 with best threshold: 0.95
Epoch [5/100], Loss: 0.3033
New best F1 score: 0.7393, model saved.
TEST best_f1: 0.7393444251276311 with best threshold: 0.95
Epoch [6/100], Loss: 0.2967
TEST best_f1: 0.7156176678973748 with best threshold: 0.95
Epoch [7/100], Loss: 0.2924
TEST best_f1: 0.7148454343132985 with best threshold: 0.95
Epoch [8/100], Loss: 0.2875
TEST best_f1: 0.7167531645088647 with best threshold: 0.95
Epoch [9/100], Loss: 0.2822
TEST best_f1: 0.7348458908595803 with best threshold: 0.95
Epoch [10/100], Loss: 0.2779
TEST best_f1: 0.7385983020265746 with best threshold: 0.95
Epoch [11