In [16]:
import pandas as pd
import utility as utils
import importlib
import numpy as np
import librosa
from tqdm import tqdm
import numpy as np
import torchaudio
import mir_eval
import glob
from os import path
import data
import models

import torch as th
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
from torch import optim

from scipy.signal import find_peaks
from sklearn.model_selection import train_test_split

importlib.reload(utils)
importlib.reload(data)

train_dataset_path = './data/onset/train_extra_onsets'
test_dataset_path = './data/onset/test'

In [17]:
class OnsetDetectionCNN(nn.Module):
    def __init__(self):
        super(OnsetDetectionCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 10, kernel_size=(3, 7))
        self.pool1 = nn.MaxPool2d(kernel_size=(3, 1))
        self.conv2 = nn.Conv2d(10, 20, kernel_size=(3, 3))
        self.pool2 = nn.MaxPool2d(kernel_size=(3, 1))
        self.fc1 = nn.Linear(20 * 7 * 8, 256)
        self.fc2 = nn.Linear(256, 1)

    def forward(self, x):
        x = th.tanh(self.conv1(x))  # Conv1 output: (batch_size, 10, 9, 78)
        x = self.pool1(x)              # Pool1 output: (batch_size, 10, 9, 26)
        x = th.tanh(self.conv2(x))  # Conv2 output: (batch_size, 20, 7, 24)
        x = self.pool2(x)              # Pool2 output: (batch_size, 20, 7, 8)
        x = x.view(-1, 20 * 7 * 8)     # Flatten output: (batch_size, 20 * 7 * 8)
        x = th.sigmoid(self.fc1(x)) # FC1 output: (batch_size, 256)
        x = th.sigmoid(self.fc2(x)) # FC2 output: (batch_size, 1)
        return x

In [18]:
class AudioOnsetDataset(Dataset):
    def __init__(self, spectrograms, sample_rates, targets, sample_onsets):
        self.X = []
        self.y = []

        for spectrogram, sample_rate, target, onsets in zip(spectrograms, sample_rates, targets, sample_onsets):
            X_frames, y_frames = utils.make_frames(spectrogram, target, onsets, sample_rate)
            self.X += X_frames
            self.y += y_frames

        self.X = [th.tensor(x, dtype=th.float32) for x in self.X]
        self.y = [th.tensor(y, dtype=th.float32) for y in self.y]

        # Normalization (if required by your model)
        tmp = th.cat(self.X, dim=2)
        self.mean = th.mean(tmp, dim=(0, 2), keepdim=True)
        self.std = th.std(tmp, dim=(0, 2), keepdim=True)
        del tmp

        self.X = [(x - self.mean) / self.std for x in self.X]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

In [19]:
# Load all of the train file paths.
wav_files_paths_train, _, onset_files_paths_train, _ = utils.load_dataset_paths(train_dataset_path)
wav_files_paths_test, _, onset_files_paths_test, _ = utils.load_dataset_paths(test_dataset_path)

In [20]:
# Train-test split for validation and testing.
X_train_paths, X_test_paths, y_train_paths, y_test_paths = train_test_split(wav_files_paths_train, onset_files_paths_train, test_size=0.2, random_state=42)

In [21]:
# Prepare train data
features_train, sample_rates_train = utils.preprocess_audio(X_train_paths)
onsets_train = utils.load_onsets(y_train_paths)
labels_train = [utils.make_target(onsets_train[i], features_train[i].shape[-1], sample_rates_train[i]) for i in range(len(onsets_train))]

100%|██████████| 1/1 [00:00<00:00,  5.54it/s]


In [22]:
# Prepare test data
features_test, sample_rates_test = utils.preprocess_audio(X_test_paths)
onsets_test = utils.load_onsets(y_test_paths)
labels_test = [utils.make_target(onsets_test[i], features_test[i].shape[-1], sample_rates_test[i]) for i in range(len(onsets_test))]
features_test_tensors = [th.tensor(feature, dtype=th.float32) for feature in features_test]

100%|██████████| 1/1 [00:00<00:00,  7.85it/s]


In [23]:
dataset = AudioOnsetDataset(features_train, sample_rates_train, labels_train, onsets_train)

In [24]:
num_epochs = 100
device = 'cuda' if th.cuda.is_available() else 'cpu'
lr = 3e-4

train_dataloader = DataLoader(dataset, shuffle=True, batch_size=256)

In [26]:
import torch as th
import numpy as np
from scipy.signal import find_peaks
import mir_eval

# Assuming the necessary imports and utility functions are already defined

model = OnsetDetectionCNN().to(device)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.45)
data_mean, data_std = dataset.mean.to(device), dataset.std.to(device)

dropout_rate = 0.5
num_epochs = 100

def apply_dropout(m):
    if type(m) == th.nn.Linear:
        m.dropout = th.nn.Dropout(p=dropout_rate)

model.apply(apply_dropout)

for epoch in range(num_epochs):
    if 10 <= epoch < 20:
        optimizer.param_groups[0]['momentum'] = 0.45 + (epoch - 10) * (0.9 - 0.45) / 10
    elif epoch >= 20:
        optimizer.param_groups[0]['momentum'] = 0.9

    if epoch >= 1:
        optimizer.param_groups[0]['lr'] *= 0.995

    model.train()
    running_loss = 0.0
    print(f'epoch {epoch + 1}')

    for i, (inputs, labels) in enumerate(train_dataloader):
        X = inputs.to(device)
        y = labels.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y.unsqueeze(1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

        if i % 100 == 0:
            print(f'loss: {loss.item()}')

    model.eval()
    with th.no_grad():
        all_predictions = []

        for idx, x_test in enumerate(features_test_tensors):
            x_test = x_test.to(device)
            x_test = (x_test - data_mean) / data_std

            frame_length = 15
            num_frames = x_test.shape[-1]
            pred = []

            # Sliding window over the entire test spectrogram
            for i in range(0, num_frames - frame_length + 1):
                frame = x_test[:, :, i:i+frame_length].unsqueeze(0)
                frame_pred = model(frame)
                pred.append(frame_pred.item())

            pred = np.array(pred)
            print(pred.shape)

            # Smooth the output using a Hamming window of 5 frames
            pred_smoothed = np.convolve(pred, np.hamming(5), mode='same')

            # Detect peaks in the smoothed output
            peaks, _ = find_peaks(pred_smoothed, height=0.95)
            predictions_binary = peaks * utils.HOP_LENGTH / sample_rates_test[idx]

            all_predictions.append(predictions_binary)

        f1_scores_test = []
        for pred, actual, sample_rate_test in zip(all_predictions, onsets_test, sample_rates_test):
            f, _, _ = mir_eval.onset.f_measure(
                actual,
                np.array(pred),
                window=0.05  # 50 [ms]
            )
            f1_scores_test.append(f)

        print(f'F-scores: TEST {np.mean(f1_scores_test)}')

print('Training complete')

epoch 1
loss: 0.707843005657196
(327,)
F-scores: TEST 0.5084745762711865
epoch 2
loss: 0.6873898506164551
(327,)
F-scores: TEST 0.4814814814814815
epoch 3
loss: 0.6852604150772095
(327,)
F-scores: TEST 0.44444444444444436
epoch 4
loss: 0.6810068488121033
(327,)
F-scores: TEST 0.41509433962264153
epoch 5
loss: 0.6750934720039368
(327,)
F-scores: TEST 0.36734693877551017
epoch 6
loss: 0.6706129312515259
(327,)
F-scores: TEST 0.3404255319148936
epoch 7
loss: 0.6655551195144653
(327,)
F-scores: TEST 0.3404255319148936
epoch 8
loss: 0.6599844098091125
(327,)
F-scores: TEST 0.26666666666666666
epoch 9
loss: 0.6540942788124084
(327,)
F-scores: TEST 0.2857142857142857
epoch 10
loss: 0.6476475596427917
(327,)
F-scores: TEST 0.2564102564102564
epoch 11
loss: 0.640603244304657
(327,)
F-scores: TEST 0.0625
epoch 12
loss: 0.6329579949378967
(327,)
F-scores: TEST 0.0
epoch 13
loss: 0.6243250370025635
(327,)
F-scores: TEST 0.0
epoch 14
loss: 0.6144399642944336
(327,)
F-scores: TEST 0.0
epoch 15
loss:

KeyboardInterrupt: 

In [13]:
model = OnsetDetectionCNN().to(device)
criterion = nn.BCELoss()  # Binary Cross Entropy Loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
data_mean, data_std = dataset.mean.to(device), dataset.std.to(device)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    print(f'epoch {epoch + 1}')
    
    for i, (inputs, labels) in enumerate(train_dataloader):
        X = inputs.to(device)
        y = labels.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y.unsqueeze(1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

        if i % 100 == 0:
            print(f'loss: {loss.item()}')

    model.eval()
    with th.no_grad():
        all_predictions = []
        
        for idx, x_test in enumerate(features_test_tensors):
            x_test = x_test.to(device)
            x_test = (x_test - data_mean) / data_std
            x_test = utils.make_test_frames(x_test)
            
            predictions = []
            for frame in x_test:
                frame = frame.unsqueeze(0)
                pred = model(frame)
                predictions.append(pred.item())

            predictions = np.array(predictions).flatten()

            predictions_binary = []
            for i in find_peaks(predictions)[0]:
                if predictions[i] >= 0.95:
                    predictions_binary.append(i * utils.HOP_LENGTH / sample_rates_test[idx])
                    
        all_predictions.append(predictions_binary)
        
        f1_scores_test = []
        for pred, actual, sample_rate_test in zip(all_predictions, onsets_test, sample_rates_test):
            f, _, _ = mir_eval.onset.f_measure(
                actual,
                np.array(pred),
                window=0.05  # 50 [ms]
            )
            f1_scores_test.append(f)
            
        print(f'F-scores: TEST {np.mean(f1_scores_test)}')

print('Training complete')

epoch 1
loss: 0.7111229300498962
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 15])
torch.Size([1, 3, 80, 

KeyboardInterrupt: 

In [419]:
th.sigmoid(th.tensor(0.7)).detach().cpu()

tensor(0.6682)

In [265]:
print(features_train[0].shape)
print(labels_train[0].shape)

(3, 80, 850)
(850,)


In [268]:
sum(labels_train[0])

43.0

In [270]:
res = []
for idx, value in enumerate(labels_train[0]):
    if(value > 0):
        res.append(idx * data.HOP_LENGTH / data.SAMPLE_RATE)

In [272]:
f, _, _ = mir_eval.onset.f_measure(
    onsets_train[0],
    np.array(res),
    window=0.05  # 50 [ms]
)

In [273]:
f

1.0