In [87]:
import pandas as pd
import utility as utils
import importlib
import numpy as np
import librosa
from tqdm import tqdm
import numpy as np
import torchaudio
import mir_eval
import glob
from os import path
import data
import models
import torch
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split

# Dataset locations, given relative to the notebook's working directory.
train_dataset_path = './data/onset/train_extra_onsets'
test_dataset_path = './data/onset/test'

# Re-execute the local helper modules so edits made on disk are picked up
# without restarting the kernel.
importlib.reload(utils)
importlib.reload(data)

In [49]:
# `utils.load_dataset_paths` returns four values for the training directory; only the
# first and third (named here as wav and onset file paths) are kept, the other two are
# discarded.
wav_files_paths, _, onset_files_paths, _ = utils.load_dataset_paths(train_dataset_path)

In [50]:
# Hold out 20% of the files; the fixed seed keeps the split reproducible across runs.
X_files_train, X_files_test, y_files_train, y_files_test = train_test_split(
    wav_files_paths,
    onset_files_paths,
    test_size=0.2,
    random_state=42,
)
print(len(X_files_train), len(X_files_test))

1 1


In [81]:
# Pre-process the training audio (one entry and one sample rate per file) and load
# the matching onset annotations.
X_train, sample_rates = utils.preprocess_audio(X_files_train)
onsets_train = utils.load_onsets(y_files_train)

# One target per file, sized by the last dimension of that file's pre-processed audio.
y_train = [
    utils.make_target(onsets_train[idx], X_train[idx].shape[-1], sample_rates[idx])
    for idx in range(len(onsets_train))
]

# Convert every per-file array to float32 and stack along a new leading dimension.
# NOTE(review): torch.stack requires all entries to share one shape — confirm this
# holds once more than one file ends up in the training split.
X_train_tensors = torch.stack(
    [torch.tensor(spec, dtype=torch.float32) for spec in X_train]
)
y_train_tensors = torch.stack(
    [torch.tensor(target, dtype=torch.float32) for target in y_train]
)

100%|██████████| 1/1 [00:00<00:00, 13.96it/s]


In [82]:
# Pre-process the held-out audio and load one onset annotation per file.
# NOTE(review): the training cell goes through `utils.preprocess_audio` / `utils.load_onsets`
# while this cell uses the `data` module — confirm both produce the same representation.
X_test, sample_rates_test = data.preprocess_audio(X_files_test)
# The loop variable is deliberately not called `path`, which would visually shadow
# the `from os import path` import at the top of the notebook.
onsets_test = [data.load_onsets(onset_file) for onset_file in y_files_test]

# `torch.as_tensor` accepts both arrays and existing tensors, so it avoids the
# copy-construct UserWarning that `torch.tensor(<tensor>)` emits (the saved stderr of
# this cell echoes the old line, which looks like that warning). `.detach()` keeps the
# old "no autograd history" semantics; `torch.stack` allocates fresh storage, so
# `X_test` itself is never aliased by the result.
X_test_tensors = torch.stack(
    [torch.as_tensor(spec, dtype=torch.float32).detach() for spec in X_test]
)

100%|██████████| 1/1 [00:00<00:00, 60.92it/s]
  X_test_tensors = [torch.tensor(spec, dtype=torch.float32) for spec in X_test]


In [83]:
import torch as th
from torch import nn
import torch.nn.functional as F

class AudioOnsetDataset(th.utils.data.Dataset):
    """Frame-level dataset built from per-file inputs and onset targets.

    Every (input, sample rate, target, onsets) tuple is handed to
    ``utils.create_audio_onset_dataset``; the frames it returns are collected into
    one flat list and standardised with statistics computed over the whole set.

    Args:
        spectograms: iterable of per-file inputs (presumably spectrograms — the
            frames derived from them must be 3-D so they can be reduced over
            dims 0 and 2).
        sample_rates: iterable of per-file sample rates.
        targets: iterable of per-file target tensors.
        sample_onsets: iterable of per-file onset annotations.
        eps: lower bound applied to the standard deviation so that a row with
            zero variance normalises to 0 instead of NaN/inf.

    Attributes:
        mean, std: statistics over dims 0 and 2 of the concatenated frames, kept
            with a trailing singleton dimension so they broadcast against a frame.
            ``std`` is floored at ``eps``.
    """

    def __init__(self, spectograms, sample_rates, targets, sample_onsets, eps=1e-8):
        self.X = []
        self.y = []

        for X, sample_rate, y, onsets in zip(spectograms, sample_rates, targets, sample_onsets):
            X_frames, y_frames = utils.create_audio_onset_dataset(X, y, onsets, sample_rate)

            self.X += X_frames
            self.y += y_frames

        # Statistics are taken over every frame at once; the concatenated copy is
        # only needed for this computation, so release it straight away.
        tmp = th.cat(self.X)
        self.mean = th.mean(tmp, dim=(0, 2)).unsqueeze(1)
        # Floor the std: a constant row (e.g. a silent band) would otherwise yield
        # 0/0 = NaN after normalisation and poison training.
        self.std = th.std(tmp, dim=(0, 2)).unsqueeze(1).clamp_min(eps)
        del tmp

        self.X = [(x - self.mean) / self.std for x in self.X]

    def __len__(self):
        """Number of frames in the dataset."""
        return len(self.X)

    def __getitem__(self, i):
        """Return the ``(normalised frame, target)`` pair at index ``i``."""
        return self.X[i], self.y[i]

In [84]:
import torch as th
from torch import nn
import torch.nn.functional as F

class ResBlock(nn.Module):
    """Residual block: two 3x3 'same' convolutions with batch norm and a skip connection.

    The channel count and spatial size are preserved, so the block's output can be
    added to its input before the final ReLU.

    Args:
        c: number of input (and output) channels.
    """

    def __init__(self, c):
        super().__init__()

        self.block = nn.Sequential(
            nn.Conv2d(c, c, 3, padding='same'),
            nn.BatchNorm2d(c),
            # nn.ReLU's only argument is the boolean `inplace` flag; passing the
            # channel count here used to switch in-place mode on by accident.
            nn.ReLU(),
            nn.Conv2d(c, c, 3, padding='same'),
            nn.BatchNorm2d(c)
        )

    def forward(self, x):
        """Return ``relu(x + block(x))``; the output has the same shape as ``x``."""
        return F.relu(x + self.block(x))

class Resi(nn.Module):
    """Fully-convolutional network producing one logit per position along the last axis.

    For an input of shape ``(B, input_dim, H, W)`` every layer leaves ``W`` untouched
    while ``H`` is progressively reduced to 1, so the result after flattening is
    ``(B, W)``. The per-layer ``H`` values noted below follow the original
    annotations, which correspond to ``H = 80`` (presumably frequency bins, with
    ``W`` being time frames — confirm against the data pipeline).

    Args:
        input_dim: number of input channels.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Layers are listed in the exact original order so that parameter
        # initialisation and the `embed.<index>` state-dict keys do not change.
        stages = [
            nn.Conv2d(input_dim, 16, kernel_size=(3, 7), padding=(0, 3)),  # 16 x 78 x W
            nn.ReLU(),
            ResBlock(16),                                                  # 16 x 78 x W
            nn.MaxPool2d((3, 1)),                                          # 16 x 26 x W
            nn.Dropout2d(0.4),
            nn.Conv2d(16, 32, kernel_size=(3, 3), padding=(0, 1)),         # 32 x 24 x W
            nn.ReLU(),
            ResBlock(32),                                                  # 32 x 24 x W
            nn.MaxPool2d((3, 1)),                                          # 32 x 8 x W
            nn.Dropout2d(0.4),
            nn.Conv2d(32, 64, kernel_size=(3, 3), padding=(0, 1)),         # 64 x 6 x W
            nn.ReLU(),
            nn.AdaptiveMaxPool2d((1, None)),                               # 64 x 1 x W
            nn.Dropout2d(0.4),
            nn.Conv2d(64, 1, kernel_size=1),                               # 1 x 1 x W
        ]
        self.embed = nn.Sequential(*stages)

    def forward(self, X):
        """Run the stack and collapse the singleton channel/height dims: ``(B, W)``."""
        return self.embed(X).flatten(start_dim=1)

In [86]:
# Build the training dataset from the stacked input/target tensors plus the per-file
# sample rates and onset annotations.
# NOTE(review): this instantiates `data.AudioOnsetDataset`, not the `AudioOnsetDataset`
# class defined earlier in this notebook — confirm the two implementations agree.
dataset = data.AudioOnsetDataset(X_train_tensors, sample_rates, y_train_tensors, onsets_train)

In [88]:
# Training hyper-parameters.
lr = 3e-4
epochs = 100

# Prefer the GPU whenever PyTorch can see one.
device = 'cpu'
if th.cuda.is_available():
    device = 'cuda'

# Shuffled mini-batches of 256 frames.
train_dataloader = DataLoader(
    dataset,
    batch_size=256,
    shuffle=True,
)

In [None]:
# NOTE(review): the model comes from the `models` module, not from the `Resi` class
# defined earlier in this notebook — confirm the two stay in sync.
model = models.Resi(3).to(device)
# `optim` is never imported in this notebook, so reach Adam through the `th` alias.
optimizer = th.optim.Adam(model.parameters(), lr=lr)

# Weight applied to positive targets in the BCE loss.
pos_weight = th.Tensor([14.]).to(device)
# NOTE(review): `criterion` is not used by the loop below (the functional form is
# called directly so a per-element weight mask can be passed); kept in case later
# cells rely on it.
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

# Normalisation statistics of the training set, handed to the evaluation helper.
data_mean, data_std = dataset.mean.to(device), dataset.std.to(device)

best_mean = 0
for epoch in range(epochs):
    model.train()
    print(f'epoch {epoch + 1}')

    for batch_idx, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device)

        # Fuzzy weight mask: targets marked 0.25 are trained as positives (1.0) but
        # contribute only a quarter of the loss weight of ordinary elements.
        fuzzy = y == 0.25
        mask = th.ones_like(y)
        mask[fuzzy] = 0.25
        y[fuzzy] = 1.

        out = model(X)
        loss = F.binary_cross_entropy_with_logits(out, y, weight=mask, pos_weight=pos_weight)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f'loss: {loss.item()}')

    model.eval()
    # Scoring needs no gradients; skipping graph construction saves memory and time.
    # NOTE(review): `onsets` is not imported or defined in the visible cells —
    # presumably a project module providing `evaluate_onsets`; confirm it is imported
    # before this cell runs.
    with th.no_grad():
        f_scores_train = onsets.evaluate_onsets(model, X_train, onsets_train, data_mean, data_std)
        f_scores_test = onsets.evaluate_onsets(model, X_test, onsets_test, data_mean, data_std)
    f_mean_train = np.mean(f_scores_train)
    f_mean_test = np.mean(f_scores_test)
    print(f'F-scores: TRAIN {f_mean_train} | TEST {f_mean_test}')

    # Checkpoint whenever the held-out F-score improves.
    if f_mean_test > best_mean:
        best_mean = f_mean_test
        th.save(model.state_dict(), 'resi.pt')