In [33]:
import numpy as np
import torch as th
from torch import nn
from torch import optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torchaudio

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import glob
from os import path
import pickle

import models
import data

In [125]:
import torch as th
import models
import numpy as np
from scipy.signal import find_peaks
import data
import mir_eval
import pickle
import glob
from os import path
import json

device = 'cuda' if th.cuda.is_available() else 'cpu'

with open('dataset-stats.pkl', 'rb') as f:
    stats = pickle.load(f)
    mean = th.Tensor(stats['mean']).to(device)
    std = th.Tensor(stats['std']).to(device)

@th.no_grad()
def onset_signal(model, x):
    model = model.to(device)
    model.eval()
    x = x.to(device)
    x = (x - mean)/std
    x = x.unsqueeze(0)
    out = th.sigmoid(model(x)).detach().cpu()
    out = np.convolve(out[0], np.hamming(5))

    return out

def onsets(onset_signal):
    res = []
    print("onset peak signal", find_peaks(onset_signal)[0])
    print("onset peak signal", find_peaks(onset_signal)[0])
    for idx in find_peaks(onset_signal)[0]:
        if onset_signal[idx] >= 0.95:
            res.append(idx * data.HOP_LENGTH / data.SAMPLE_RATE)

    return np.array(res)

def evaluate_onsets(model, X, y):
    f_scores = []
    for idx, x in enumerate(X):
        pred = onsets(onset_signal(model, x))
        f, _, _ = mir_eval.onset.f_measure(y[idx], pred, window=0.05)
        f_scores.append(f)

    return f_scores

In [126]:
import torch as th
import torchaudio
from tqdm import tqdm
import numpy as np

FRAME_LENGTH = 15
FRAME_HALF = 7
HOP_LENGTH = 512
WIN_LENGTHS = [1024, 2048, 4096]
SAMPLE_RATE = 44100

_transforms =[torchaudio.transforms.MelSpectrogram(SAMPLE_RATE, n_fft=wl, hop_length=HOP_LENGTH, n_mels=80, f_min=27.5, f_max=16000) for wl in WIN_LENGTHS]

def make_frames(X, y, onsets, sample_rate):
    X_frames, y_frames = [], []

    for onset_time in onsets:
        onset_idx = int(seconds_to_bins(onset_time, sample_rate))

        start = max(0, onset_idx - FRAME_LENGTH//2)
        end = min(onset_idx + FRAME_LENGTH//2 + 1, X.shape[2] - FRAME_LENGTH)

        idx = start
        while idx < end:
            X_frames.append(X[:, :, idx:idx+FRAME_LENGTH])
            y_frames.append(y[idx:idx+FRAME_LENGTH])
            idx += 1

    return X_frames, y_frames

def make_target(onsets, length, sample_rate):
    y = th.zeros(length)

    for x in onsets:
        x_t = int(seconds_to_bins(x, sample_rate))
        y[x_t] = 1

        if x_t - 1 >= 0 and y[x_t - 1] != 1:
            y[x_t - 1] = 0.25

        if x_t + 1 < length and y[x_t + 1] != 1:
            y[x_t + 1] = 0.25

    return y

def seconds_to_bins(a, sample_rate):
    return a * sample_rate / HOP_LENGTH

def mel(waveform):
    mel_specs = [transform(waveform) for transform in _transforms]
    return th.log10(th.stack(mel_specs) + 1e-08)

def load_onsets(file_path):
    with open(file_path, 'r') as f:
        onsets = list(map(float, f.read().split()))
    return np.array(onsets)

def preprocess_audio(files):
    spectograms = []
    sample_rates = []

    for file_path in tqdm(files):
        waveform, sample_rate = torchaudio.load(file_path)
        mel_specgram = mel(waveform[0])
        spectograms.append(mel_specgram)
        sample_rates.append(sample_rate)

    return spectograms, sample_rates

class AudioOnsetDataset(th.utils.data.Dataset):
    def __init__(self, spectograms, sample_rates, targets, sample_onsets):
        self.X = []
        self.y = []

        for X, sample_rate, y, onsets in zip(spectograms, sample_rates, targets, sample_onsets):
            X_frames, y_frames = make_frames(X, y, onsets, sample_rate)
            print(X_frames[0].shape)
            self.X += X_frames
            self.y += y_frames

        tmp = th.cat(self.X)
        self.mean = th.mean(tmp, dim=(0, 2)).unsqueeze(1)
        self.std = th.std(tmp, dim=(0, 2)).unsqueeze(1)
        del tmp

        self.X = [(x - self.mean)/self.std for x in self.X]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

In [127]:
train_path = './data/onset/train_extra_onsets'
X_files = glob.glob(path.join(train_path, '*.wav'))
y_files = glob.glob(path.join(train_path, '*.onsets.gt'))
print(len(X_files))

X_files_train, X_files_test, y_files_train, y_files_test = train_test_split(X_files, y_files, test_size=0.2, random_state=42)
print(len(X_files_train), len(X_files_test))

2
1 1


In [128]:
X_files_train

['./data/onset/train_extra_onsets/ah_development_guitar_2684_TexasMusicForge_Dandelion_pt1.wav']

In [129]:
X_train, sample_rates_train = data.preprocess_audio(X_files_train)
onsets_train = [data.load_onsets(path) for path in y_files_train]
y_train = [data.make_target(onsets_train[i], X_train[i].shape[-1], sample_rates_train[i]) for i in range(len(onsets_train))]

100%|██████████| 1/1 [00:00<00:00, 17.85it/s]


In [130]:
X_test, sample_rates_test = data.preprocess_audio(X_files_test)
onsets_test = [data.load_onsets(path) for path in y_files_test]

100%|██████████| 1/1 [00:00<00:00, 77.64it/s]


In [131]:
dataset = AudioOnsetDataset(X_train, sample_rates_train, y_train, onsets_train)

with open('dataset-stats.pkl', 'wb') as f:
    pickle.dump({ 'mean': dataset.mean, 'std': dataset.std }, f)

torch.Size([3, 80, 15])


In [132]:
epochs = 100
device = 'cuda' if th.cuda.is_available() else 'cpu'
lr = 3e-4

train_dataloader = DataLoader(dataset, shuffle=True, batch_size=256)

In [133]:
model = models.Resi(3).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

pos_weight = th.Tensor([14.]).to(device)
criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

data_mean, data_std = dataset.mean.to(device), dataset.std.to(device)

best_mean = 0
for epoch in range(epochs):
    model.train()
    print(f'epoch {epoch + 1}')

    for batch_idx, (X, y) in enumerate(train_dataloader):
        X, y = X.to(device), y.to(device)
        # fuzzy weight mask
        mask = th.ones_like(y)
        mask[y == 0.25] = 0.25
        y[y == 0.25] = 1.
        out = model(X)
        loss = F.binary_cross_entropy_with_logits(out, y, weight=mask, pos_weight=pos_weight)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch_idx % 100 == 0:
            print(f'loss: {loss.item()}')

    model.eval()
    f_scores_train = evaluate_onsets(model, X_train, onsets_train)
    f_scores_test = evaluate_onsets(model, X_test, onsets_test)
    f_mean_test = np.mean(f_scores_test)
    print(f'F-scores: TRAIN {np.mean(f_scores_train)} | TEST {np.mean(f_mean_test)}')

    if f_mean_test > best_mean:
        best_mean = f_mean_test
        th.save(model.state_dict(), 'resi.pt')

epoch 1
loss: 1.4471831321716309
onset peak signal [  4  12  18  32  38  43  50  59  67  72  77  82  86  99 109 111 121 128
 135 152 157 162 169 177 182 187 193 199 203 212 222 235 243 247 256 265
 275 281 291 298 303 308 312 321 332 337 346 359 364 376 378 384 394 405
 413 418 424 437 444 450 452 457 463 476 489 500 504 519 523 527 532 536
 545 551 556 558 571 580 591 599 603 607 615 620 625 632 644 655 663 668
 673 679 683 693 698 701 705 709 715 720 724 735 741 749 753 760 765 776
 784 788 801 814 820 825 832 845]
onset peak signal [  4  12  18  32  38  43  50  59  67  72  77  82  86  99 109 111 121 128
 135 152 157 162 169 177 182 187 193 199 203 212 222 235 243 247 256 265
 275 281 291 298 303 308 312 321 332 337 346 359 364 376 378 384 394 405
 413 418 424 437 444 450 452 457 463 476 489 500 504 519 523 527 532 536
 545 551 556 558 571 580 591 599 603 607 615 620 625 632 644 655 663 668
 673 679 683 693 698 701 705 709 715 720 724 735 741 749 753 760 765 776
 784 788 801 814 820 

KeyboardInterrupt: 