In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load (or reload) IPython's autoreload extension and switch it to mode 2, which
# reloads all modules before each cell executes, so edits to the local helper
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import wavfile

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from fastai.conv_learner import ConvLearner
from fastai.core import to_np
from fastai.dataloader import DataLoader
from fastai.dataset import get_cv_idxs, split_by_idx, ArraysIndexDataset, ModelData
from fastai.metrics import accuracy, accuracy_np
from fastai.model import fit, predict
from fastai.text import SortishSampler

from data_loading_utils import load_audio_files, read_file
from preprocessing_utils import load_features

from tqdm import tqdm_notebook as tqdm

In [3]:
# Dataset locations, all relative to the notebook's working directory.
PATH = Path('data/')
TRAIN_PATH = PATH/'audio_train'
TEST_PATH = PATH/'audio_test'

# Feature-extraction settings. n_features, n_fft and hop_length are forwarded to
# load_features in the cells below; sample_rate is not referenced again in the
# visible cells.
sample_rate = 44100  # presumably Hz — TODO confirm against the audio files
n_features = 60  # size of the feature axis of each clip (batches come out as [bs, 60, time])
n_fft = 1024
hop_length = 512

In [4]:
# Training metadata; the cells below use its `fname` and `label` columns.
train = pd.read_csv(PATH/'train.csv')

# Sorted unique label names, plus a lookup from label name to its integer class
# index (the position of the label in the sorted list).
labels = sorted(train.label.unique())
label_idx = {label:i for i, label in enumerate(labels)}

In [5]:
# Load the 'log_mel_spec' features for every file listed in train.csv.
# load_features is a project helper; judging by the output below it can serve
# previously cached data.
x = load_features(TRAIN_PATH,
                  filenames=train.fname, 
                  feature_name='log_mel_spec',
                  n_fft=n_fft, 
                  hop_length=hop_length,
                  n_features=n_features)
# Integer class index for each row of train.csv.
y = train.label.apply(lambda l: label_idx[l]).values
# Sanity check: there should be exactly one label per feature array.
len(x), len(y)

Loading cached data..
Loaded features for 9473 files


(9473, 9473)

In [6]:
# The sample submission file supplies the list of test file names.
test = pd.read_csv(PATH/'sample_submission.csv')
# Same feature settings as for the training set, read from the test directory.
test_x = load_features(TEST_PATH, 
                       filenames=test.fname, 
                       feature_name='log_mel_spec', 
                       n_fft=n_fft, 
                       hop_length=hop_length,
                       n_features=n_features)
# Placeholder labels so the test set can be wrapped in the same dataset class.
# NOTE(review): np.zeros yields float64 while the training labels are integer
# class indices — confirm nothing downstream needs integer test labels.
test_y = np.zeros(len(test_x))
# Sanity check: one placeholder label per test feature array.
len(test_x), len(test_y)

Loading cached data..
Loaded features for 9400 files


(9400, 9400)

In [7]:
# from blake
def get_trn_val_split(x, y, val_pct=0.15):
    """Split features and labels into validation and training subsets.

    The validation indices come from ``get_cv_idxs(len(x), val_pct=val_pct)``.

    Args:
        x: Features. When this is a Python ``list`` the split is done here by
            indexing; otherwise it is delegated to ``split_by_idx``.
        y: Labels, indexable with the same positions as ``x``.
        val_pct: Passed through to ``get_cv_idxs``.

    Returns:
        For list input: ``[(val_x, trn_x), (val_y, trn_y)]``, where the
        validation parts follow the order of the drawn indices and the training
        parts keep their original order. For any other input: whatever
        ``split_by_idx(val_idxs, x, y)`` returns (the notebook unpacks it the
        same way).
    """
    val_idxs = get_cv_idxs(len(x), val_pct=val_pct)
    if isinstance(x, list):
        # Build the membership set once: testing `i not in val_idxs` directly
        # rescans the whole index collection for every element (quadratic).
        held_out = {int(i) for i in val_idxs}
        return [
            (
                [arr[i] for i in val_idxs],
                [arr[i] for i in range(len(arr)) if i not in held_out],
            )
            for arr in (x, y)
        ]
    return split_by_idx(val_idxs, x, y)

In [8]:
# Each pair comes back as (validation, training), so unpack in that order.
((val_x, trn_x), (val_y, trn_y)) = get_trn_val_split(x, y, 0.15)
# Sizes of the resulting splits.
len(trn_x), len(trn_y), len(val_x), len(val_y)

(8053, 8053, 1420, 1420)

In [9]:
class AudioDatasetDataset(ArraysIndexDataset):
    """Dataset over per-clip feature arrays ``x`` and class-index labels ``y``.

    Differs from the base constructor only in that ``transform`` is optional.
    """

    def __init__(self, x, y, transform=None):
        super().__init__(x, y, transform)

    def get_c(self):
        """Return the largest label value plus one (the class count for 0-based labels)."""
        largest_label = max(self.y)
        return largest_label + 1

    def get_sz(self):
        """Return the length of the first axis of the first item in ``x``."""
        first_clip = self.x[0]
        return first_clip.shape[0]

    def get_x(self, i):
        """Return the ``i``-th feature array unchanged."""
        return self.x[i]

In [10]:
class AudioDataLoader(DataLoader):
    """DataLoader that right-pads clips of unequal length before collating."""

    def get_batch(self, indexes):
        """Fetch the items at ``indexes`` and collate them into one batch.

        Each item is indexed as ``(features, label)`` and the clip length is
        read from ``features.shape[1]``. If the lengths differ, every clip is
        padded on the right of axis 1 up to the longest one in the batch.
        """
        samples = [self.dataset[idx] for idx in indexes]
        widths = [sample[0].shape[1] for sample in samples]

        # Nothing to do when every clip already has the same length.
        if len(set(widths)) <= 1:
            return self.np_collate(samples)

        target_width = max(widths)
        padded = []
        for (feats, label), width in zip(samples, widths):
            # Repeat the clip from its start ('wrap') to fill the gap; clips
            # with at most one frame fall back to constant padding.
            mode = 'constant' if width <= 1 else 'wrap'
            feats = np.pad(feats, ((0, 0), (0, target_width - width)), mode)
            padded.append((feats, label))
        return self.np_collate(padded)

In [11]:
# Batch size shared by the training, validation and test loaders built below.
bs = 32

def get_dl(x, y, bs):
    """Build an ``AudioDataLoader`` over ``x``/``y`` with batch size ``bs``.

    Items are drawn through a ``SortishSampler`` keyed on clip length
    (``shape[1]`` of each item's features).

    NOTE(review): the same sampler is used for the validation and test loaders;
    if it reorders items, per-file test predictions would have to be mapped
    back to the original file order — confirm before building a submission.
    """
    ds = AudioDatasetDataset(x, y)

    def clip_length(idx):
        # Length of the idx-th clip along axis 1 of its feature array.
        return ds[idx][0].shape[1]

    return AudioDataLoader(ds, bs, SortishSampler(ds, key=clip_length, bs=bs))

# One loader per split; the test loader pairs test_x with the placeholder test_y.
trn_dl = get_dl(trn_x, trn_y, bs)
val_dl = get_dl(val_x, val_y, bs)
test_dl = get_dl(test_x, test_y, bs)

In [12]:
# Pull a single training batch to check the collated shapes.
x1, y1 = next(iter(trn_dl))
x1.shape, y1.shape

(torch.Size([32, 60, 2197]), torch.Size([32]))

In [13]:
def conv_block(n_in, n_out, kernel_size=3, max_pool=1):
    """Build a Conv-ReLU-Conv-ReLU-MaxPool-Dropout2d block.

    Args:
        n_in: Input channels of the first convolution.
        n_out: Output channels of both convolutions.
        kernel_size: Kernel size of both convolutions; they are padded by
            ``kernel_size // 2``, which keeps the spatial size for odd kernels.
        max_pool: Kernel size given to ``nn.MaxPool2d`` (1 leaves the size unchanged).

    Returns:
        An ``nn.Sequential`` holding the six layers in the order listed above.
    """
    pad = kernel_size // 2
    stages = [
        nn.Conv2d(n_in, n_out, kernel_size=kernel_size, padding=pad),
        nn.ReLU(),
        nn.Conv2d(n_out, n_out, kernel_size=kernel_size, padding=pad),
        nn.ReLU(),
        nn.MaxPool2d(max_pool),
        nn.Dropout2d(0.1),
    ]
    return nn.Sequential(*stages)

class Lambda(nn.Module):
    """Wrap a plain callable so it can be used as a layer inside ``nn.Sequential``."""

    def __init__(self, lambd):
        """Store ``lambd``, the callable applied to the input in ``forward``."""
        super().__init__()
        self.lambd = lambd

    def forward(self, x):
        """Return the result of calling the stored callable on ``x``."""
        fn = self.lambd
        return fn(x)


class AudioCNN(nn.Module):
    """Fully convolutional classifier over batches shaped (batch, features, time).

    The input gets a singleton channel axis, passes through two ``conv_block``s
    and two unpadded 3x3 convolutions, and the final ``n_classes`` feature maps
    are averaged over all remaining positions to give one score per class.
    """

    def __init__(self, n_classes):
        """Build the layer stack; ``n_classes`` is the number of output scores per item."""
        super().__init__()
        self.debug = False

        def add_channel_axis(x):
            # (batch, features, time) -> (batch, 1, features, time)
            return x.view(x.shape[0], 1, x.shape[1], x.shape[2])

        def flatten_positions(x):
            # (batch, n_classes, h, w) -> (batch, n_classes, h * w)
            return x.view(x.shape[0], n_classes, -1)

        def average_positions(x):
            # Mean over the flattened positions -> (batch, n_classes)
            return torch.mean(x, dim=2)

        # The order and indices of these layers determine the parameter names
        # in saved checkpoints, so keep the layout stable.
        self.layers = nn.Sequential(
            Lambda(add_channel_axis),
            conv_block(1, 16, 9, 2),
            conv_block(16, 32, 3, 2),
            nn.Conv2d(32, 64, 3),
            nn.ReLU(),
            nn.Dropout2d(0.05),
            nn.Conv2d(64, n_classes, 3),
            Lambda(flatten_positions),
            Lambda(average_positions),
        )

    def forward(self, x):
        """Run ``x`` through the layer stack and return the per-class scores."""
        return self.layers(x)
    

In [14]:
def mapk_np(preds, targs, k=3):
    """Mean average precision at ``k`` for single-label predictions.

    Each row's ``k`` highest-scoring classes are ranked; a row scores
    ``1 / rank`` (rank starting at 1) if its target appears among them and 0
    otherwise. The result is the mean of those scores over all rows.

    Args:
        preds: 2-D array of scores, one row per item and one column per class.
        targs: Target class index for each row; flattened before comparison so
            a column vector is scored the same as a 1-D array instead of being
            broadcast against every row.
        k: Number of top predictions considered. Values larger than the number
            of classes are clamped rather than raising ``IndexError``.

    Returns:
        The MAP@k score as a NumPy float.
    """
    preds = np.asarray(preds)
    targs = np.asarray(targs).reshape(-1)
    k = min(k, preds.shape[1])

    # Negate so that argsort's ascending order ranks the highest score first.
    top_k = np.argsort(-preds, axis=1)[:, :k]
    # hits_per_rank[i] = number of rows whose target sits at rank i.
    hits_per_rank = (top_k == targs[:, None]).sum(axis=0)
    rank_weights = 1.0 / np.arange(1, k + 1)
    return (hits_per_rank * rank_weights).sum() / top_k.shape[0]

def mapk(preds, targs, k=3):
    """MAP@k metric wrapper: convert both inputs with ``to_np`` and score them with ``mapk_np``."""
    preds_np = to_np(preds)
    targs_np = to_np(targs)
    return mapk_np(preds_np, targs_np, k)

In [15]:
# Bundle the training, validation and test loaders for fastai.
md = ModelData(PATH, trn_dl, val_dl, test_dl)
# One output score per label; .cuda() moves the model to the GPU, so this cell
# requires a CUDA device.
model = AudioCNN(len(labels)).cuda()
# The optimizer class itself (not an instance) is handed to the learner as opt_fn.
opt = optim.Adam
# Metrics passed to the learner: accuracy and MAP@3 (mapk's default k).
metrics = [accuracy, mapk]
loss = F.cross_entropy
learn = ConvLearner.from_model_data(model, md, crit=loss, metrics=metrics, opt_fn=opt)

In [16]:
# learn.lr_find()
# learn.sched.plot()

In [17]:
# lr = 1e-2
# learn.fit(lr, 1, wds=[1e-7], cycle_len=50, use_clr_beta=(10, 25, 0.95, 0.85))

# learn.save_cycle('2d_full_conv_clr_v2', 50)

In [18]:
# Restore the weights saved under this name/cycle instead of retraining; the
# training run and the matching save_cycle call are commented out in the cell above.
learn.load_cycle('2d_full_conv_clr_v2', 50)

In [21]:
# Put the model in evaluation mode before predicting.
learn.model.eval()
# The result is unpacked below as (predictions, targets). With no arguments
# this presumably runs on the validation loader — TODO confirm against fastai.
val_preds = learn.predict_with_targs()

# Accuracy and MAP@3 (mapk_np's default k) on those predictions.
val_acc = accuracy_np(*val_preds)
val_map = mapk_np(*val_preds)

print(f'Val Acc: {val_acc:.3f}, Val MAP: {val_map:.3f}')

Val Acc: 0.691, Val MAP: 0.766
