In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import wavfile

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.transforms import Lambda

from fastai.conv_learner import ConvLearner
from fastai.core import *
from fastai.dataloader import DataLoader
from fastai.dataset import get_cv_idxs, split_by_idx, ArraysIndexDataset, ModelData
from fastai.metrics import accuracy
from fastai.model import fit, predict
from fastai.text import SortishSampler

from data_loading_utils import load_audio_files, read_file

In [3]:
PATH = Path('data/')
TRAIN_PATH = PATH/'audio_train_16KHz'
TEST_PATH = PATH/'audio_test_16KHz'

sample_rate = 16000

In [4]:
train = pd.read_csv(PATH/'train.csv')

labels = sorted(train.label.unique())
label_idx = {label:i for i, label in enumerate(labels)}

x = load_audio_files(TRAIN_PATH, filenames=train.fname, trimmed=True)
y = train.label.apply(lambda l: label_idx[l]).values
len(x), len(y)

A Jupyter Widget




(9473, 9473)

In [5]:
# from blake
def get_trn_val_split(x, y, val_pct=0.15):
    val_idxs = get_cv_idxs(len(x), val_pct=val_pct)
    if isinstance(x, list):
        return [([arr[i] for i in val_idxs], [arr[i] for i in range(len(arr)) if i not in val_idxs]) for arr in [x,y]]
    else:
        return split_by_idx(val_idxs, x, y)
    
((val_x, trn_x), (val_y, trn_y)) = get_trn_val_split(x, y, 0.15)
len(trn_x), len(trn_y), len(val_x), len(val_y)

(8053, 8053, 1420, 1420)

In [6]:
class AudioDatasetDataset(ArraysIndexDataset):
    def __init__(self, x, y, transform=None):
        super().__init__(x, y, transform)
    def get_c(self): 
        return max(self.y) + 1
    def get_sz(self):
        return self.x[0].shape[0]
    def get_x(self, i):
        return self.x[i]

In [7]:
class AudioDataLoader1d(DataLoader):
    def get_batch(self, indexes):
        batch_data = [self.dataset[i] for i in indexes]
        x_lens = [item[0].shape[0] for item in batch_data]
        if len(np.unique(x_lens)) > 1:
            max_len = np.max(x_lens)
            for i, item in enumerate(batch_data):
                x, y = item
                clip_len = x.shape[0]
                pad_mode = 'wrap' if clip_len > 1 else 'constant'
                x = np.pad(x, (0, max_len-clip_len), pad_mode)
                batch_data[i] = x, y
        return self.np_collate(batch_data)

In [8]:
bs = 64

# ArraysIndexDataset expects np arrays
trn_y, val_y = np.array(trn_y), np.array(val_y)

trn_ds = AudioDatasetDataset(trn_x, trn_y)
val_ds = AudioDatasetDataset(val_x, val_y)
trn_dl = AudioDataLoader1d(trn_ds, 
                           sampler=SortishSampler(trn_ds, key=lambda x: trn_ds[x][0].shape[0], bs=bs),
                           batch_size=bs)
val_dl = AudioDataLoader1d(val_ds,
                           sampler=SortishSampler(val_ds, key=lambda x: val_ds[x][0].shape[0], bs=bs),
                           batch_size=bs)

In [9]:
x1, y1 = next(iter(trn_dl))
x1.size(), y1.size()

(torch.Size([64, 22080]), torch.Size([64]))

In [10]:
class Lambda(nn.Module):
    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd
    def forward(self, x):
        return self.lambd(x)


class RawAudioRNN(nn.Module):
    def __init__(self, n_hidden, n_classes, n_layers=3):
        super().__init__()
        
        self.n_hidden = n_hidden
        self.n_classes = n_classes
        self.n_layers = n_layers
        
        self.n_final_conv = 64
        
        self.lstm = nn.LSTM(self.n_final_conv, n_hidden, n_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(n_hidden, n_classes)
        
        self.conv_layers = nn.Sequential(
            # Add in channel dimension
            Lambda(lambda x: x.view(x.shape[0], 1, x.shape[1])),
    
            nn.Conv1d(1, 16, kernel_size=9, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv1d(16, 16, kernel_size=9, padding=0),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(16),
            nn.Dropout(0.1),
            
            nn.Conv1d(16, 32, kernel_size=3, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv1d(32, 32, kernel_size=3, padding=0),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(4),
            nn.Dropout(0.1),
            
            nn.Conv1d(32, 64, kernel_size=3, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv1d(64, 64, kernel_size=3, padding=0),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(4),
            nn.Dropout(0.1),
            
            nn.Conv1d(64, self.n_final_conv, kernel_size=3, padding=0),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(8),
        )
        
        
    def forward(self, x):
        out = self.conv_layers(x)
        
        bs, input_size, sequence_length = out.size()
        out = out.view(bs, sequence_length, input_size)
                
        h0 = V(torch.zeros(self.n_layers, bs, self.n_hidden))
        c0 = V(torch.zeros(self.n_layers, bs, self.n_hidden))
        
        out, _ = self.lstm(out, (h0, c0))            
        out = self.fc(out[:, -1, :])
        return out

In [11]:
def mapk_np(preds, targs, k=3):
    preds = np.argsort(-preds, axis=1)[:, :k]
    score = 0.0
    for i in range(k):
        num_hits = (preds[:, i] == targs).sum()
        score += num_hits * (1.0 / (i+1.0))
    score /= preds.shape[0]
    return score

def mapk(preds, targs, k=3):
    return mapk_np(to_np(preds), to_np(targs), k)

In [12]:
model = RawAudioRNN(128, len(labels)).cuda()
model(x1).size()

torch.Size([64, 41])

In [13]:
md = ModelData(PATH, trn_dl, val_dl)
opt = optim.Adam
metrics = [accuracy, mapk]
loss = F.cross_entropy
learn = ConvLearner.from_model_data(model, md, crit=loss, metrics=metrics, opt_fn=opt)

In [14]:
lr = 1e-3
learn.fit(lr, 1, cycle_len=50, use_clr_beta=(5, 25, 0.95, 0.75))

A Jupyter Widget

epoch      trn_loss   val_loss   accuracy   mapk       
    0      3.671593   3.665341   0.037324   0.060329  
    1      3.656525   3.651369   0.035211   0.062207  
    2      3.59219    3.524137   0.08662    0.129108  
    3      3.494444   3.503371   0.059155   0.101995  
    4      3.423718   3.348474   0.11831    0.176878  
    5      3.315465   3.242729   0.148592   0.204577  
    6      3.165086   3.193549   0.137324   0.202347  
    7      3.09327    3.222853   0.139437   0.201761  
    8      3.081697   3.062034   0.167606   0.244601  
    9      3.046067   2.936918   0.197183   0.28439   
    10     2.939611   2.938675   0.195775   0.284155  
    11     2.909037   2.818861   0.214085   0.307864  
    12     2.75819    2.846054   0.20493    0.305516  
    13     2.779248   2.926299   0.195775   0.28392   
    14     2.736981   2.766463   0.230986   0.327347  
    15     2.696654   2.705832   0.238732   0.33838   
    16     2.64717    2.705368   0.244366   0.341901  
    17   

[2.000971834424516, 0.440845070338585, 0.5433098591549295]

In [15]:
learn.save_cycle('1d_rnn_3_layers_nh_128_16KHz', 50)