In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import wavfile

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.transforms import Lambda

from fastai.dataloader import DataLoader
from fastai.dataset import get_cv_idxs, split_by_idx, ArraysIndexDataset, ModelData
from fastai.metrics import accuracy
from fastai.model import fit, predict

from data_loading_utils import load_audio_files, read_file

In [3]:
PATH = Path('data/')
TRAIN_PATH = PATH/'audio_train_16KHz'
TEST_PATH = PATH/'audio_test_16KHz'

sample_rate = 16000
n_seconds = 2
n_samples = sample_rate * n_seconds

In [4]:
train = pd.read_csv(PATH/'train.csv')

labels = sorted(train.label.unique())
label_idx = {label:i for i, label in enumerate(labels)}

x = load_audio_files(TRAIN_PATH, filenames=train.fname)
y = train.label.apply(lambda l: label_idx[l]).values
len(x), len(y)

A Jupyter Widget




(9473, 9473)

In [5]:
print('Label:', labels[y[0]])
ipd.Audio(x[0], rate=sample_rate)

Label: Hi-hat


In [6]:
# from blake
def get_trn_val_split(x, y, val_pct=0.15):
    val_idxs = get_cv_idxs(len(x), val_pct=val_pct)
    if isinstance(x, list):
        return [([arr[i] for i in val_idxs], [arr[i] for i in range(len(arr)) if i not in val_idxs]) for arr in [x,y]]
    else:
        return split_by_idx(val_idxs, x, y)

In [7]:
((val_x, trn_x), (val_y, trn_y)) = get_trn_val_split(x, y, 0.15)

len(trn_x), len(trn_y), len(val_x), len(val_y)

(8053, 8053, 1420, 1420)

In [8]:
def random_subset(x, n):    
    if x.shape[0] > n:
        offset = np.random.randint(x.shape[0] - n)
        return x[offset:offset+n]
    elif x.shape[0] < n:
        pad_total = n - x.shape[0]
        pad_start = np.random.randint(pad_total)
        pad_end = pad_total - pad_start
        return np.pad(x, (pad_start, pad_end), mode='constant') # zeros
    else:
        return x

In [9]:
class RandomOffsetArraysIndexDataset(ArraysIndexDataset):
    def __init__(self, x, y, n_samples, transform=None):
        self.n_samples = n_samples
        assert(len(x)==len(y))
        super().__init__(x, y, transform)
    
    def get_x(self, i):
        return random_subset(self.x[i], self.n_samples)
    
    def get_sz(self):
        return self.n_samples

In [14]:
class Lambda(nn.Module):
    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd
    def forward(self, x):
        return self.lambd(x)

class AudioCNN(nn.Module):
    def __init__(self, n_classes, n_samples):
        super().__init__()
        
        # calculate the first linear input channels based on padding/pooling dims
        first_linear_in = int((int((int((n_samples-8)/16)-4)/4)-4)/4)-4
        
        self.layers = nn.Sequential(
            # Add in channel dimension
            Lambda(lambda x: x.view(x.shape[0], 1, x.shape[1])),
            
            nn.Conv1d(1, 16, kernel_size=9, padding=0),
            nn.ReLU(),
            nn.Conv1d(16, 16, kernel_size=9, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(16),
            nn.Dropout(0.1),

            nn.Conv1d(16, 32, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.Conv1d(32, 32, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(4),
            nn.Dropout(0.1),
            
            nn.Conv1d(32, 32, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.Conv1d(32, 32, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.MaxPool1d(4),
            nn.Dropout(0.1),
            
            nn.Conv1d(32, 256, kernel_size=3, padding=0),
            nn.ReLU(),
            nn.Conv1d(256, 256, kernel_size=3, padding=0),
            nn.ReLU(),

             # GlobalMaxPool
            Lambda(lambda x: torch.max(x, dim=1)[0]),
            nn.Dropout(0.2),
            nn.Linear(first_linear_in, 64),
            nn.ReLU(),
            nn.Linear(64, 1024),
            nn.ReLU(),
            nn.Linear(1024, n_classes)
        )

    def forward(self, x):
        return self.layers(x)

In [15]:
 # ArraysIndexDataset expects np arrays
trn_y, val_y = np.array(trn_y), np.array(val_y)

trn_ds = RandomOffsetArraysIndexDataset(trn_x, trn_y, n_samples)
val_ds = RandomOffsetArraysIndexDataset(val_x, val_y, n_samples)
trn_dl = DataLoader(trn_ds, shuffle=True, batch_size=128)
val_dl = DataLoader(val_ds, shuffle=False, batch_size=128)

In [16]:
x1, y1 = next(iter(trn_dl))
x1.size(), y1.size()

(torch.Size([128, 32000]), torch.Size([128]))

In [17]:
md = ModelData(PATH, trn_dl, val_dl)

model = AudioCNN(len(labels), n_samples).cuda()

opt = optim.Adam(model.parameters())
metrics = [accuracy]
loss = F.cross_entropy

In [18]:
fit(model, md, n_epochs=75, crit=loss, opt=opt, metrics=metrics)

A Jupyter Widget

epoch      trn_loss   val_loss   accuracy   
    0      3.659497   3.660416   0.026761  
    1      3.535071   3.398164   0.100704  
    2      3.350562   3.308102   0.099296  
    3      3.248297   3.233547   0.121831  
    4      3.186578   3.151518   0.138732  
    5      3.158186   3.12538    0.165493  
    6      3.123214   3.158531   0.140845  
    7      3.074801   3.111039   0.155634  
    8      3.063493   3.048566   0.167606  
    9      3.02881    3.105673   0.14507   
    10     3.007717   2.989677   0.174648  
    11     2.998451   3.064284   0.176056  
    12     2.95743    2.988572   0.184507  
    13     2.94193    2.95349    0.20493   
    14     2.925419   2.962125   0.182394  
    15     2.905335   2.954703   0.18662   
    16     2.892765   2.912508   0.183099  
    17     2.869197   2.867941   0.207746  
    18     2.856686   2.874657   0.21831   
    19     2.835774   2.836885   0.223944  
    20     2.815953   2.874961   0.221127  
    21     2.796526   2.843541 

[2.602527507593934, 0.29929577473183755]

In [19]:
test = pd.read_csv(PATH/'sample_submission.csv')
test.head()

Unnamed: 0,fname,label
0,00063640.wav,Laughter Hi-Hat Flute
1,0013a1db.wav,Laughter Hi-Hat Flute
2,002bb878.wav,Laughter Hi-Hat Flute
3,002d392d.wav,Laughter Hi-Hat Flute
4,00326aa9.wav,Laughter Hi-Hat Flute


In [20]:
test_x = load_audio_files(TEST_PATH, filenames=test.fname)
test_y = np.zeros(len(test_x))
len(test_x), len(test_y)

A Jupyter Widget




(9400, 9400)

In [24]:
class AudioArraysIndexDataset(ArraysIndexDataset):
    def __init__(self, x, y, n_samples, transform=None):
        self.n_samples = n_samples
        assert(len(x)==len(y))
        super().__init__(x, y, transform)
    
    def get_x(self, i):
        data = self.x[i]
        if data.shape[0] < self.n_samples:
            data = np.pad(data, (0, self.n_samples-data.shape[0]), 'constant')
        elif data.shape[0] > self.n_samples:
            offset = (data.shape[0] - self.n_samples) // 2
            data = data[offset:offset+self.n_samples]
        return data
    
    def get_sz(self):
        return self.n_samples

In [25]:
test_ds = AudioArraysIndexDataset(test_x, test_y, n_samples)
test_dl = DataLoader(test_ds, shuffle=False, batch_size=128)
predictions = predict(model, test_dl)
predictions.shape

(9400, 41)

In [26]:
# From the fizzbuzz starter kernel
top_3 = np.array(labels)[np.argsort(-predictions, axis=1)[:, :3]]
predicted_labels = [' '.join(list(x)) for x in top_3]
test.label = predicted_labels
test.to_csv('fixed_1d_conv_2seconds.csv', index=False)