In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# (Re)load the autoreload extension; mode 2 re-imports modules before each cell
# executes, so edits to the local helper modules are picked up without a restart.
%reload_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import IPython.display as ipd
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.io import wavfile

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision.transforms import Lambda

from fastai.dataloader import DataLoader
from fastai.dataset import get_cv_idxs, split_by_idx, ArraysIndexDataset, ModelData
from fastai.metrics import accuracy
from fastai.model import fit, predict

from data_loading_utils import load_audio_files, read_file
from preprocessing_utils import load_features

In [3]:
# Dataset locations, all rooted at the project data directory.
PATH = Path('data/')
TRAIN_PATH, TEST_PATH = PATH / 'audio_train', PATH / 'audio_test'

# Feature-extraction and model-input settings.
sample_rate = 44100
n_fft = 1024
n_features = 80
n_segments = 220  # approx 2.5 seconds

In [5]:
# Training metadata plus a stable label -> integer index mapping
# (labels are sorted so the mapping does not depend on CSV row order).
train = pd.read_csv(PATH / 'train.csv')
labels = sorted(train.label.unique())
label_idx = dict(zip(labels, range(len(labels))))

# One feature array per training file, and the integer-encoded target per row.
x = load_features(
    TRAIN_PATH,
    filenames=train.fname,
    feature_name='log_mel_spec',
    n_fft=n_fft,
    n_features=n_features,
)
y = train.label.apply(lambda name: label_idx[name]).values
len(x), len(y)

Loading audio files...


A Jupyter Widget


Computing log_mel_spec features..


A Jupyter Widget


Saving data..
Loaded features for 9473 files


(9473, 9473)

In [6]:
# Shape of the first feature array and of the label vector.
np.shape(x[0]), np.shape(y)

((80, 1206), (9473,))

In [7]:
# from blake
def get_trn_val_split(x, y, val_pct=0.15):
    """Split features and targets into validation and training parts.

    Args:
        x: Per-example features; either a Python list or an object that
            ``split_by_idx`` can split.
        y: Per-example targets, indexable by the same positions as ``x``.
        val_pct: Fraction forwarded to ``get_cv_idxs`` to size the validation set.

    Returns:
        Two ``(validation, training)`` pairs: the first for ``x``, the second
        for ``y``. For list input every part is a plain Python list.
    """
    val_idxs = get_cv_idxs(len(x), val_pct=val_pct)
    if isinstance(x, list):
        # Set membership is O(1); testing `i not in val_idxs` against the index
        # sequence itself rescans it for every element (quadratic overall).
        held_out = {int(i) for i in val_idxs}
        return [
            (
                [arr[i] for i in val_idxs],
                [arr[i] for i in range(len(arr)) if i not in held_out],
            )
            for arr in [x, y]
        ]
    return split_by_idx(val_idxs, x, y)

In [8]:
# The helper returns (validation, training) pairs: features first, labels second.
(val_x, trn_x), (val_y, trn_y) = get_trn_val_split(x, y, val_pct=0.15)

tuple(len(part) for part in (trn_x, trn_y, val_x, val_y))

(8053, 8053, 1420, 1420)

In [9]:
def random_subset2d(x, n):
    """Return ``x`` cropped or zero-padded along axis 0 to exactly ``n`` rows.

    Args:
        x: 2-D NumPy array.
        n: Required number of rows in the result.

    Returns:
        * more than ``n`` rows: a randomly positioned window of ``n``
          consecutive rows (a slice of ``x``);
        * fewer than ``n`` rows: a new array with ``x`` placed at a random
          offset and zero rows before/after it;
        * exactly ``n`` rows: ``x`` itself, unchanged.

    Randomness comes from NumPy's global ``np.random`` state.
    """
    n_rows = x.shape[0]
    if n_rows > n:
        # `randint(high)` excludes `high`; the +1 makes the final window
        # (offset == n_rows - n) reachable as well.
        offset = np.random.randint(n_rows - n + 1)
        return x[offset:offset + n]
    if n_rows < n:
        pad_total = n - n_rows
        # +1 so that all the padding may also end up in front of the data.
        pad_start = np.random.randint(pad_total + 1)
        pad_end = pad_total - pad_start
        return np.pad(x, ((pad_start, pad_end), (0, 0)), mode='constant')  # zeros
    return x

In [10]:
class RandomOffsetArraysIndexDataset(ArraysIndexDataset):
    """Dataset that serves a random fixed-length window of each item.

    Every access to an item transposes the stored array and passes it through
    ``random_subset2d``, so the same index can yield a different crop (or
    padding placement) each time.

    Args:
        x: Sequence of 2-D arrays, one per example.
        y: Targets; must have the same length as ``x``.
        n_segments: Number of rows each returned item has after transposition.
        transform: Forwarded unchanged to ``ArraysIndexDataset``.
    """

    def __init__(self, x, y, n_segments, transform=None):
        # Assigned before the base constructor runs, in case that constructor
        # queries get_sz() -- TODO confirm against ArraysIndexDataset.
        self.n_segments = n_segments
        assert len(x) == len(y)
        super().__init__(x, y, transform)

    def get_x(self, i):
        """Return a random ``n_segments``-row window of item ``i`` (transposed)."""
        return random_subset2d(self.x[i].T, self.n_segments)

    def get_sz(self):
        """Return the fixed number of rows per item."""
        return self.n_segments

In [11]:
def conv_block(n_in, n_out, kernel_size=7, dropout=0.2):
    """Build a Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d(2) -> Dropout stage.

    Args:
        n_in: Number of input channels.
        n_out: Number of output channels.
        kernel_size: Convolution kernel size, an int or a ``(kh, kw)`` pair.
            Padding is ``k // 2`` per axis, which keeps the spatial size
            unchanged through the convolution for odd kernel sizes.
        dropout: Dropout probability applied after pooling.

    Returns:
        An ``nn.Sequential`` with the five layers in the order listed above.
        The defaults reproduce the original fixed 7x7 / 0.2 configuration, so
        layer indices (and therefore state-dict keys) are unchanged.
    """
    if isinstance(kernel_size, int):
        kernel_size = (kernel_size, kernel_size)
    kernel_size = tuple(kernel_size)
    padding = tuple(k // 2 for k in kernel_size)
    return nn.Sequential(
        nn.Conv2d(n_in, n_out, kernel_size=kernel_size, padding=padding),
        nn.BatchNorm2d(n_out),
        nn.ReLU(),
        nn.MaxPool2d(2),
        nn.Dropout(dropout),
    )

class Lambda(nn.Module):
    """Module wrapper that applies an arbitrary callable in ``forward``.

    Lets a plain function (e.g. a reshape) be used as a layer inside
    ``nn.Sequential``.

    Note: this definition shadows the ``Lambda`` imported from
    ``torchvision.transforms`` at the top of the notebook.

    Args:
        lambd: Callable invoked with the module input; its result is returned.
    """

    def __init__(self, lambd):
        super(Lambda, self).__init__()
        self.lambd = lambd

    def forward(self, x):
        fn = self.lambd
        return fn(x)

class AudioCNN_MFCC(nn.Module):
    """Four-stage 2-D CNN classifier over fixed-size 2-D feature maps.

    A 3-D input batch is given a singleton channel axis, passed through four
    ``conv_block`` stages (1 -> 32 -> 32 -> 64 -> 64 channels), flattened, and
    classified by a 256-unit fully connected head.

    Args:
        n_classes: Number of output logits.
        n_segments: Size of one spatial axis of the input.
        n_features: Size of the other spatial axis of the input.
    """

    def __init__(self, n_classes, n_segments, n_features):
        super().__init__()

        # Each conv_block in this notebook ends in MaxPool2d(2); four of them
        # shrink every spatial axis by a (floored) factor of 16, and the last
        # stage emits 64 channels.
        flat_size = 64 * (n_features // 16) * (n_segments // 16)

        def add_channel_axis(batch):
            # (B, H, W) -> (B, 1, H, W)
            return batch.view(batch.shape[0], 1, batch.shape[1], batch.shape[2])

        def flatten(batch):
            # (B, C, H, W) -> (B, C*H*W)
            return batch.view(batch.shape[0], -1)

        # Built in the same order as before so layer indices (and state-dict
        # keys) are unchanged.
        stages = [
            Lambda(add_channel_axis),
            conv_block(1, 32),
            conv_block(32, 32),
            conv_block(32, 64),
            conv_block(64, 64),
            Lambda(flatten),
            nn.Linear(flat_size, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, n_classes),
        ]
        self.layers = nn.Sequential(*stages)

    def forward(self, x):
        """Return class logits of shape ``(batch, n_classes)``."""
        return self.layers(x)

In [12]:
# ArraysIndexDataset expects NumPy arrays
trn_y = np.array(trn_y)
val_y = np.array(val_y)

# NOTE(review): the validation set also uses random crops here, so validation
# metrics vary from run to run for the same weights.
trn_ds, val_ds = (
    RandomOffsetArraysIndexDataset(feats, targets, n_segments)
    for feats, targets in ((trn_x, trn_y), (val_x, val_y))
)
trn_dl = DataLoader(trn_ds, batch_size=16, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=16, shuffle=False)

In [13]:
# Pull a single training batch to sanity-check the tensor shapes.
(x1, y1) = next(iter(trn_dl))
x1.shape, y1.shape

(torch.Size([16, 220, 80]), torch.Size([16]))

In [14]:
# One output logit per label; the network is moved to the GPU before the
# optimiser collects its parameters.
model = AudioCNN_MFCC(
    n_classes=len(labels),
    n_segments=n_segments,
    n_features=n_features,
).cuda()

md = ModelData(PATH, trn_dl, val_dl)
opt = optim.Adam(model.parameters())
loss = F.cross_entropy
metrics = [accuracy]

In [15]:
# First training stage: 50 epochs with the Adam optimiser configured above.
fit(model, md, n_epochs=50, opt=opt, crit=loss, metrics=metrics)

A Jupyter Widget

epoch      trn_loss   val_loss   accuracy   
    0      3.143239   3.188856   0.108451  
    1      2.668115   2.476378   0.309155  
    2      2.460484   2.666632   0.258451  
    3      2.343896   2.324785   0.345775  
    4      2.192679   2.150116   0.410563  
    5      2.058338   1.976179   0.455634  
    6      2.500651   2.442777   0.31831   
    7      1.952349   1.905239   0.472535  
    8      1.801051   1.823379   0.506338  
    9      1.687673   1.863748   0.48169   
    10     1.730925   1.926121   0.475352  
    11     1.735418   1.680195   0.516901  
    12     1.71097    1.681556   0.542958  
    13     1.871553   1.854797   0.497183  
    14     1.534922   1.677012   0.544366  
    15     1.585034   1.698259   0.540141  
    16     1.51044    1.639519   0.55493   
    17     1.628853   2.084307   0.441549  
    18     1.448779   1.664785   0.542958  
    19     1.909282   2.418951   0.358451  
    20     1.550798   1.640004   0.552817  
    21     1.497575   1.599612 

[1.459698949061649, 0.6133802816901408]

In [17]:
# Persist only the parameters (state dict), not the pickled module.
# NOTE(review): this cell's execution count (17) is higher than that of the
# fine-tuning cell below (16), so as executed it saved the fine-tuned weights;
# on a top-to-bottom run it saves the weights from before fine-tuning.
torch.save(model.state_dict(), f='conv2d_1.w')

In [16]:
# Second training stage: a fresh Adam optimiser with an explicit learning
# rate of 1e-4, run for 25 more epochs on the same model.
opt = optim.Adam(params=model.parameters(), lr=1e-4)
fit(model, md, n_epochs=25, opt=opt, crit=loss, metrics=metrics)

A Jupyter Widget

epoch      trn_loss   val_loss   accuracy   
    0      0.857896   1.393606   0.628873  
    1      0.884294   1.391728   0.65      
    2      0.87205    1.389483   0.634507  
    3      0.916998   1.350283   0.647183  
    4      0.831819   1.420194   0.640845  
    5      0.823298   1.346529   0.646479  
    6      0.848156   1.378234   0.657042  
    7      0.843087   1.36628    0.65493   
    8      0.884747   1.35158    0.648592  
    9      0.832478   1.353926   0.649296  
    10     0.869389   1.369114   0.66338   
    11     0.828256   1.378637   0.65493   
    12     0.80068    1.393751   0.643662  
    13     0.883887   1.393557   0.647887  
    14     0.83453    1.368734   0.65493   
    15     0.721828   1.361386   0.659859  
    16     0.832753   1.370942   0.656338  
    17     0.773136   1.381003   0.655634  
    18     0.830521   1.355519   0.664085  
    19     0.844764   1.401605   0.651408  
    20     0.791343   1.365795   0.660563  
    21     0.849177   1.386019 

[1.3811815940158467, 0.6429577464788733]

In [18]:
# The sample submission supplies the test file names.
test = pd.read_csv(PATH / 'sample_submission.csv')
test_x = load_features(
    TEST_PATH,
    filenames=test.fname,
    feature_name='log_mel_spec',
    n_fft=n_fft,
    n_features=n_features,
)
# Placeholder targets: the dataset class requires a y of the same length as x.
test_y = np.zeros(len(test_x))
len(test_x), len(test_y)

Loading cached data..
Loaded features for 9400 files


(9400, 9400)

In [19]:
# Takes the middle n_segments frames (approx 2.5 seconds) of each audio file to run the model on; shorter files are zero-padded at the end
class AudioArraysIndexDataset(ArraysIndexDataset):
    """Dataset that serves a deterministic fixed-length view of each item.

    Each stored array is transposed; items with more than ``n_segments`` rows
    are centre-cropped, items with fewer rows are zero-padded at the end, and
    items with exactly ``n_segments`` rows are returned as they are.

    Args:
        x: Sequence of 2-D arrays, one per example.
        y: Targets; must have the same length as ``x``.
        n_segments: Number of rows each returned item has.
        transform: Forwarded unchanged to ``ArraysIndexDataset``.
    """

    def __init__(self, x, y, n_segments, transform=None):
        # Assigned before the base constructor runs, in case that constructor
        # queries get_sz() -- TODO confirm against ArraysIndexDataset.
        self.n_segments = n_segments
        assert len(x) == len(y)
        super().__init__(x, y, transform)

    def get_x(self, i):
        """Return item ``i`` transposed and fitted to ``n_segments`` rows."""
        data = self.x[i].T
        surplus = data.shape[0] - self.n_segments
        if surplus < 0:
            # Too short: append zero rows after the data.
            data = np.pad(data, ((0, -surplus), (0, 0)), 'constant')
        elif surplus > 0:
            # Too long: keep the central window.
            start = surplus // 2
            data = data[start:start + self.n_segments]
        return data

    def get_sz(self):
        """Return the fixed number of rows per item."""
        return self.n_segments

In [21]:
# Unshuffled so that prediction rows stay aligned with the rows of `test`.
test_ds = AudioArraysIndexDataset(test_x, test_y, n_segments)
test_dl = DataLoader(test_ds, batch_size=16, shuffle=False)
next(iter(test_dl))[0].size()

torch.Size([16, 220, 80])

In [22]:
# One row of class scores per test clip.
predictions = predict(model, test_dl)
np.shape(predictions)

(9400, 41)

In [23]:
# From the fizzbuzz starter kernel
# Sort classes by descending score per row and keep the three best labels.
top_3 = np.array(labels)[np.argsort(-predictions, axis=1)[:, :3]]
predicted_labels = [' '.join(names) for names in top_3]
# Item assignment always writes a DataFrame column; attribute assignment
# (`test.label = ...`) only does so when a `label` column already exists.
test['label'] = predicted_labels
test.to_csv('fixed_2d_conv_log_mel_spec.csv', index=False)