In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load (or reload) the autoreload extension and set mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [2]:
from itertools import islice
import IPython.display as ipd
from pathlib import Path
import torch
import librosa
import numpy as np
import pandas as pd

In [11]:
from fastai import *
from fastai.conv_learner import *
from fastai.core import *
from fastai.metrics import accuracy, accuracy_np
from fastai.text import SortSampler, SortishSampler
from data_loading_utils import read_file
from preprocessing_utils import load_features
from helpers import *
from metrics import *
from models import *

In [3]:
from data_loading_utils import *

In [4]:
# Root directory of the dataset; every other location below is a child of it.
PATH = Path('data/audioset')

# Directories named after the raw train / eval segments.
TRAIN_PATH = PATH.joinpath('train_segments_mono')
VALID_PATH = PATH.joinpath('eval_segments_mono')

# Directories of the pre-computed feature files that the datasets read from.
# NOTE(review): the numeric suffix presumably encodes the feature-extraction
# parameters (sample rate, FFT size, hop, mel bins) - confirm against the
# preprocessing script.
TRAIN_PATH_LMS = PATH.joinpath('train_22500_1024_256_128')
VALID_PATH_LMS = PATH.joinpath('valid_22500_1024_256_128')

# Per-segment label CSVs for the train and eval splits.
TRAIN_LABELS_CSV = PATH.joinpath('train_segments_cl.csv')
VALID_LABELS_CSV = PATH.joinpath('eval_segments_cl.csv')

# CSV mapping each label id (`mid`) to its `display_name`.
LABELS_CSV = PATH.joinpath('class_labels_indices.csv')

In [5]:
# Read the train and validation tables (train first, then valid) from the dataset root.
# Each row carries the comma-separated `positive_labels` and the segment's `fname`.
train, valid = (pd.read_csv(PATH/csv_name) for csv_name in ('train.csv', 'valid.csv'))
train.head()

Unnamed: 0,YTID,positive_labels,fname
0,--ZhevVpy1s,/m/012xff,--ZhevVpy1s_50.000.wav
1,--aE2O5G5WE,"/m/03fwl,/m/04rlf,/m/09x0r",--aE2O5G5WE_0.000.wav
2,--aO5cdqSAg,"/t/dd00003,/t/dd00005",--aO5cdqSAg_30.000.wav
3,--aaILOrkII,"/m/032s66,/m/073cg4",--aaILOrkII_200.000.wav
4,--cB2ZVjpnA,/m/01y3hg,--cB2ZVjpnA_30.000.wav


In [6]:
# Label metadata indexed by label id (`mid`), keeping only the human-readable name.
label_df = pd.read_csv(LABELS_CSV, usecols=['mid', 'display_name'], index_col='mid')
# Sorting the ids fixes a deterministic column order for the multi-hot targets.
label_ids = sorted(label_df.index)
# label id -> column index in the target matrix.
label_id_toi = dict(zip(label_ids, range(len(label_ids))))
# Display names in the same order as the target columns.
labels = [label_df.loc[label_id].display_name for label_id in label_ids]

In [7]:
def get_y(df):
    """Build the multi-hot target matrix for a table of labelled segments.

    Args:
        df: table with a ``positive_labels`` column whose entries are
            comma-separated label ids (keys of the global ``label_id_toi``).

    Returns:
        Float NumPy array of shape ``(len(df), len(labels))`` holding 1 in every
        column whose label id is listed for that row and 0 elsewhere.

    Raises:
        KeyError: if a row mentions a label id missing from ``label_id_toi``.
    """
    joined_labels = df['positive_labels']
    y = np.zeros((len(joined_labels), len(labels)))
    for row, joined in enumerate(joined_labels):
        # Translate this row's label ids into column indices and switch them on.
        cols = [label_id_toi[label_id] for label_id in joined.split(',')]
        y[row, cols] = 1
    return y

In [8]:
# Multi-hot targets for the training table; the shape is (rows in `train`, number of labels).
train_y = get_y(train)
train_y.shape

(18725, 527)

In [9]:
# Multi-hot targets for the validation table; the shape is (rows in `valid`, number of labels).
valid_y = get_y(valid)
valid_y.shape

(17492, 527)

In [12]:
# Constants unpacked positionally into Normalize.
# NOTE(review): presumably the (mean, std) of the training features - confirm against Normalize.
stats = (-26.88621199474663, 19.561070532225614)

norm = Normalize(*stats)
# Augmentations; in this cell they are only referenced by the commented-out pipeline below.
shift = RandomPitchTimeShift(min_x=1.0, max_x=1.0, max_y=1.2)
light = RandomLight()

# NOTE: using these augmentations showed a slight improvement at one point, and
# combining them with TTA also gave a slight bump when making predictions.
# trn_tfms = Transforms([light, shift, norm])
# Training and validation currently share the same normalisation-only pipeline.
trn_tfms = Transforms([norm])
val_tfms = Transforms([norm])

In [33]:
# Number of model outputs: one per entry in `labels`.
num_classes = len(labels)
opt = optim.Adam
metrics = [recall, precision, f1]
# NOTE(review): binary_cross_entropy_with_logits applies a sigmoid to its input, yet
# AudioCNN already ends in nn.Sigmoid, so the sigmoid is applied twice when the loss is
# computed. Left as is, presumably to match how the loaded checkpoint was trained - confirm
# before training from scratch.
loss = F.binary_cross_entropy_with_logits

# TODO: switch back to these when fine tuning
# metrics = [accuracy, mapk]
# loss = F.cross_entropy

In [14]:
class AudioSetFilesDataset(BaseDataset):
    """Dataset that serves pre-computed ``.npy`` feature arrays with their label rows.

    Args:
        path: directory holding the ``.npy`` files.
        fnames: per-sample file names, indexable by integer; same length as ``y``.
        y: label array; its second dimension (if any) is reported as the class count.
        use_tfms: when true, every read picks one of ten suffixed variants
            (``_0`` .. ``_9``) of the file at random.
        transform: forwarded unchanged to ``BaseDataset.__init__``.
    """

    def __init__(self, path, fnames, y, use_tfms=False, transform=None):
        # Attributes are set before calling the base initialiser.
        # NOTE(review): presumably BaseDataset.__init__ calls the getters below - confirm.
        self.path, self.fnames, self.y = Path(path), fnames, y
        self.use_tfms = use_tfms
        assert len(fnames) == len(y)
        super().__init__(transform)

    def get_x(self, i):
        """Load and return the feature array stored for sample ``i``."""
        stem = self.fnames[i]
        if self.use_tfms:
            # Drop the last four characters of the name and splice in a random variant id.
            variant = np.random.randint(10)
            stem = f'{stem[:-4]}_{variant}.wav'
        return np.load(self.path/f'{stem}.npy')

    def get_y(self, i):
        """Return the label row for sample ``i``."""
        return self.y[i]

    def get_n(self):
        """Return the number of samples."""
        return len(self.y)

    def get_c(self):
        """Return the number of label columns, or 0 when ``y`` is one-dimensional."""
        if len(self.y.shape) > 1:
            return self.y.shape[1]
        return 0

    def get_sz(self):
        """Return the first dimension of the first sample's feature array."""
        first_sample = self.get_x(0)
        return first_sample.shape[0]

    @property
    def is_multi(self):
        """Always true: the dataset is declared multi-label."""
        return True

In [16]:
# Mini-batch size shared by the training and validation loaders.
bs = 4

train_fname = train.fname
valid_fname = valid.fname

# Training data: pre-computed features from TRAIN_PATH_LMS paired with the multi-hot targets.
trn_ds = AudioSetFilesDataset(TRAIN_PATH_LMS, train_fname, train_y, transform=trn_tfms)
# The sampler key is the second dimension of each sample's x array, so batches group
# samples of similar size. Evaluating the key indexes the dataset, which presumably
# loads the sample from disk - expect a slow first pass.
trn_dl = AudioDataLoader2d(trn_ds,
                           batch_size=bs,
                           sampler=SortishSampler(trn_ds, key=lambda x: trn_ds[x][0].shape[1], bs=bs))

# Validation data: same construction, but with SortSampler and the validation transforms.
val_ds = AudioSetFilesDataset(VALID_PATH_LMS, valid_fname, valid_y, transform=val_tfms)
val_dl = AudioDataLoader2d(val_ds,
                           batch_size=bs,
                           sampler=SortSampler(val_ds, key=lambda x: val_ds[x][0].shape[1]))

In [17]:
def conv_block(in_channels, out_channels, kernel_size=3, stride=1, padding=1):
    """Return a ``Conv2d -> BatchNorm2d -> ReLU`` unit wrapped in ``nn.Sequential``.

    Args:
        in_channels: number of channels of the incoming feature map.
        out_channels: number of channels produced by the convolution.
        kernel_size: convolution kernel size (default 3).
        stride: convolution stride (default 1).
        padding: zero-padding added on each side (default 1).

    Returns:
        ``nn.Sequential`` with the three modules in that order.
    """
    return nn.Sequential(
        # The convolution has no bias term; BatchNorm2d supplies its own shift.
        nn.Conv2d(in_channels, out_channels,
                  kernel_size=kernel_size, stride=stride, padding=padding, bias=False),
        nn.BatchNorm2d(out_channels, momentum=0.01),
        nn.ReLU(),
    )

class AudioCNN(nn.Module):
    """Convolutional classifier that maps a 3-D input batch to per-class sigmoid scores.

    The network adds a channel axis, applies five stages of two conv blocks plus
    2x2 max-pooling (16 to 256 filters), two further conv blocks (512 and 1024
    filters), global average pooling, and a linear layer followed by a sigmoid.

    The order of the modules in ``self.layers`` determines the ``state_dict`` keys,
    so it must not change if previously saved checkpoints are to keep loading.

    Args:
        n_classes: number of output units of the final linear layer.
    """

    def __init__(self, n_classes):
        super().__init__()
        # Insert a singleton channel axis: (batch, d1, d2) -> (batch, 1, d1, d2).
        layers = [Lambda(lambda x: x.view(x.shape[0], 1, x.shape[1], x.shape[2]))]

        # B1 to B5: two 3x3 conv blocks followed by 2x2 max-pooling per stage.
        in_channels = 1
        num_filters = [16, 32, 64, 128, 256]
        for out_channels in num_filters:
            layers += [conv_block(in_channels,  out_channels, kernel_size=3, padding=1),
                       conv_block(out_channels, out_channels, kernel_size=3, padding=1),
                       nn.MaxPool2d(2)]
            in_channels = out_channels

        # B6: one more 3x3 conv block, without pooling.
        layers += [conv_block(256, 512, kernel_size=3, padding=1)]

        # F1: 2x2 conv block without padding.
        layers += [conv_block(512, 1024, kernel_size=2, padding=0)]

        # F2: global average pooling and the classification head.
        layers += [
            nn.AdaptiveAvgPool2d(1),
            Flatten(),
            # Size the head from the constructor argument. The original read the
            # notebook-level global `num_classes` here, which ignored `n_classes`
            # and raised NameError whenever that global was not defined.
            nn.Linear(1024, n_classes),
            # NOTE(review): the outputs are probabilities, not logits. Pairing this
            # model with a "with_logits" loss applies the sigmoid twice - confirm
            # this is intended before retraining.
            nn.Sigmoid()
        ]

        self.layers = nn.Sequential(*layers)
        self.n_classes = n_classes

    def forward(self, x):
        """Run ``x`` through the layer stack and return a ``(batch, n_classes)`` tensor."""
        return self.layers(x)

In [18]:
# Bundle the two loaders into a ModelData, build the network on the GPU and wrap both
# in a learner together with the loss, metrics and optimiser chosen above.
md = ModelData(PATH, trn_dl, val_dl)
model = AudioCNN(num_classes).cuda()
learn = ConvLearner.from_model_data(model, md, crit=loss, metrics=metrics, opt_fn=opt)

In [19]:
# Restore previously saved weights into the learner's model.
# NOTE(review): the arguments are presumably the checkpoint name and the cycle index - confirm.
learn.load_cycle('Audioset_CNN_1024_256_128_modified', 50)

In [28]:
# Run the model and collect its outputs together with the matching targets.
# NOTE(review): presumably this iterates over the validation loader - confirm.
# NOTE(review): despite the `log_preds` name, AudioCNN ends in nn.Sigmoid, so these
# values look like probabilities rather than log-probabilities - confirm.
val_log_preds, val_targs = learn.predict_with_targs()

In [30]:
# Convert the NumPy outputs of predict_with_targs() into tensors for the metric functions.
# The isinstance guards make this cell idempotent: it rebinds the same names, and
# torch.from_numpy raises TypeError when handed a tensor, which is exactly what a
# second run of the unguarded cell would do.
if isinstance(val_log_preds, np.ndarray):
    val_log_preds = torch.from_numpy(val_log_preds)
if isinstance(val_targs, np.ndarray):
    val_targs = torch.from_numpy(val_targs)

In [32]:
# Report each validation metric on its own line, in the order recall, precision, F1.
for title, metric_fn in (("Recall: ", recall), ("Precision: ", precision), ("F1: ", f1)):
    print(title, metric_fn(val_log_preds, val_targs))

Recall:  0.27513024650984463
Precision:  0.48190724500276527
F1:  0.3502792413822806
