In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

In [None]:
from itertools import islice
import IPython.display as ipd
from pathlib import Path
import torch
import librosa
import numpy as np
import pandas as pd

In [3]:
from fastai import *
from fastai.conv_learner import *
from fastai.core import *
from fastai.metrics import *
from fastai.text import SortSampler, SortishSampler
from data_loading_utils import read_file
from preprocessing_utils import load_features
from helpers import *
from metrics import *
from models import *

In [4]:
from data_loading_utils import *

In [5]:
PATH = Path('data/audioset')

TRAIN_PATH = PATH/'train_segments_mono'
VALID_PATH = PATH/'eval_segments_mono'

TRAIN_PATH_LMS = PATH/'train_22500_1024_256_128'
VALID_PATH_LMS = PATH/'valid_22500_1024_256_128'

TRAIN_LABELS_CSV = PATH/'train_segments_cl.csv'
VALID_LABELS_CSV = PATH/'eval_segments_cl.csv'

LABELS_CSV = PATH/'class_labels_indices.csv'

In [6]:
train = pd.read_csv(TRAIN_LABELS_CSV, sep=' ', usecols=[0,3])
train.head()

Unnamed: 0,YTID,positive_labels
0,--ZhevVpy1s,/m/012xff
1,--aE2O5G5WE,"/m/03fwl,/m/04rlf,/m/09x0r"
2,--aO5cdqSAg,"/t/dd00003,/t/dd00005"
3,--aaILOrkII,"/m/032s66,/m/073cg4"
4,--cB2ZVjpnA,/m/01y3hg


In [7]:
valid = pd.read_csv(VALID_LABELS_CSV, sep=' ', usecols=[0,3])
valid.head()

Unnamed: 0,YTID,positive_labels
0,--4gqARaEJE,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk"
1,--BfvyPmVMo,/m/03l9g
2,--U7joUcTCo,/m/01b_21
3,--i-y1v8Hy8,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005"
4,-0BIyqJj9ZU,"/m/07rgt08,/m/07sq110,/t/dd00001"


In [8]:
label_df = pd.read_csv(LABELS_CSV, index_col='mid', usecols=['mid', 'display_name'])
label_df.head()

Unnamed: 0_level_0,display_name
mid,Unnamed: 1_level_1
/m/09x0r,Speech
/m/05zppz,"Male speech, man speaking"
/m/02zsn,"Female speech, woman speaking"
/m/0ytgt,"Child speech, kid speaking"
/m/01h8n0,Conversation


In [9]:
train.shape, valid.shape, label_df.shape,

((18725, 2), (17492, 2), (527, 1))

In [10]:
train_fnames = list(fname.name for fname in TRAIN_PATH.iterdir())
valid_fnames = list(fname.name for fname in VALID_PATH.iterdir())
len(train_fnames), len(valid_fnames)

(18725, 17492)

In [11]:
def add_fnames_to_df(df, path):
    fnames = list(fname.name for fname in path.iterdir())
    fns = pd.DataFrame({'fname': fnames})
    fns['YTID'] = fns.fname.apply(lambda fname: fname[:11])
    df = df.merge(fns)
    return df

In [12]:
train = add_fnames_to_df(train, TRAIN_PATH)
train.head()

Unnamed: 0,YTID,positive_labels,fname
0,--ZhevVpy1s,/m/012xff,--ZhevVpy1s_50.000.wav
1,--aE2O5G5WE,"/m/03fwl,/m/04rlf,/m/09x0r",--aE2O5G5WE_0.000.wav
2,--aO5cdqSAg,"/t/dd00003,/t/dd00005",--aO5cdqSAg_30.000.wav
3,--aaILOrkII,"/m/032s66,/m/073cg4",--aaILOrkII_200.000.wav
4,--cB2ZVjpnA,/m/01y3hg,--cB2ZVjpnA_30.000.wav


In [13]:
valid = add_fnames_to_df(valid, VALID_PATH)
valid.head()

Unnamed: 0,YTID,positive_labels,fname
0,--4gqARaEJE,"/m/068hy,/m/07q6cd_,/m/0bt9lr,/m/0jbk",--4gqARaEJE_0.000.wav
1,--BfvyPmVMo,/m/03l9g,--BfvyPmVMo_20.000.wav
2,--U7joUcTCo,/m/01b_21,--U7joUcTCo_0.000.wav
3,--i-y1v8Hy8,"/m/04rlf,/m/09x0r,/t/dd00004,/t/dd00005",--i-y1v8Hy8_0.000.wav
4,-0BIyqJj9ZU,"/m/07rgt08,/m/07sq110,/t/dd00001",-0BIyqJj9ZU_30.000.wav


In [14]:
label_ids = sorted(label_df.index)
label_id_toi = {label_id:i for i, label_id in enumerate(label_ids)}
labels = [label_df.loc[label_ids[i]].display_name for i in range(len(label_ids))]

In [15]:
idx = 106
row = train.iloc[idx]

row_label_ids = row.positive_labels.split(',')
row_label_idxs = [label_id_toi[mid] for mid in row_label_ids]
row_labels = [labels[i] for i in row_label_idxs]

print(row_label_ids)
print(row_label_idxs)
print(row_labels)

file, sr = read_file(row.fname, path=TRAIN_PATH)
print(file.shape, sr)

['/m/015lz1', '/m/04rlf', '/t/dd00004']
[16, 191, 496]
['Singing', 'Music', 'Female singing']
(220500,) 22050


In [16]:
ipd.Audio(file, rate=sr)

In [17]:
def get_y(df):
    df_labels = df['positive_labels']
    y = np.zeros((len(df_labels), len(labels)))
    for i, idxs in enumerate(df_labels.apply(lambda row_labels: [label_id_toi[label]
                                                                 for label in row_labels.split(',')])):
        y[i, idxs] = 1
    return y

In [18]:
train_y = get_y(train)
train_y.shape

(18725, 527)

In [19]:
valid_y = get_y(valid)
valid_y.shape

(17492, 527)

In [20]:
len(train.iloc[1].positive_labels.split(',')) == train_y[1].sum()

True

In [21]:
len(valid.iloc[100].positive_labels.split(',')) == valid_y[100].sum()

True

In [22]:
# train.to_csv(PATH/'train.csv')
# valid.to_csv(PATH/'valid.csv')

### Train Model

In [23]:
stats = (-26.88621199474663, 19.561070532225614)

norm = Normalize(*stats)
shift = RandomPitchTimeShift(min_x=1.0, max_x=1.0, max_y=1.2)
light = RandomLight()

# trn_tfms = Transforms([light, shift, norm])
trn_tfms = Transforms([norm])
val_tfms = Transforms([norm])

In [24]:
train_fname = train.fname
valid_fname = valid.fname

In [25]:
num_classes = len(labels)
opt = optim.Adam
# metrics = [accuracy, mapk]
metrics = [recall, precision]
loss = F.binary_cross_entropy_with_logits

In [26]:
class AudioSetFilesDataset(BaseDataset):
    def __init__(self, path, fnames, y, use_tfms=False, transform=None):
        self.path = Path(path)
        self.fnames = fnames
        self.y = y
        self.use_tfms = use_tfms
        assert len(fnames) == len(y)
        super().__init__(transform)
    def get_x(self, i):
        fname = self.fnames[i]
        # These transforms require pretransformed audio files
        # This did not significantly improve performance
        if self.use_tfms:
            fname = f'{fname[:-4]}_{np.random.randint(10)}.wav'
        fname = self.path/f'{fname}.npy'
        return np.load(fname)
    def get_y(self, i):
        return self.y[i]
    def get_n(self):
        return len(self.y)
    def get_c(self):
        return self.y.shape[1] if len(self.y.shape)>1 else 0
    def get_sz(self):
        return self.get_x(0).shape[0]
    @property
    def is_multi(self): return True

In [28]:
# Copied from fastai/metrics.py and fixed to avoid division by zero bug
eps = np.finfo(float).eps
eps

def recall(preds, targs, thresh=0.5):
    pred_pos = preds > thresh
    tpos = torch.mul((targs.byte() == pred_pos), targs.byte())
    return tpos.sum()/(targs.sum() + eps)

def precision(preds, targs, thresh=0.5):
    pred_pos = preds > thresh
    tpos = torch.mul((targs.byte() == pred_pos), targs.byte())
    return tpos.sum()/(pred_pos.sum() + eps)

In [29]:
bs = 16

trn_ds = AudioSetFilesDataset(TRAIN_PATH_LMS, train_fname, train_y, transform=trn_tfms)
trn_dl = AudioDataLoader2d(trn_ds,
                           batch_size=bs,
                           sampler=SortishSampler(trn_ds, key=lambda x: trn_ds[x][0].shape[1], bs=bs))

val_ds = AudioSetFilesDataset(VALID_PATH_LMS, valid_fname, valid_y, transform=val_tfms)
val_dl = AudioDataLoader2d(val_ds,
                           batch_size=bs,
                           sampler=SortSampler(val_ds, key=lambda x: val_ds[x][0].shape[1]))

In [30]:
md = ModelData(PATH, trn_dl, val_dl)
model = AudioResNet(BasicBlock, [5, 5, 5, 5], num_classes=num_classes).cuda()
learn = ConvLearner.from_model_data(model, md, crit=loss, metrics=metrics, opt_fn=opt)

In [31]:
# x1,y1 = next(iter(trn_dl))
# x1v = V(x1)
# yh1 = model(x1v)
# y1.size(), yh1.size()
# F.binary_cross_entropy_with_logits(yh1, V(y1))

In [32]:
# learn.lr_find()

In [33]:
lr = 1e-3
cycle_len = 10

learn.fit(lr, 1, cycle_len=cycle_len, use_clr_beta=(5, 20, 0.95, 0.75))

learn.save_cycle('Audioset_1024_256_128_modified', cycle_len)

A Jupyter Widget



ZeroDivisionError: division by zero

In [None]:
# learn.load_cycle('Audioset_1024_256_128_modified', 10)

# learn.lr_find()

In [None]:
learn.sched.plot(n_skip_end=300)

In [None]:
learn.load_cycle('Audioset_1024_256_128_modified', 10)

lr = 1e-4

learn.fit(lr, 1, cycle_len=30)
learn.save_cycle('Audioset_1024_256_128_modified', 40)

In [None]:
learn.load_cycle('Audioset_1024_256_128_modified', 40)

lr = 1e-5

learn.fit(lr, 1, cycle_len=30)
learn.save_cycle('Audioset_1024_256_128_modified', 70)