In [26]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import Dataset
import torchaudio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from os.path import join as oj
# Compute device: CUDA when torch reports it as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Root of the extracted UrbanSound8K dataset; later cells read 'metadata/...' and 'audio/...' under it.
data_dir = '/scratch/users/vision/data/cosmo/UrbanSound8K'
# Output directory: model checkpoints and attribution pickles are written here.
out_dir = '/scratch/users/vision/data/cosmo/audio_models'
from audio_helper import *
from model import Net
from copy import deepcopy
import pickle as pkl
from captum.attr import (
    GradientShap,
    DeepLift,
    DeepLiftShap,
    IntegratedGradients,
    LayerConductance,
    NeuronConductance,
    NoiseTunnel,
)
from util import to_freq
from tqdm import tqdm

import sys
sys.path.append('..')
import transform_wrappers
from functools import partial

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# import the data

We will use the UrbanSound8K dataset to train our network. It is
available for free [here](https://urbansounddataset.weebly.com/) and contains
10 audio classes with over 8000 audio samples! Once you have downloaded
the compressed dataset, extract it and set `data_dir` (defined in the first code cell) to the extracted folder.

In [2]:
# Read the UrbanSound8K metadata table and print its first record as a sanity check.
csvData = pd.read_csv(oj(data_dir, 'metadata/UrbanSound8K.csv'))
print(csvData.iloc[0])

slice_file_name    100032-3-0-0.wav
fsID                         100032
start                             0
end                        0.317551
salience                          1
fold                              5
classID                           3
class                      dog_bark
Name: 0, dtype: object


The 10 audio classes in the UrbanSound8K dataset are air_conditioner,
car_horn, children_playing, dog_bark, drilling, engine_idling,
gun_shot, jackhammer, siren, and street_music. Let’s play a couple of files
and see what they sound like. The first file is street music and the
second is an air conditioner.




In [None]:
import IPython.display as ipd
# Build an audio widget for one clip from fold 1; as the cell's last expression it is displayed inline.
ipd.Audio(oj(data_dir, 'audio/fold1/108041-9-0-5.wav'))
# Alternative clip from fold 5; swap it in as the last expression to play it instead.
# ipd.Audio(oj(data_dir, 'audio/fold5/100852-0-0-19.wav'))

# create dataloader

The UrbanSound8K dataset is separated
into 10 folders. We will use the data from 9 of these folders to train
our network and then use the 10th folder to test the network.

We use ``torchaudio.load()`` to convert the wav
files to tensors. ``torchaudio.load()`` returns a tuple containing the
newly created tensor along with the sampling frequency of the audio file
(44.1kHz for UrbanSound8K). The dataset uses two channels for audio so
we will use ``torchaudio.transforms.DownmixMono()`` to convert the audio
data to one channel. 

Next, we need to format the audio data. The network
we will make takes an input size of 32,000, while most of the audio
files have well over 100,000 samples. The UrbanSound8K audio is sampled
at 44.1kHz, so 32,000 samples only covers around 700 milliseconds. By
downsampling the audio to approximately 8kHz, we can represent 4 seconds
with the 32,000 samples. This downsampling is achieved by taking every
fifth sample of the original audio tensor. Not every audio tensor is
long enough to handle the downsampling so these tensors will need to be
padded with zeros. The minimum length that won’t require padding is
160,000 samples.

In [4]:
# Locations of the metadata CSV and of the per-fold audio folders.
csv_path = oj(data_dir, 'metadata/UrbanSound8K.csv')
file_path = oj(data_dir, 'audio/')

# Folds 1-9 are used for training, fold 10 is held out for testing.
train_set = UrbanSoundDataset(csv_path, file_path, range(1, 10))
test_set = UrbanSoundDataset(csv_path, file_path, [10])
print("Train set size: " + str(len(train_set)))
print("Test set size: " + str(len(test_set)))

# Extra DataLoader options that only make sense when running on the GPU.
# `device` is a torch.device, and comparing it to the string 'cuda' is always
# False, so the check has to look at `device.type` for these options to apply.
kwargs = {'num_workers': 1, 'pin_memory': True} if device.type == 'cuda' else {}

train_loader = torch.utils.data.DataLoader(train_set, batch_size = 128, shuffle = True, **kwargs)
test_loader = torch.utils.data.DataLoader(test_set, batch_size = 128, shuffle = True, **kwargs)

Train set size: 7895
Test set size: 837


# training / testing

For this tutorial we will use a convolutional neural network to process
the raw audio data. Usually more advanced transforms are applied to the
audio data, however CNNs can be used to accurately process the raw data.
The specific architecture is modeled after the M5 network architecture
described in https://arxiv.org/pdf/1610.00087.pdf. An important aspect
of models processing raw audio data is the receptive field of their
first layer’s filters. Our model’s first filter is length 80 so when
processing audio sampled at 8kHz the receptive field is around 10ms.
This size is similar to speech processing applications that often use
receptive fields ranging from 20ms to 40ms.

If trained on 9 folders, the network should be more than 50% accurate by
the end of the training process. Training on fewer folders will result in
a lower overall accuracy but may be necessary if long runtimes are a
problem. Greater accuracies can be achieved using deeper CNNs at the
expense of a larger memory footprint.

For more advanced audio applications, such as speech recognition,
recurrent neural networks (RNNs) are commonly used. There are also other
data preprocessing methods, such as finding the mel frequency cepstral
coefficients (MFCC), that can reduce the size of the dataset.

In [None]:
# Instantiate the classifier on the selected device and show its layers.
model = Net().to(device)
print(model)

# Adam with a small weight decay; StepLR multiplies the learning rate by 0.1
# every 20 scheduler steps.
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

In [None]:
log_interval = 20
# Train for 20 epochs, evaluating on the held-out fold and checkpointing after
# every epoch. The checkpoint file name carries the epoch and test accuracy.
for epoch in range(1, 21):
    # maybe set lr to 0.001 at epoch 31
    train(model, epoch, optimizer, train_loader, device)
    acc = test(model, test_loader, device)
    # Context manager so the checkpoint file is flushed and closed every epoch.
    with open(oj(out_dir, f'audio_model_{epoch}_{acc:.1f}.pkl'), 'wb') as model_file:
        pkl.dump(deepcopy(model), model_file)
    # Advance the LR schedule only after this epoch's optimizer updates;
    # stepping the scheduler first shifts the schedule by one epoch.
    scheduler.step()

# load the best model

In [60]:
# Restore the checkpoint written at epoch 7 (test accuracy 57.2 per its file name).
# NOTE: unpickling executes arbitrary code from the file; only load checkpoints
# produced by this notebook or another trusted source.
with open(oj(out_dir, 'audio_model_7_57.2.pkl'), 'rb') as model_file:
    model = pkl.load(model_file)
test(model, test_loader, device) # should print the test acc


Test set: Accuracy: 479/837 (57%)



57.22819593787336

**look at individual example**

In [134]:
# Take the first training example (input and, presumably, its class label) and
# move the input to the compute device.
x, y = train_set[0]
x = x.to(device)
# Optional sanity plot of the first 2000 samples of the waveform:
# x_np = x.cpu().flatten().numpy()
# plt.plot(x_np[:2000])
# plt.show()

In [136]:
# Real-to-complex FFT (signal_ndim=1) of a copy of x, leaving x itself untouched.
# NOTE(review): torch.rfft is the legacy FFT API and is absent from recent
# PyTorch releases — confirm the pinned torch version before re-running.
xt = torch.rfft(deepcopy(x), signal_ndim=1)

In [154]:
# Invert the FFT of xt back to a real signal (round-trip check); x2 is not
# referenced by any other cell visible in this section.
x2 = torch.irfft(xt, signal_ndim=1)

In [138]:
def transform(x):
    """Map a frequency-domain tensor back to the time domain.

    Applies the 1-D inverse real FFT and then drops the last element along
    dimension 1.
    """
    return torch.irfft(x, signal_ndim=1)[:, :-1]  #transform_wrappers.lay_from_w(D)


# Wrap the trained model so it accepts frequency-domain input: `transform` is
# passed as the wrapper's transform and a (1, 32000) ReshapeLayer as its reshape.
reshape_layer = transform_wrappers.ReshapeLayer((1, 32000))
net = transform_wrappers.Net_with_transform(model, transform=transform, reshape=reshape_layer).to(device)
# print(list(net.modules()))

In [139]:
# Sanity check: shape of the time-domain signal recovered from xt.
transform(xt).shape

torch.Size([1, 32000])

In [140]:
# Run the wrapped network directly on the frequency-domain input xt.
net(xt)

tensor([[-6.8851, -1.8586, -3.2695, -1.4533, -2.9915, -5.7169, -1.2332, -4.4807,
         -1.5578, -5.4086]], device='cuda:0', grad_fn=<LogSoftmaxBackward>)

# interpretations

In [161]:
# Number of held-out examples available for the attribution pass.
len(test_set)

837

In [None]:
# Integrated-Gradients attributions for every test example, grouped by class.
# `results` maps class id (0-9) -> list of attributions returned by `to_freq`.
results = {
    key: [] for key in range(10)
}
results_path = oj(out_dir, 'audio_ig_correct.pkl')
n_examples = len(test_set)

# this only works with cpu
device_captum = 'cpu'
# Loop-invariant setup: move the wrapped network once and reuse one explainer
# instead of rebuilding both for every example.
net = net.to(device_captum)
ig = IntegratedGradients(net)

for i in tqdm(range(n_examples)):
    x, y = test_set[i]
    # Frequency-domain version of the example, on the attribution device.
    xt = torch.rfft(deepcopy(x), signal_ndim=1).to(device_captum)
    # x = xt.unsqueeze(0).to(device_captum)
    xt.requires_grad = True
    # All-zero baseline with the same shape as the frequency-domain input.
    baseline = torch.zeros(xt.shape).to(device_captum)
    attributions_ig, delta_ig = ig.attribute(deepcopy(xt), deepcopy(baseline),
                                             target=int(y), return_convergence_delta=True)
    attributions_ig = to_freq(attributions_ig)
    # Key by a plain int, consistent with `target=int(y)` above, so the lookup
    # does not depend on the label's concrete type.
    results[int(y)].append(deepcopy(attributions_ig))

    # Checkpoint every 20 examples AND after the final one; saving only on
    # multiples of 20 would silently drop the attributions of the last examples.
    if i % 20 == 0 or i == n_examples - 1:
        with open(results_path, 'wb') as results_file:
            pkl.dump(results, results_file)





 77%|███████▋  | 645/837 [44:54<13:26,  4.20s/it][A[A[A[A



 77%|███████▋  | 646/837 [44:58<13:17,  4.17s/it][A[A[A[A



 77%|███████▋  | 647/837 [45:02<13:10,  4.16s/it][A[A[A[A



 77%|███████▋  | 648/837 [45:06<13:05,  4.16s/it][A[A[A[A



 78%|███████▊  | 649/837 [45:10<13:00,  4.15s/it][A[A[A[A



 78%|███████▊  | 650/837 [45:14<12:54,  4.14s/it][A[A[A[A



 78%|███████▊  | 651/837 [45:19<12:51,  4.15s/it][A[A[A[A



 78%|███████▊  | 652/837 [45:23<12:47,  4.15s/it][A[A[A[A



 78%|███████▊  | 653/837 [45:27<12:44,  4.16s/it][A[A[A[A



 78%|███████▊  | 654/837 [45:31<12:41,  4.16s/it][A[A[A[A



 78%|███████▊  | 655/837 [45:35<12:37,  4.16s/it][A[A[A[A



 78%|███████▊  | 656/837 [45:39<12:35,  4.17s/it][A[A[A[A



 78%|███████▊  | 657/837 [45:44<12:28,  4.16s/it][A[A[A[A



 79%|███████▊  | 658/837 [45:48<12:22,  4.15s/it][A[A[A[A



 79%|███████▊  | 659/837 [45:52<12:17,  4.14s/it][A[A[A[A



 79%|███████▉  | 660/