In [None]:
! pip install -U pip
! pip install -U torch==1.5.1
! pip install -U torchaudio==0.5.1
! pip install -U torchvision==0.6.1
! pip install -U matplotlib==3.2.1
! pip install -U clearml>=0.16.1
! pip install -U pandas==1.0.4
! pip install -U numpy==1.18.4
! pip install -U tensorboard==2.2.1

In [None]:
import PIL
import io

import pandas as pd
import numpy as np
from pathlib2 import Path
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.tensorboard import SummaryWriter

import torchaudio
from torchvision.transforms import ToTensor
from torchvision import models

from clearml import Task
from clearml.storage import StorageManager

%matplotlib inline

In [None]:
# Register this run with ClearML.
task = Task.init(
    project_name='Audio Example',
    task_name='audio classification UrbanSound8K',
)

# Default hyper-parameters. They are passed through task.connect() so that clearml can
# override them; the object it returns is what the rest of the notebook reads.
configuration_dict = task.connect(dict(
    number_of_epochs=3,
    batch_size=8,
    dropout=0.3,
    base_lr=0.005,
    number_of_mel_filters=64,
    resample_freq=22050,
))

# Show the configuration actually in use (after any override in remote mode).
print(configuration_dict)

In [None]:
# Fetch the UrbanSound8K dataset (https://urbansounddataset.weebly.com/urbansound8k.html).
# To keep things simple, a subset of the dataset is retrieved through the clearml StorageManager,
# which is asked to extract the downloaded archive.
path_to_UrbanSound8K = StorageManager.get_local_copy(
    "https://allegro-datasets.s3.amazonaws.com/clearml/UrbanSound8K.zip",
    extract_archive=True,
)

# Locations of the metadata CSV and of the audio folders inside the local copy.
path_to_UrbanSound8K_csv = Path(path_to_UrbanSound8K).joinpath('UrbanSound8K', 'metadata', 'UrbanSound8K.csv')
path_to_UrbanSound8K_audio = Path(path_to_UrbanSound8K).joinpath('UrbanSound8K', 'audio')

In [None]:
class UrbanSoundDataset(Dataset):
    """Audio clips listed in a metadata CSV, served as fixed-length log-scale Mel spectrograms.

    Each item is a tuple ``(audio, sample_rate, mel_spectrogram_db, label)``:

    * ``audio`` - a NumPy array holding the mono waveform padded/cropped to 3 seconds when
      ``return_audio`` is true, otherwise an empty array.
    * ``sample_rate`` - the rate the waveform and spectrogram were computed at:
      ``resample_freq`` when it is positive, otherwise the rate reported for the file.
    * ``mel_spectrogram_db`` - the dB-scaled Mel spectrogram, padded/cropped along its last axis.
    * ``label`` - the value read from column 6 of the CSV row.

    Args:
        csv_path: path of the metadata CSV file.
        file_path: path object of the directory that contains the ``fold<N>`` audio folders.
        folderList: iterable of fold numbers to keep; rows from other folds are ignored.
        resample_freq: target sample rate; ``0`` (the default) keeps each file's own rate.
            NOTE: without resampling, item sizes depend on the file's rate, so files recorded
            at different rates yield tensors of different sizes.
        return_audio: whether items also carry the waveform.

    The number of Mel filters is read from the global ``configuration_dict``
    (key ``number_of_mel_filters``, default 64).
    """

    # Positions of the CSV columns that are read.
    _FILE_NAME_COLUMN = 0
    _FOLD_COLUMN = 5
    _LABEL_COLUMN = 6
    # Every clip is padded / cropped to this many seconds.
    _CLIP_SECONDS = 3
    # Samples per spectrogram frame used to size the time axis
    # (presumably MelSpectrogram's default hop length - revisit if the transform arguments change).
    _SAMPLES_PER_FRAME = 200

    def __init__(self, csv_path, file_path, folderList, resample_freq=0, return_audio=False):
        self.file_path = file_path
        self.n_mels = configuration_dict.get('number_of_mel_filters', 64)
        self.return_audio = return_audio
        self.resample = resample_freq

        # Keep only the CSV rows whose fold is in the requested list (filtered in one pass
        # instead of reading the frame cell by cell).
        csvData = pd.read_csv(csv_path)
        in_selected_folds = csvData.iloc[:, self._FOLD_COLUMN].isin(list(folderList))
        selected = csvData[in_selected_folds]
        self.file_names = selected.iloc[:, self._FILE_NAME_COLUMN].tolist()
        self.labels = selected.iloc[:, self._LABEL_COLUMN].tolist()
        self.folders = selected.iloc[:, self._FOLD_COLUMN].tolist()

    def _output_rate(self, sample_rate):
        """Return the rate items are produced at: the resample target, or the file's rate if resampling is off."""
        return self.resample if self.resample > 0 else sample_rate

    @staticmethod
    def _fit_length(tensor, length):
        """Zero-pad or crop ``tensor`` along its last dimension to exactly ``length`` entries."""
        current = tensor.shape[-1]
        if current < length:
            return torch.nn.functional.pad(tensor, (0, length - current))
        return tensor[..., :length]

    def __getitem__(self, index):
        # Format the file path and load the file
        path = self.file_path / ("fold" + str(self.folders[index])) / self.file_names[index]
        soundData, sample_rate = torchaudio.load(path, out = None, normalization = True)

        if self.resample > 0:
            resample_transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=self.resample)
            soundData = resample_transform(soundData)
        # Using self.resample directly would give a rate of 0 (and zero-length outputs)
        # whenever resampling is disabled, so fall back to the file's own rate.
        output_rate = self._output_rate(sample_rate)

        # This will convert audio files with two channels into one
        soundData = torch.mean(soundData, dim=0, keepdim=True)

        # Convert audio to log-scale Mel spectrogram
        melspectrogram_transform = torchaudio.transforms.MelSpectrogram(sample_rate=output_rate, n_mels=self.n_mels)
        melspectrogram = melspectrogram_transform(soundData)
        melspectogram_db = torchaudio.transforms.AmplitudeToDB()(melspectrogram)

        # Make sure all spectrograms are the same size
        n_frames = self._CLIP_SECONDS * (output_rate // self._SAMPLES_PER_FRAME)
        melspectogram_db = self._fit_length(melspectogram_db, n_frames)

        if self.return_audio:
            # Same treatment for the waveform: exactly _CLIP_SECONDS worth of samples.
            soundData = self._fit_length(soundData, self._CLIP_SECONDS * output_rate).numpy()
        else:
            soundData = np.array([])

        return soundData, output_rate, melspectogram_db, self.labels[index]

    def __len__(self):
        """Number of clips selected from the CSV."""
        return len(self.file_names)

# Names of the classes; train() and test() index this tuple with the integer labels.
classes = (
    'air_conditioner', 'car_horn', 'children_playing', 'dog_bark', 'drilling',
    'engine_idling', 'gun_shot', 'jackhammer', 'siren', 'street_music',
)

# Folds 1-9 are used for training; fold 10 is held out for testing.
# Only the test set carries the waveform (return_audio=True).
train_set = UrbanSoundDataset(
    csv_path=path_to_UrbanSound8K_csv,
    file_path=path_to_UrbanSound8K_audio,
    folderList=range(1,10),
    resample_freq=configuration_dict.get('resample_freq', 0),
    return_audio=False,
)
test_set = UrbanSoundDataset(
    csv_path=path_to_UrbanSound8K_csv,
    file_path=path_to_UrbanSound8K_audio,
    folderList=[10],
    resample_freq=configuration_dict.get('resample_freq', 0),
    return_audio=True,
)
print("Train set size: " + str(len(train_set)))
print("Test set size: " + str(len(test_set)))

# Training batches are shuffled; test batches keep the dataset order.
train_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_size=configuration_dict.get('batch_size', 4),
    shuffle=True,
    num_workers=1,
    pin_memory=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_set,
    batch_size=configuration_dict.get('batch_size', 4),
    shuffle=False,
    num_workers=1,
    pin_memory=False,
)

In [None]:
# Start from a pretrained ResNet-18.
model = models.resnet18(pretrained=True)

# Replace the first convolution with a new, freshly initialised layer that takes a single
# input channel, reusing the original layer's output channels, kernel size, stride and padding.
model.conv1 = nn.Conv2d(
    in_channels=1,
    out_channels=model.conv1.out_channels,
    kernel_size=model.conv1.kernel_size[0],
    stride=model.conv1.stride[0],
    padding=model.conv1.padding[0],
)

# Replace the classifier head: dropout followed by a linear layer with one output per class.
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Dropout(p=configuration_dict.get('dropout', 0.25)),
    nn.Linear(num_ftrs, len(classes)),
)

In [None]:
# SGD with momentum; the learning rate comes from the (overridable) configuration.
optimizer = optim.SGD(model.parameters(), lr=configuration_dict.get('base_lr', 0.001), momentum=0.9)

# Multiply the learning rate by 0.1 every third of the run.
# The step size is clamped to at least 1: with fewer than 3 epochs the integer division
# yields 0, which StepLR cannot use. The default of 10 epochs matches the training loop.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=max(1, configuration_dict.get('number_of_epochs', 10) // 3),
    gamma=0.1,
)
criterion = nn.CrossEntropyLoss()

In [None]:
# Always keep a torch.device in `device` (previously a bare integer index on GPU and a
# torch.device on CPU), so the printed value and every later .to(device) call see one type.
if torch.cuda.is_available():
    device = torch.device('cuda', torch.cuda.current_device())
else:
    device = torch.device('cpu')
print('Device to use: {}'.format(device))
model.to(device)

In [None]:
# TensorBoard writer used by train() and test(); events are written under ./tensorboard_logs.
tensorboard_writer = SummaryWriter(log_dir='./tensorboard_logs')

In [None]:
def plot_signal(signal, title, cmap=None):
    """Plot ``signal`` with matplotlib and return the rendered figure as an image tensor.

    Args:
        signal: array to draw. A 1-D array is drawn as a line plot; anything else is
            handed to ``imshow``.
        title: title placed on the plot.
        cmap: colour map forwarded to ``imshow`` (not used for line plots).

    Returns:
        The result of applying torchvision's ``ToTensor`` to the JPEG rendering of the figure.
    """
    figure = plt.figure()
    is_one_dimensional = signal.ndim == 1
    if not is_one_dimensional:
        plt.imshow(signal, cmap=cmap)
    else:
        plt.plot(signal)
    plt.title(title)

    # Render into an in-memory JPEG, then release the figure.
    jpeg_bytes = io.BytesIO()
    plt.savefig(jpeg_bytes, format='jpeg')
    plt.close(figure)

    # Rewind so PIL reads the image from the start of the buffer.
    jpeg_bytes.seek(0)
    rendered = PIL.Image.open(jpeg_bytes)
    return ToTensor()(rendered)

In [None]:
def train(model, epoch):
    """Train ``model`` for one pass over ``train_loader`` and report progress.

    Relies on notebook globals: ``train_loader``, ``device``, ``optimizer``, ``criterion``,
    ``tensorboard_writer``, ``classes``, ``log_interval`` and ``debug_interval``.

    Args:
        model: the network to optimise.
        epoch: zero-based epoch number, used to compute the global iteration for reporting.
    """
    model.train()
    batches_per_epoch = len(train_loader)

    for step, (_, _, spectrograms, targets) in enumerate(train_loader):
        spectrograms = spectrograms.to(device)
        targets = targets.to(device)

        # Reset gradients, then forward / backward / parameter update.
        optimizer.zero_grad()
        logits = model(spectrograms)
        predictions = torch.max(logits, 1)[1]
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        # Global step across epochs, used as the x-axis for everything reported below.
        iteration = epoch * batches_per_epoch + step

        # Training stats every "log_interval" mini-batches.
        if step % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'
                  .format(epoch, step * len(spectrograms), len(train_loader.dataset),
                          100. * step / batches_per_epoch, loss))
            tensorboard_writer.add_scalar('training loss/loss', loss, iteration)
            tensorboard_writer.add_scalar('learning rate/lr', optimizer.param_groups[0]['lr'], iteration)

        # Debug images only every "debug_interval" mini-batches.
        if step % debug_interval != 0:
            continue
        for n, (spectrogram, prediction, target) in enumerate(zip(spectrograms, predictions, targets)):
            series = 'label_{}_pred_{}'.format(classes[target.cpu()], classes[prediction.cpu()])
            image = plot_signal(spectrogram.cpu().numpy().squeeze(), series, 'hot')
            tensorboard_writer.add_image('Train MelSpectrogram samples/{}_{}_{}'.format(step, n, series),
                                         image, iteration)

In [None]:
def test(model, epoch):
    """Evaluate ``model`` on ``test_loader`` and report the overall accuracy.

    Relies on notebook globals: ``test_loader``, ``train_loader``, ``device``,
    ``tensorboard_writer``, ``classes`` and ``debug_interval``.

    Args:
        model: the network to evaluate.
        epoch: zero-based epoch number; results are reported at the iteration count
            reached at the end of that training epoch.
    """
    model.eval()
    class_correct = [0.] * len(classes)
    class_total = [0.] * len(classes)

    # Computed before the loop so it is defined for the final report even if
    # test_loader yields no batches (it does not depend on the batch).
    iteration = (epoch + 1) * len(train_loader)

    with torch.no_grad():
        for idx, (sounds, sample_rate, inputs, labels) in enumerate(test_loader):
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = model(inputs)

            _, predicted = torch.max(outputs, 1)
            c = (predicted == labels)
            for i in range(len(inputs)):
                label = labels[i].item()
                class_correct[label] += c[i].item()
                class_total[label] += 1

            if idx % debug_interval == 0:    # report debug samples every "debug_interval" mini-batches
                for n, (sound, inp, pred, label) in enumerate(zip(sounds, inputs, predicted, labels)):
                    series = 'label_{}_pred_{}'.format(classes[label.cpu()], classes[pred.cpu()])
                    tensorboard_writer.add_audio('Test audio samples/{}_{}_{}'.format(idx, n, series),
                                                 sound, iteration, int(sample_rate[n]))
                    tensorboard_writer.add_image('Test MelSpectrogram samples/{}_{}_{}'.format(idx, n, series),
                                                 plot_signal(inp.cpu().numpy().squeeze(), series, 'hot'), iteration)

    # Guard against an empty test set instead of dividing by zero.
    evaluated = sum(class_total)
    total_accuracy = 100 * sum(class_correct) / evaluated if evaluated else 0.0
    print('[Iteration {}] Accuracy on the {} test images: {}%\n'.format(epoch, evaluated, total_accuracy))
    tensorboard_writer.add_scalar('accuracy/total', total_accuracy, iteration)

In [None]:
# Reporting cadence, read as globals by train() and test():
# debug samples every `debug_interval` mini-batches, training stats every `log_interval`.
debug_interval = 25
log_interval = 10

for epoch in range(configuration_dict.get('number_of_epochs', 10)):
    train(model, epoch)
    test(model, epoch)
    # Advance the learning-rate schedule once per epoch, after training and evaluation.
    scheduler.step()