# Training a Sound Classifier with PyTorch

## 0. Install libraries

In [1]:
!pip install torch==1.8.1 torchvision==0.9.1 torchaudio==0.8.1 torchsummary==1.5.1

Collecting torch==1.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/56/74/6fc9dee50f7c93d6b7d9644554bdc9692f3023fa5d1de779666e6bf8ae76/torch-1.8.1-cp37-cp37m-manylinux1_x86_64.whl (804.1MB)
[K     |████████████████████████████████| 804.1MB 21kB/s 
[?25hCollecting torchvision==0.9.1
[?25l  Downloading https://files.pythonhosted.org/packages/93/8a/82062a33b5eb7f696bf23f8ccf04bf6fc81d1a4972740fb21c2569ada0a6/torchvision-0.9.1-cp37-cp37m-manylinux1_x86_64.whl (17.4MB)
[K     |████████████████████████████████| 17.4MB 116kB/s 
[?25hCollecting torchaudio==0.8.1
[?25l  Downloading https://files.pythonhosted.org/packages/aa/55/01ad9244bcd595e39cea5ce30726a7fe02fd963d07daeb136bfe7e23f0a5/torchaudio-0.8.1-cp37-cp37m-manylinux1_x86_64.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 36.3MB/s 
[31mERROR: torchtext 0.10.0 has requirement torch==1.9.0, but you'll have torch 1.8.1 which is incompatible.[0m
Installing collected packages: torch, torchvision, torchaudi

## 1. CNN

In [2]:
from torch import nn

class CNNNetwork(nn.Module):
    """Convolutional classifier: 4 conv stages -> flatten -> linear -> softmax.

    Every conv stage is ``Conv2d(kernel_size=3, stride=1, padding=2)`` followed
    by ``ReLU`` and ``MaxPool2d(kernel_size=2)``; the channel count grows
    1 -> 16 -> 32 -> 64 -> 128. The linear layer is sized for 128 * 5 * 4
    flattened features (what a ``(1, 64, 44)`` input produces) and emits 10
    values, which ``forward`` turns into probabilities with a softmax over dim 1.
    """

    @staticmethod
    def _conv_stage(in_channels, out_channels):
        """Build one Conv2d -> ReLU -> MaxPool2d stage."""
        convolution = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=3,
            stride=1,
            padding=2,
        )
        return nn.Sequential(convolution, nn.ReLU(), nn.MaxPool2d(kernel_size=2))

    def __init__(self):
        super().__init__()

        # Stages are created in order so parameter names (conv1.0.weight, ...)
        # and initialisation order stay the same as a hand-written layout.
        self.conv1 = self._conv_stage(1, 16)
        self.conv2 = self._conv_stage(16, 32)
        self.conv3 = self._conv_stage(32, 64)
        self.conv4 = self._conv_stage(64, 128)

        self.flatten = nn.Flatten()
        self.linear = nn.Linear(in_features=128 * 5 * 4, out_features=10)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_data):
        """Return softmax-normalised class scores for a batch of inputs."""
        features = input_data
        for stage in (self.conv1, self.conv2, self.conv3, self.conv4):
            features = stage(features)

        flat_features = self.flatten(features)
        return self.softmax(self.linear(flat_features))

## 2. UrbanSoundDataset Class

In [3]:
import os
import pandas as pd
from torch.utils.data import Dataset
import torchaudio
import torch


class UrbanSoundDataset(Dataset):
    """Dataset that loads audio clips listed in an annotations CSV.

    Each item is loaded with ``torchaudio.load``, moved to ``device``, resampled
    to ``target_sample_rate``, mixed down to one channel, cut or right-padded
    with zeros to exactly ``num_samples`` samples, and finally passed through
    ``transformation``.

    :param annotations_file: Path of the CSV file read with ``pandas.read_csv``
    :param audio_dir: Directory that contains the ``fold<N>`` sub-directories
    :param transformation: Callable module applied to the prepared signal; it is
        moved to ``device`` on construction
    :param target_sample_rate: Sample rate every clip is resampled to
    :param num_samples: Number of samples every clip is cut/padded to
    :param device: Device the signal and the transforms are moved to
    """

    def __init__(self, annotations_file, audio_dir, transformation, target_sample_rate, num_samples, device):
        self.annotations = pd.read_csv(annotations_file)
        self.audio_dir = audio_dir
        self.device = device
        self.transformation = transformation.to(self.device)
        self.target_sample_rate = target_sample_rate
        self.num_samples = num_samples

        # One resampler per source sample rate, created lazily and kept on
        # ``device`` so it is not rebuilt for every single item.
        self._resamplers = {}

    def __len__(self):
        """Return the number of rows in the annotations table."""
        return len(self.annotations)

    def __getitem__(self, index):
        """Return ``(transformed_signal, label)`` for the row at ``index``."""
        # Get the audio sample at 'index'
        audio_sample_path = self._get_audio_sample_path(index)

        # Get the label associated with this audio sample path
        label = self._get_audio_sample_label(index)
        signal, sr = torchaudio.load(audio_sample_path)

        # Move the signal to the device
        signal = signal.to(self.device)

        # Make sure that the sample rate is the same for all clips
        signal = self._resample_if_necessary(signal, sr)

        # Use a single channel (mono) in case the audio has multiple channels
        signal = self._mix_down_if_necessary(signal)

        # In case the clip has more samples than we need (num_samples)
        signal = self._cut_if_necessary(signal)

        # In case the clip has fewer samples than we need (num_samples)
        signal = self._right_pad_if_necessary(signal)

        signal = self.transformation(signal)
        return signal, label

    def _cut_if_necessary(self, signal):
        """Keep only the first ``num_samples`` samples along dimension 1."""
        # signal -> Tensor -> (num_channels, num_samples) -> e.g. (1, 50000) -> (1, 22050)
        if signal.shape[1] > self.num_samples:
            signal = signal[:, :self.num_samples]
        return signal

    def _right_pad_if_necessary(self, signal):
        """Append zeros along the last dimension until it has ``num_samples`` entries."""
        length_signal = signal.shape[1]
        if length_signal < self.num_samples:
            num_missing_samples = self.num_samples - length_signal

            # Right-padding (append, NOT prepend): (0, n) pads nothing on the
            # left and n zeros on the right of the last dimension.
            # Example [1, 1, 1] -> [1, 1, 1, 0, 0]
            last_dim_padding = (0, num_missing_samples)
            signal = torch.nn.functional.pad(signal, last_dim_padding)
        return signal

    def _resample_if_necessary(self, signal, sr):
        """Resample ``signal`` from ``sr`` to ``target_sample_rate`` when they differ."""
        if sr == self.target_sample_rate:
            return signal

        resampler = self._resamplers.get(sr)
        if resampler is None:
            # The signal already lives on ``self.device``; the resampler has to
            # be moved there too, otherwise the two can end up on different devices.
            resampler = torchaudio.transforms.Resample(sr, self.target_sample_rate).to(self.device)
            self._resamplers[sr] = resampler
        return resampler(signal)

    def _mix_down_if_necessary(self, signal):
        """Average all channels into one when the signal has more than one channel."""
        # signal -> (num_channels, samples) -> e.g. (2, 16000) -> (1, 16000)
        if signal.shape[0] > 1:
            signal = torch.mean(signal, dim=0, keepdim=True)
        return signal

    def _get_audio_sample_path(self, index):
        """Build ``audio_dir/fold<N>/<file name>`` for the row at ``index``."""
        # The fold number is read from the 6th column of the CSV file and
        # formatted as "foldN".
        fold = f"fold{self.annotations.iloc[index, 5]}"

        # The file name is read from the 1st column of the CSV file.
        path = os.path.join(self.audio_dir, fold, self.annotations.iloc[index, 0])
        return path

    def _get_audio_sample_label(self, index):
        """Return the class label stored in the 7th column of the CSV file."""
        return self.annotations.iloc[index, 6]

## 3. Mount Google Drive

In [4]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the dataset paths used for training
# (ANNOTATIONS_FILE, AUDIO_DIR) live under this mount point.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


## 4. Training

In [5]:
import torch
import torchaudio
from torch import nn
from torch.utils.data import DataLoader
from torchsummary import summary


# --- Optimisation hyper-parameters ---
LEARNING_RATE = 0.001
EPOCHS = 10
BATCH_SIZE = 128

# --- Audio settings ---
SAMPLE_RATE = 22050
# Clip length in samples; equal to the sample rate, i.e. one second of audio.
NUM_SAMPLES = SAMPLE_RATE

# --- Dataset location (under the Google Drive mount) ---
AUDIO_DIR = "/content/gdrive/MyDrive/Datasets/UrbanSound8K/audio"
ANNOTATIONS_FILE = "/content/gdrive/MyDrive/Datasets/UrbanSound8K/metadata/UrbanSound8K.csv"


def train_one_epoch(model, data_loader, loss_func, optimiser, device):
    """
    Run one optimisation pass over every batch of ``data_loader``.

    After the pass, the loss of the last processed batch is printed. If the
    loader yields no batches, a notice is printed instead of failing.

    :param model: Module being trained; it is switched to training mode
    :param data_loader: Iterable yielding ``(inputs, targets)`` batches
    :param loss_func: Callable ``loss_func(predictions, targets)`` returning a scalar loss
    :param optimiser: Optimiser that updates the parameters of ``model``
    :param device: Device every batch is moved to before the forward pass
    :return: None
    """
    # Make sure layers with train/eval-dependent behaviour are in training mode.
    model.train()

    # Stays None when the loader is empty, so the final print cannot hit an
    # unbound variable.
    loss = None
    for batch_inputs, batch_targets in data_loader:
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

        # Calculate loss
        predictions = model(batch_inputs)
        loss = loss_func(predictions, batch_targets)

        # Clear gradients left over from the previous batch
        optimiser.zero_grad()

        # Backpropagate the loss and update the weights
        loss.backward()
        optimiser.step()

    if loss is None:
        print("Loss: n/a (data loader yielded no batches)")
        return

    print(f"Loss: {loss.item()}")


def train(model, data_loader, loss_func, optimiser, device, epochs):
    """
    Train ``model`` for ``epochs`` passes, printing a banner around each pass.

    Every pass is delegated to ``train_one_epoch`` with the same arguments.

    :param model: The model being trained
    :param data_loader: Iterable of training batches handed to ``train_one_epoch``
    :param loss_func: Loss callable handed to ``train_one_epoch``
    :param optimiser: Optimiser handed to ``train_one_epoch``
    :param device: Device handed to ``train_one_epoch``
    :param epochs: Number of passes to run
    :return: None
    """
    separator = "----------------"
    # Epochs are reported 1-based.
    for epoch_number in range(1, epochs + 1):
        print(f"Epoch {epoch_number}")
        train_one_epoch(model, data_loader, loss_func, optimiser, device)
        print(separator)

    print("Training finished")


if __name__ == "__main__":
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Transform applied by the dataset to every prepared clip.
    mel_spectrogram = torchaudio.transforms.MelSpectrogram(
        sample_rate=SAMPLE_RATE,
        n_fft=1024,
        hop_length=512,
        n_mels=64
    )

    # Instantiate our dataset object
    usd = UrbanSoundDataset(ANNOTATIONS_FILE, AUDIO_DIR, mel_spectrogram, SAMPLE_RATE, NUM_SAMPLES, device)

    # Create the data loader for the train set. Shuffling gives every epoch a
    # different batch composition instead of always following the CSV row order.
    train_data_loader = DataLoader(usd, batch_size=BATCH_SIZE, shuffle=True)

    # Build the model and print its layer summary for a (1, 64, 44) input
    cnn = CNNNetwork().to(device)
    summary(model=cnn, input_size=(1, 64, 44))

    # CNNNetwork.forward already returns softmax probabilities, while
    # nn.CrossEntropyLoss expects raw scores and applies log-softmax itself.
    # Feeding it probabilities would apply softmax twice, so the loss here is
    # the negative log-likelihood of the predicted probabilities instead.
    # NOTE: if the model is ever changed to return raw scores, switch back to
    # nn.CrossEntropyLoss.
    def loss_func(predictions, target):
        # Clamp away exact zeros so log() never produces -inf.
        log_probabilities = torch.log(predictions.clamp_min(1e-9))
        return nn.functional.nll_loss(log_probabilities, target)

    optimiser = torch.optim.Adam(cnn.parameters(), lr=LEARNING_RATE)

    train(model=cnn,
          data_loader=train_data_loader,
          loss_func=loss_func,
          optimiser=optimiser,
          device=device, epochs=EPOCHS)

    # Save the trained model (the file name is kept unchanged so existing
    # loading code keeps working)
    torch.save(cnn.state_dict(), "feedforwardnet.pth")
    print("Model trained and saved to feedforwardnet.pth")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 66, 46]             160
              ReLU-2           [-1, 16, 66, 46]               0
         MaxPool2d-3           [-1, 16, 33, 23]               0
            Conv2d-4           [-1, 32, 35, 25]           4,640
              ReLU-5           [-1, 32, 35, 25]               0
         MaxPool2d-6           [-1, 32, 17, 12]               0
            Conv2d-7           [-1, 64, 19, 14]          18,496
              ReLU-8           [-1, 64, 19, 14]               0
         MaxPool2d-9             [-1, 64, 9, 7]               0
           Conv2d-10           [-1, 128, 11, 9]          73,856
             ReLU-11           [-1, 128, 11, 9]               0
        MaxPool2d-12            [-1, 128, 5, 4]               0
          Flatten-13                 [-1, 2560]               0
           Linear-14                   