# Experiment: Speech Recognition

Since maxout networks seem to be popular for speech recognition tasks, we decided to run some experiments testing our initialization strategy on the `Speech Commands` dataset. In this notebook we explore how to use it in scope of `Pytorch` package. We follow an official `Pytorch` `Speech Commands` [tutorial](https://pytorch.org/tutorials/intermediate/speech_command_classification_with_torchaudio_tutorial.html).

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#import torchaudio
import sys

import matplotlib.pyplot as plt
import IPython.display as ipd

from tqdm import tqdm
#import numpy as np

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


## Importing and splitting the dataset

In [3]:
import os
from torchaudio.datasets import SPEECHCOMMANDS
#import torchaudio.transforms as transforms

> *Note: we encountered issues with incompatible versions of pytorch and torchaudio packages in conda environment. Running the following commands ensured the installed versions were compatible:*

    conda install pytorch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0 cpuonly -c pytorch

In [4]:
class SubsetSC(SPEECHCOMMANDS):
    def __init__(self, subset: str = None):
        super().__init__("./", download=True)

        def load_list(filename):
            filepath = os.path.join(self._path, filename)
            with open(filepath) as fileobj:
                return [os.path.normpath(os.path.join(self._path, line.strip())) for line in fileobj]

        if subset == "validation":
            self._walker = load_list("validation_list.txt")
        elif subset == "testing":
            self._walker = load_list("testing_list.txt")
        elif subset == "training":
            excludes = load_list("validation_list.txt") + load_list("testing_list.txt")
            excludes = set(excludes)
            self._walker = [w for w in self._walker if w not in excludes]

In [5]:
# Create training and testing split of the data. We do not use validation in this tutorial.
train_set = SubsetSC("training")
test_set = SubsetSC("testing")

waveform, sample_rate, label, speaker_id, utterance_number = train_set[0]

In [6]:
N = len(train_set[0][0][0])

In [7]:
Labels = sorted(list(set(datapoint[2] for datapoint in train_set)))

def label_to_index(word):
    # Return the position of the word in labels
    return torch.tensor(Labels.index(word))


def index_to_label(index):
    # Return the word corresponding to the index in labels
    # This is the inverse of label_to_index
    return Labels[index]


word_start = "yes"
index = label_to_index(word_start)
word_recovered = index_to_label(index)

print(word_start, "-->", index, "-->", word_recovered)

yes --> tensor(33) --> yes


In [8]:
def pad_sequence(batch):
    # Make all tensor in a batch the same length by padding with zeros
    batch = [item.t() for item in batch]
    batch = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=0.)
    return batch.permute(0, 2, 1)


def collate_fn(batch):

    # A data tuple has the form:
    # waveform, sample_rate, label, speaker_id, utterance_number

    tensors, targets = [], []

    # Gather in lists, and encode labels as indices
    for waveform, _, label, *_ in batch:
        tensors += [waveform]
        targets += [label_to_index(label)]

    # Group the list of tensors into a batched tensor
    tensors = pad_sequence(tensors)
    targets = torch.stack(targets)

    return tensors, targets


batch_size = 256

if device == "cuda":
    num_workers = 1
    pin_memory = True
else:
    num_workers = 0
    pin_memory = False

train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
test_loader = torch.utils.data.DataLoader(
    test_set,
    batch_size=batch_size,
    shuffle=False,
    drop_last=False,
    collate_fn=collate_fn,
    num_workers=num_workers,
    pin_memory=pin_memory,
)

## Defining the network

Here we will deviate from the official `Speech Commands` tutorial. We will define two network types: ReLU DNN and Maxout DNN and try to compare the two.

In [9]:
# Maxout hyperparameters:

num_epochs = 5
batch_size = 64
learning_rate = 1

In [10]:
class MaxoutNet(nn.Module):
    def __init__(self):
        super(MaxoutNet, self).__init__()
        self.lay1lin1 = nn.Linear(N, 128)
        self.lay1lin2 = nn.Linear(N, 128)
        self.lay1lin3 = nn.Linear(N, 128)
        self.lay2lin1 = nn.Linear(128, 35)
        self.lay2lin2 = nn.Linear(128, 35)
        self.lay2lin3 = nn.Linear(128, 35)
        self.maxout_rank = 3


    def forward(self, x):
        x = x.view(x.size(0), -1) # I do not understand this black magic, x is not a single image, but a tensor of images. How does this code work?
        x = x.unsqueeze(0) # make vector of length 784 into 1*784 matrix
        X = torch.cat( (self.lay1lin1(x),self.lay1lin2(x),self.lay1lin3(x)), 0)
              # concatenate output vectors into matrix (row-wise by default)
              # size: rank * width layer 1
        x,dummy = torch.max(X,0)
              # go through each column and compute max
              # size: 1 * width layer 1
        x = x.unsqueeze(0)
        X = torch.cat( (self.lay2lin1(x),self.lay2lin2(x),self.lay2lin3(x)), 0)
              # concatenate output vectors into matrix (row-wise by default)
              # size: rank * width layer 2
        x,dummy = torch.max(X,0)
              # go through each column and compute max
              # size: 1 * width layer 2
        # x = mySoftmax(x) # wth does this make loss worse?
        return x

model_maxout = MaxoutNet().to(device)


In [11]:
activation_relu = nn.ReLU()
output_normalisation = nn.LogSoftmax(dim=-1)

class ReLUNet(nn.Module):
    def __init__(self):
        super(ReLUNet, self).__init__()
        self.layer1 = nn.Linear(N, 1024)
        #self.layer2 = nn.Linear(10, 10)
        self.layer3 = nn.Linear(1024, 35)


    def forward(self, x):
        x = x.view(x.size(0), -1) # I do not understand this black magic, x is not a single image, but a tensor of images. How does this code work?
        x = activation_relu(self.layer1(x))
        x# = activation(self.layer2(x))
        x = output_normalisation(self.layer3(x))
        return x

model_relu = ReLUNet().to(device)

In [12]:
#criterion_maxout = nn.CrossEntropyLoss()
optimizer_maxout = optim.Adam(model_maxout.parameters(), lr=learning_rate)#, weight_decay=0.0001)

#optimizer_maxout = torch.optim.SGD(model_maxout.parameters(), lr=learning_rate)#optimizer_maxout = #optim.Adam(model_maxout.parameters(), lr=0.01, weight_decay=0.0001)
#scheduler_maxout = #optim.lr_scheduler.StepLR(optimizer_maxout, step_size=20, gamma=0.1)  # reduce the learning after 20 epochs by a factor of 10
optimizer_relu = optim.Adam(model_relu.parameters(), lr=0.01, weight_decay=0.0001)
scheduler_relu = optim.lr_scheduler.StepLR(optimizer_relu, step_size=20, gamma=0.1)  # reduce the learning after 20 epochs by a factor of 10

In [13]:
def train_maxout(model, epoch, log_interval):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):

        data = data.to(device)
        target = target.to(device)

        # apply transform and model on whole batch directly on device
        #data = transform(data)
        output = model(data)

        # negative log-likelihood for a tensor of size (batch x 1 x n_output)
        #criterion = nn.CrossEntropyLoss()#F.nll_loss(output.squeeze(), target)
        #loss = criterion_maxout(output.squeeze(), target)
        loss = F.nll_loss(output.squeeze(), target)

        optimizer_maxout.zero_grad()
        loss.backward()
        optimizer_maxout.step()

        # print training stats
        if batch_idx % log_interval == 0:
            print(f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}")

        # update progress bar
        pbar.update(pbar_update)
        # record loss
        losses_maxout.append(loss.item())

In [14]:
losses_maxout = []

In [15]:
def number_of_correct(pred, target):
    # count number of correct predictions
    return pred.squeeze().eq(target).sum().item()


def get_likely_index(tensor):
    # find most likely label index for each element in the batch
    return tensor.argmax(dim=-1)


def test(model, epoch):
    model.eval()
    correct = 0
    for data, target in test_loader:

        data = data.to(device)
        target = target.to(device)

        # apply transform and model on whole batch directly on device
        #data = transform(data)
        output = model(data)

        pred = get_likely_index(output)
        correct += number_of_correct(pred, target)

        # update progress bar
        pbar.update(pbar_update)

    print(f"\nTest Epoch: {epoch}\tAccuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n")

In [16]:
# log_interval = 20
# n_epoch = 2

# pbar_update = 1 / (len(train_loader) + len(test_loader))
# #losses = []

# # The transform needs to live on the same device as the model and the data.
# #transform = transform.to(device)
# with tqdm(total=n_epoch) as pbar:
#     for epoch in range(1, n_epoch + 1):
#         train_maxout(model_maxout, epoch, log_interval)
#         test(model_maxout, epoch)
#         scheduler_maxout.step()

In [18]:
n_total_steps = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        #print(i)
        images, labels = images.to(device), labels.to(device)

        # Forward pass
        outputs = model_maxout(images)
        loss = F.nll_loss(outputs.squeeze(), labels)

        # Backward and optimize
        optimizer_maxout.zero_grad()
        loss.backward()
        optimizer_maxout.step()

        if (i+1) % 50 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

print('Finished Training')
# PATH = './maxoutMNIST.pth'
# torch.save(model.state_dict(), PATH)

with torch.no_grad():
    n_correct = 0
    n_samples = 0
    n_class_correct = [0 for i in range(35)]
    n_class_samples = [0 for i in range(35)]
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model_maxout(images)
        # max returns (value ,index)
        _, predicted = torch.max(outputs, 1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

        for i in range(labels.size(0)):
            label = labels[i]
            pred = predicted[i]
            if (label == pred):
                n_class_correct[label] += 1
            n_class_samples[label] += 1

    acc = 100.0 * n_correct / n_samples
    print(f'Accuracy of the network: {acc} %')

    for i in range(35):
        acc = 100.0 * n_class_correct[i] / n_class_samples[i]
        print(f'Accuracy of {Labels[i]}: {acc} %')


Epoch [1/5], Step [50/332], Loss: -6765002752.0000
Epoch [1/5], Step [100/332], Loss: -7062084096.0000
Epoch [1/5], Step [150/332], Loss: -8598843392.0000
Epoch [1/5], Step [200/332], Loss: -6564698112.0000
Epoch [1/5], Step [250/332], Loss: -8970320896.0000
Epoch [1/5], Step [300/332], Loss: -7301060096.0000
Epoch [2/5], Step [50/332], Loss: -10095347712.0000
Epoch [2/5], Step [100/332], Loss: -9870902272.0000
Epoch [2/5], Step [150/332], Loss: -10928656384.0000
Epoch [2/5], Step [200/332], Loss: -10719845376.0000
Epoch [2/5], Step [250/332], Loss: -11512903680.0000
Epoch [2/5], Step [300/332], Loss: -11539314688.0000
Epoch [3/5], Step [50/332], Loss: -10926978048.0000
Epoch [3/5], Step [100/332], Loss: -12152360960.0000
Epoch [3/5], Step [150/332], Loss: -11847414784.0000
Epoch [3/5], Step [200/332], Loss: -12777988096.0000
Epoch [3/5], Step [250/332], Loss: -13549804544.0000
Epoch [3/5], Step [300/332], Loss: -14111321088.0000
Epoch [4/5], Step [50/332], Loss: -14587561984.0000
Epoc