# Speech recognition on SpeechCommands with VGG16

This notebook trains a VGG16 from scratch on SpeechCommands
to classify samples into its 35 classes.
With the given hyperparameters, it achieves 0.972 test accuracy
after 12 epochs (around 1M samples seen).

## Imports

In [None]:
import random

import numpy as np
import torch
import torch.optim as optim
from torch import nn

from topography.models import speech_vgg
from topography.training import Writer, evaluate, train
from topography.utils import LinearWarmupCosineAnnealingLR
from topography.utils.data import SpeechCommands

## Hyperparameters and random seed

In [None]:
# Run bookkeeping.
root = "./speech_commands"  # directory handed to the run's Writer
seed = 0  # single value used to seed every random number generator

# Training schedule.
epochs = 12  # number of passes over the training set
batch_size = 256  # samples per mini-batch in every DataLoader

# SGD settings.
lr = 0.01  # base learning rate given to the optimizer
momentum = 0.9  # SGD momentum coefficient
weight_decay = 0.01  # weight decay coefficient

In [None]:
# Seed every random number generator used by the notebook with the same
# value: Python's `random`, NumPy, and PyTorch (CPU and all CUDA devices).
for seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

# cuDNN settings: turn on the deterministic flag and turn off benchmark
# mode, so that repeated runs are intended to be reproducible.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Data loading

In [None]:
# All three subsets are read from the same dataset directory.
data_dir = "../../data"

# DataLoader options common to every split; only `shuffle` differs.
loader_options = dict(batch_size=batch_size, num_workers=2, pin_memory=True)

# Training split, shuffled. `build=True` is passed for this subset only
# (presumably it prepares the dataset on disk -- confirm in SpeechCommands).
train_set = SpeechCommands(data_dir, subset="training", build=True)
train_loader = torch.utils.data.DataLoader(
    train_set, shuffle=True, **loader_options
)

# Validation split, iterated in a fixed order.
val_set = SpeechCommands(data_dir, subset="validation")
val_loader = torch.utils.data.DataLoader(
    val_set, shuffle=False, **loader_options
)

# Test split, iterated in a fixed order.
test_set = SpeechCommands(data_dir, subset="testing")
test_loader = torch.utils.data.DataLoader(
    test_set, shuffle=False, **loader_options
)

## Defining the main components

In [None]:
# Run on the GPU when PyTorch reports CUDA as available, else on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Loss, network (moved to the selected device) and SGD optimizer.
criterion = nn.CrossEntropyLoss()
model = speech_vgg().to(device)
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay,
)

# Learning-rate schedule: the warm-up length is 30% of the total epochs.
# NOTE(review): this product is a float and is non-integer for epochs=12
# (~3.6) -- confirm the scheduler accepts a fractional `warmup_epochs`.
warmup_length = epochs * 0.3
scheduler = LinearWarmupCosineAnnealingLR(
    optimizer, warmup_epochs=warmup_length, max_epochs=epochs
)

# Writer rooted at the output directory; record the run configuration.
writer = Writer(root)
run_config = {
    "epochs": epochs,
    "batch_size": batch_size,
    "lr": lr,
    "weight_decay": weight_decay,
    "momentum": momentum,
    "model": "speech_vgg",
    "optimizer": "sgd",
    "scheduler": "LinearWarmupCosineAnnealingLR",
}
writer.log_config(run_config)

## Training the model

In [None]:
# Positional arguments shared by the train and evaluate calls.
shared_args = (criterion, device, writer)
# Objects passed to `writer.save` after every epoch.
checkpoint_state = dict(model=model, optimizer=optimizer, scheduler=scheduler)

for _epoch in range(epochs):
    # One call to `train` on the training loader ...
    train(model, train_loader, optimizer, *shared_args, is_pytorch_loss=True)
    # ... followed by one call to `evaluate` on the validation loader.
    evaluate(model, val_loader, *shared_args, mode="val", is_pytorch_loss=True)
    # The learning-rate schedule advances once per epoch.
    scheduler.step()
    # Presumably checkpoints according to validation accuracy -- confirm
    # the meaning of ("val", "acc") in Writer.save.
    writer.save("val", "acc", **checkpoint_state)

## Final evaluation

In [None]:
# Evaluate the model as it stands after the last epoch on the test loader,
# then close the writer.
test_args = (model, test_loader, criterion, device, writer)
evaluate(*test_args, mode="test", is_pytorch_loss=True)
writer.close()