# Birdsong detection with VGG16

The following notebook trains a VGG16 from scratch on Bird-DCASE
to detect birdsong and achieves 0.816 test accuracy and 0.816 ROC AUC
after 36 epochs (around 1M seconds of training data seen) with the given hyperparameters.

The dataset is prepared by splitting each intermediate dataset into 3 subsets
for training, validation and testing (with 80%, 10% and 10% of the data), then
merging the matching subsets across the intermediate datasets.

## Imports

In [None]:
import random
from collections import Counter

import numpy as np
import torch
import torch.optim as optim
from torch import nn

from topography.models import speech_vgg
from topography.training import Writer, train, evaluate
from topography.utils import LinearWarmupCosineAnnealingLR
from topography.utils.data import BirdDCASE, evaluate_avg_voting

## Hyperparameters and random seed

In [None]:
# Reproducibility and output location.
seed = 0  # Random seed
root = "./bird_dcase"  # Output directory

# Optimisation settings.
batch_size = 256  # Batch size
lr = 0.01  # Base learning rate
momentum = 0.9  # SGD momentum
weight_decay = 0.01  # Weight decay

# Initial epoch count; a later cell recomputes `epochs` from the size of
# the training set, so this value only matters if that cell is skipped.
epochs = 12

In [None]:
# Request deterministic cuDNN kernels and disable the cuDNN auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed the Python, NumPy and PyTorch (CPU, then every CUDA device)
# random number generators with the same value.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

## Data loading

In [None]:
# Set both flags to True the first time this notebook is run; they are
# forwarded to the training-set constructor only.
download = False
process = False

# The three splits are read from the same data directory.
data_dir = "../../data"
train_set = BirdDCASE(
    data_dir, "training", download=download, process=process
)
val_set = BirdDCASE(data_dir, "validation")
test_set = BirdDCASE(data_dir, "testing")

# Only the training split is wrapped in a DataLoader here; the validation
# and testing datasets are passed directly to the evaluation helper.
loader_options = {
    "batch_size": batch_size,
    "shuffle": True,
    "num_workers": 2,
    "pin_memory": True,
}
train_loader = torch.utils.data.DataLoader(train_set, **loader_options)

In [None]:
# For each split, count how many metadata entries carry each `datasetid`,
# i.e. how the split is distributed across the source datasets.
splits = (
    ("Training:\t", train_set),
    ("Validation:\t", val_set),
    ("Testing:\t", test_set),
)
for name, dataset in splits:
    sources = Counter(meta.datasetid for meta in dataset.metadata.values())
    print(name, sources)

In [None]:
# Smallest number of full passes over the training set whose total item
# count exceeds the budget below. This replaces any earlier `epochs` value.
samples_budget = 1_000_000
epochs = samples_budget // len(train_set) + 1
print("Number of training epochs:", epochs)

## Defining the main components

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Loss, two-class model and SGD optimizer.
criterion = nn.CrossEntropyLoss()
model = speech_vgg(num_classes=2).to(device)
optimizer = optim.SGD(
    model.parameters(),
    lr=lr,
    momentum=momentum,
    weight_decay=weight_decay,
)

# The warmup length is set to 30% of the total number of epochs.
# NOTE(review): `epochs * 0.3` is a float; confirm that the scheduler
# accepts a fractional `warmup_epochs`.
scheduler = LinearWarmupCosineAnnealingLR(
    optimizer,
    warmup_epochs=epochs * 0.3,
    max_epochs=epochs,
)

# Writer rooted at the output directory; the run configuration is logged
# through it before training starts.
writer = Writer(root)
writer.log_config(
    {
        "epochs": epochs,
        "batch_size": batch_size,
        "lr": lr,
        "weight_decay": weight_decay,
        "momentum": momentum,
        "model": "speech_vgg",
        "optimizer": "sgd",
        "scheduler": "LinearWarmupCosineAnnealingLR",
    }
)

## Training the model

In [None]:
# Objects handed to the writer at the end of every epoch. They are the same
# instances throughout training, so the mapping is built once.
checkpoint = {
    "model": model,
    "optimizer": optimizer,
    "scheduler": scheduler,
}

for _ in range(epochs):
    # One pass over the training loader.
    train(
        model, train_loader, optimizer, criterion, device, writer,
        is_pytorch_loss=True,
    )
    # Evaluate on the validation dataset, logging under the "val" mode.
    evaluate_avg_voting(model, val_set, device, writer, mode="val")
    # The learning-rate schedule advances once per epoch, before saving.
    scheduler.step()
    # Presumably checkpoints according to the validation accuracy -- see
    # `Writer.save` for the exact policy.
    writer.save("val", "acc", **checkpoint)

## Final evaluation

In [None]:
# Evaluate the model on the held-out testing dataset, logging under the
# "test" mode of the writer.
evaluate_avg_voting(model, test_set, device, writer, mode="test")
# Close the writer once the last results have been logged.
writer.close()