# Birdsong detection with VGG16

The following notebook trains a VGG16 from scratch on Bird-DCASE
to detect birdsong and achieves (around 1M samples seen)
with the given hyperparameters.

## Imports

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random

import numpy as np
import torch
import torch.optim as optim
from torch import nn

from topography.models import speech_vgg
from topography.training import Writer, train, evaluate
from topography.utils import LinearWarmupCosineAnnealingLR
from topography.utils.data import BirdDCASE, evaluate_with_crop

## Hyperparameters and random seed

In [3]:
seed = 0  # Random seed
root = "./bird_dcase"  # Output directory

epochs = 12 
batch_size = 256  # Batch size
lr = 0.01  # Base learning rate
weight_decay = 0.01  # Weight decay
momentum = 0.9  # SGD momentum

In [4]:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

## Data loading

In [5]:
# validation_set = "warblrb10k"
validation_set = "BirdVox-DCASE-20k"
duration = 1

train_set = BirdDCASE(
    "../../data",
    subset="training",
    validation_set=validation_set,
    crop=True,
    duration=duration
)
train_loader = torch.utils.data.DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)
val_set = BirdDCASE(
    "../../data",
    subset="validation", 
    validation_set=validation_set,
    crop=False,
)

In [7]:
epochs = 1 + 1_000_000 // len(train_set)  # Number of training epochs
print(epochs)

64


## Defining the main componenents

In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
criterion = nn.CrossEntropyLoss()
model = speech_vgg(num_classes=2).to(device)
optimizer = optim.SGD(model.parameters(),
                      lr=lr, momentum=momentum, weight_decay=weight_decay)

scheduler = LinearWarmupCosineAnnealingLR(
    optimizer, warmup_epochs=epochs*0.3, max_epochs=epochs)
writer = Writer(root)

writer.log_hparams(epochs=epochs, batch_size=batch_size, lr=lr,
                   weight_decay=weight_decay, momentum=momentum,
                   model="speech_vgg",
                   optimizer='sgd', scheduler='LinearWarmupCosineAnnealingLR')

## Training the model

In [None]:
for _ in range(epochs):
    train(
        model,
        train_loader,
        optimizer,
        criterion,
        device,
        writer,
        is_pytorch_loss=True,
    )
    evaluate_with_crop(
        model,
        val_set,
        device,
        writer,
        mode="val",
        duration=duration
    )
    scheduler.step()
    writer.save(
        "val", "acc", model=model, optimizer=optimizer, scheduler=scheduler
    )

train, epoch 1:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 1, roc_auc 0.505, acc 0.506


train, epoch 2:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 2, roc_auc 0.439, acc 0.438


train, epoch 3:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 3, roc_auc 0.468, acc 0.468


train, epoch 4:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 4, roc_auc 0.421, acc 0.420


train, epoch 5:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 5, roc_auc 0.412, acc 0.412


train, epoch 6:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 6, roc_auc 0.449, acc 0.449


train, epoch 7:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 7, roc_auc 0.465, acc 0.465


train, epoch 8:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 8, roc_auc 0.505, acc 0.505


train, epoch 9:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 9, roc_auc 0.500, acc 0.500


train, epoch 10:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 10, roc_auc 0.516, acc 0.515


train, epoch 11:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 11, roc_auc 0.536, acc 0.535


train, epoch 12:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 12, roc_auc 0.532, acc 0.532


train, epoch 13:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 13, roc_auc 0.525, acc 0.524


train, epoch 14:   0%|          | 0/62 [00:00<?, ?it/s]

val, epoch 14, roc_auc 0.528, acc 0.528


train, epoch 15:   0%|          | 0/62 [00:00<?, ?it/s]

## Final evaluation

In [None]:
evaluate_with_crop(
    model,
    val_set,
    device,
    writer,
    mode="test",
    duration=duration
)
# writer.close()

In [None]:
train_set_nocrop = BirdDCASE(
    "../../data",
    subset="training",
    validation_set=validation_set,
    crop=False,
    duration=duration
)

evaluate_with_crop(
    model,
    train_set_nocrop,
    device,
    writer,
    mode="train_test",
    duration=duration
)
writer.close()