# Homework (10 points)

In this homework we train Sound Event Detection model.

Dataset: https://disk.yandex.ru/d/NRpDIp4jg2ODqg

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import requests
import tqdm.notebook as tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torch_data
import torchaudio
import urllib

# realization of Dataset for given data
import dataset

from IPython.display import clear_output

%matplotlib inline

In [2]:
# base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'
# public_key = 'https://disk.yandex.ru/d/NRpDIp4jg2ODqg'
# final_url = base_url + urllib.parse.urlencode(dict(public_key=public_key))
# response = requests.get(final_url)
# download_url = response.json()['href']
# !wget -O data.tar.gz "{download_url}"
# !tar -xf data.tar.gz

In [16]:
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'
print(f"using {DEVICE}")
    
DATADIR = 'data'
LOADER_WORKERS = 8

using mps


In [4]:
# FBANK 80 by default, but you can choose something else
FEATS = 80
transform = torchaudio.transforms.MelSpectrogram(n_mels=FEATS)
trainset = dataset.Dataset('train', 'data', transform)
testset = dataset.Dataset('eval', 'data', transform)
N_CLASSES = trainset.classes()

In [5]:
trainset[124][0].shape

torch.Size([80, 801])

### Eval part (1 point)

Write balanced accuracy:
$$BAcc = \frac{1}{classes}\sum_{c = 1}^{classes} \frac{\sum_i^n I(y_i = p_i = c)}{\sum_i^n I(y_i = c)}$$

Where:
- $y_i$ -- target class for $i$ element
- $p_i$ -- predicted class for $i$ element

In [6]:
# Get list of pairs (target_class, predicted_class)
def balanced_accuracy(items: list[tuple[int, int]]) -> float:
    classes = np.unique([item[0] for item in items])
    acc = 0
    for cls in classes:
        acc += sum([item[0] == item[1] == cls for item in items]) / sum([item[0] == cls for item in items])
    return acc / len(classes)

In [7]:
assert np.isclose(balanced_accuracy([(0, 0), (0, 0), (1, 1)]), 1.0)
assert np.isclose(balanced_accuracy([(0, 1), (1, 0)]), 0.0)
assert np.isclose(balanced_accuracy([(0, 0), (0, 0), (1, 0)]), 0.5)
assert np.isclose(balanced_accuracy([(0, 0), (1, 1), (0, 0), (0, 0), (1, 0), (0, 1)]), 0.625)
assert np.isclose(balanced_accuracy([(1, 1), (0, 1), (2, 2)]), 0.66666666666666)

### Train part (9 points)

Train some model with test accuracy > 0.5

You can train any model you want. The only limitation is that it must be trained from scratch on the data provided in the task. For example you can choose model from:
- DNN
- CNN 1d
- CNN 2d
- Transformer
- RNN
- mixes of given models

Hints:
- No need to train large models for this task. 10 million parameters is more than you need.
- Watch to overfitting, try to add Augmentation, Dropout, BatchNorm, L1/L2-Regulatization or something else.
- Use poolings or strides to reduce time-dimenstion. It is better to reduce the dimension gradually rather than at the end.
- Try different features (mel-spec, log-mel-spec, mfcc)

P.S. Points can be deducted for unclear training charts

PP.S. A partial score will be awarded for a test accuracy < 0.5

PPP.S. Add log to Melspectrogram in torchaudio.transform

In [8]:
# !pip install wandb
# to begin with, you should create an account on WanbB website

import wandb

# this will ask your for your API key
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /Users/dimaaspisov/.netrc


True

In [23]:
def stage(
    model: nn.Module,
    data: dataset.Dataset,
    opt: optim.Optimizer,
    batch_size: int = 256,
    train: bool = True,
):
    loader = torch_data.DataLoader(
        data,
        batch_size=batch_size,
        shuffle=True,
        num_workers=LOADER_WORKERS,
        collate_fn=dataset.collate_fn,
    )
    if train:
        model.train()
    else:
        model.eval()
    loss_sum, batches = 0.0, 0
    pred_pairs = []
    for X, Y in tqdm.tqdm(loader):
        pred = model.forward(X.to(DEVICE))
        loss = F.cross_entropy(pred.squeeze(), Y.squeeze().to(DEVICE))
        if train:
            opt.zero_grad()
            loss.backward()
            opt.step()
        loss_sum += loss.item()
        batches += 1
        with torch.no_grad():
            pred_pairs.extend(
                zip(
                    Y.data.numpy().reshape(-1),
                    torch.argmax(pred, dim=1).cpu().data.numpy().reshape(-1),
                )
            )
    return loss_sum / batches, balanced_accuracy(pred_pairs)


def train(
    model: nn.Module,
    opt,
    batch_size: int = 64,
    epochs: int = 10,
):
    train_losses, test_losses, train_accs, test_accs = [], [], [], []
    for epoch in range(epochs):
        train_loss, train_acc = stage(
            model, trainset, opt, batch_size=batch_size
        )
        test_loss, test_acc = stage(
            model, testset, opt, batch_size=batch_size, train=False
        )
        wandb.log(
            {
                "train_loss": train_loss,
                "train_acc": train_acc,
                "test_loss": test_loss,
                "test_acc": test_acc,
            },
            step=epoch + 1,
        )

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        test_losses.append(test_loss)
        test_accs.append(test_acc)
        clear_output()
        fig, axis = plt.subplots(1, 2, figsize=(15, 7))
        axis[0].plot(np.arange(1, epoch + 2), train_losses, label="train")
        axis[0].plot(np.arange(1, epoch + 2), test_losses, label="test")
        axis[1].plot(np.arange(1, epoch + 2), train_accs, label="train")
        axis[1].plot(np.arange(1, epoch + 2), test_accs, label="test")
        axis[0].set(xlabel="epoch", ylabel="CE Loss")
        axis[1].set(xlabel="epoch", ylabel="Accuracy")
        fig.legend()
        plt.show()
        print(f"Epoch {epoch + 1}.")
        print(f"Train loss {train_loss}. Train accuracy {train_acc}.")
        print(f"Test loss {test_loss}. Test accuracy {test_acc}")

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SEDModel(nn.Module):
    def __init__(self, in_dim=FEATS, out_dim=N_CLASSES, hidden_size=64):
        super().__init__()
        self.hidden_size = hidden_size

        # Convolutional layers
        self.conv_layers = nn.ModuleList([
            self._make_conv_layer(in_dim, hidden_size, 3),
            self._make_conv_layer(hidden_size, hidden_size * 2, 3),
            self._make_conv_layer(hidden_size * 2, hidden_size * 4, 3),
        ])

        # Adaptive pooling to handle variable input lengths
        self.adaptive_pool = nn.AdaptiveAvgPool1d(50)

        # Fully connected layers
        self.fc_layers = nn.Sequential(
            nn.Linear(hidden_size * 4 * 50, hidden_size * 4),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(hidden_size * 4, hidden_size * 2),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            nn.Linear(hidden_size * 2, out_dim)
        )

        # Add Temporal Attention
        self.attention = TemporalAttention(hidden_size * 4)

        print(f"Total parameters: {self.count_parameters()/1e6:.02f}M")

    def _make_conv_layer(self, in_channels, out_channels, kernel_size):
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size, padding=kernel_size//2),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
            nn.MaxPool1d(2)
        )

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

    def forward(self, x):
        # x shape: (batch_size, in_dim, length)
        for conv_layer in self.conv_layers:
            x = conv_layer(x)

        # Apply attention
        x = self.attention(x)

        # Adaptive pooling
        x = self.adaptive_pool(x)

        # Flatten
        x = x.view(x.size(0), -1)

        # Fully connected layers
        logits = self.fc_layers(x)

        return logits

class TemporalAttention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, 1)
        )

    def forward(self, x):
        # x shape: (batch_size, hidden_size, time_steps)
        attention_weights = F.softmax(self.attention(x.transpose(1, 2)).squeeze(-1), dim=1)
        attended_x = torch.bmm(x, attention_weights.unsqueeze(-1)).squeeze(-1)
        return attended_x.unsqueeze(-1)  # Reshape to (batch_size, hidden_size, 1)

In [31]:
model = SEDModel().to(DEVICE)
opt = optim.Adam(model.parameters())

Total parameters: 3.49M


In [33]:
with wandb.init(
                project="ysda_speech_week02_SED",
                name="conv+attention"
            ) as run:
    train(model, opt)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011160015733265836, max=1.0…

  0%|          | 0/410 [00:00<?, ?it/s]

Traceback (most recent call last):
  File "/var/folders/_w/w_4qyk_x2mb0twsw_84xz5880000gn/T/ipykernel_33428/237800860.py", line 5, in <module>
    train(model, opt)
  File "/var/folders/_w/w_4qyk_x2mb0twsw_84xz5880000gn/T/ipykernel_33428/2553845526.py", line 48, in train
    train_loss, train_acc = stage(
                            ^^^^^^
  File "/var/folders/_w/w_4qyk_x2mb0twsw_84xz5880000gn/T/ipykernel_33428/2553845526.py", line 22, in stage
    pred = model.forward(X.to(DEVICE))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/_w/w_4qyk_x2mb0twsw_84xz5880000gn/T/ipykernel_33428/1793830011.py", line 56, in forward
    x = self.adaptive_pool(x)
        ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1553, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1562, in _call_i

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

RuntimeError: Adaptive pool MPS: input sizes must be divisible by output sizes. Non-divisible input sizes are not implemented on MPS device yet. For now, you can manually transfer tensor to cpu in this case. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/96056)