# Task 1 - Data exploration

In [1]:
# Cell 1: Imports
import os
from glob import glob
import torchaudio
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


In [2]:
# Cell 2: Dataset + helpers

class AccentAudioDataset(Dataset):
    """
    Lazily loads .wav files and extracts accent & gender from filename.
    Filenames like '3f_utterance123.wav' where:
      - first char = accent [1-5]
      - second char = gender 'm' or 'f'
    """
    def __init__(self, data_dir, transform=None):
        self.filepaths = sorted(glob(os.path.join(data_dir, '*.wav')))
        if not self.filepaths:
            raise RuntimeError(f"[AccentAudioDataset] No .wav files in {data_dir!r}")
        self.transform = transform

    def __len__(self):
        return len(self.filepaths)

    def __getitem__(self, idx):
        path = self.filepaths[idx]
        waveform, sr = torchaudio.load(path)          # [1, L]
        if sr != 16000:
            waveform = torchaudio.functional.resample(waveform, sr, 16000)
        # online standardization
        waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-9)
        fname = os.path.basename(path)
        accent = int(fname[0]) - 1                   # 0–4
        gender = 0 if fname[1] == 'm' else 1
        return waveform.squeeze(0), accent, gender

def collate_fn(batch):
    """ Pad to batch-max length, return lengths + labels """
    waves, accents, genders = zip(*batch)
    lengths = torch.tensor([w.size(0) for w in waves], dtype=torch.long)
    padded  = pad_sequence(waves, batch_first=True)
    return padded, lengths, torch.tensor(accents), torch.tensor(genders)

def compute_dataset_stats(data_dir, sample_rate=16000):
    """ Scan WAVs for duration stats; errors if folder’s empty """
    paths = glob(os.path.join(data_dir, '*.wav'))
    if not paths:
        raise RuntimeError(f"[compute_dataset_stats] No .wav files in {data_dir!r}")
    durations = torch.tensor([torchaudio.info(p).num_frames / sample_rate for p in paths])
    return {
        'count': len(durations),
        'min_s':  float(durations.min()),
        'max_s':  float(durations.max()),
        'mean_s': float(durations.mean()),
        'p90_s':  float(durations.kthvalue(int(0.9*len(durations))).values)
    }


In [3]:
# Cell 3: Point to your folders & verify
# ↳ Copy paths exactly, no trailing spaces ↓
TRAIN_DIR = "/Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Train"
TEST_DIR  = "/Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Test"

# Strip accidental whitespace just in case
TRAIN_DIR = TRAIN_DIR.strip()
TEST_DIR  = TEST_DIR.strip()

print("TRAIN_DIR:", TRAIN_DIR, "→ exists?", os.path.isdir(TRAIN_DIR))
print("  sample files:", os.listdir(TRAIN_DIR)[:5])
print(" TEST_DIR:", TEST_DIR, "→ exists?", os.path.isdir(TEST_DIR))
print("  sample files:", os.listdir(TEST_DIR)[:5])


TRAIN_DIR: /Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Train → exists? True
  sample files: ['2m_9039.wav', '4f_1887.wav', '4f_9571.wav', '1m_3736.wav', '1m_3078.wav']
 TEST_DIR: /Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Test → exists? True
  sample files: ['9430.wav', '4458.wav', '1534.wav', '8510.wav', '7192.wav']


In [16]:
# Cell 1a: Monkey-patch gender parsing in existing AccentAudioDataset

# Fetch original class
OrigDataset = AccentAudioDataset

# Create a subclass to override only gender logic
class FixedAccentAudioDataset(OrigDataset):
    def __getitem__(self, idx):
        waveform, accent, gender = super().__getitem__(idx)
        # re-parse gender from filename case-insensitive
        fname = os.path.basename(self.filepaths[idx])
        gender_char = fname[1].lower()
        if gender_char == 'm':
            gender = 0
        elif gender_char == 'f':
            gender = 1
        else:
            raise ValueError(f"Unrecognized gender '{fname[1]}'")
        return waveform, accent, gender

# Replace dataset class
AccentAudioDataset = FixedAccentAudioDataset

# Recreate loaders without re-importing torchaudio
train_ds = AccentAudioDataset(TRAIN_DIR)
test_ds  = AccentAudioDataset(TEST_DIR)
train_loader = DataLoader(train_ds, batch_size=32, collate_fn=collate_fn, num_workers=0)
test_loader  = DataLoader(test_ds,  batch_size=32, collate_fn=collate_fn, num_workers=0)

In [4]:
# Cell 4 (fast sanity check): instantiate datasets, build loaders & grab one batch

# 1) Instantiate datasets
train_ds = AccentAudioDataset(TRAIN_DIR)
test_ds  = AccentAudioDataset(TEST_DIR)

# 2) Create DataLoaders with num_workers=0 to avoid spawn overhead in Jupyter
train_loader = DataLoader(
    train_ds,
    batch_size=32,
    shuffle=True,
    num_workers=0,       # no worker processes
    pin_memory=False,    # lower overhead in notebook
    collate_fn=collate_fn
)
test_loader = DataLoader(
    test_ds,
    batch_size=32,
    shuffle=False,
    num_workers=0,
    pin_memory=False,
    collate_fn=collate_fn
)

# 3) Sanity-check one batch from each loader
for name, loader in [("Train", train_loader), ("Test", test_loader)]:
    waves, lengths, accents, genders = next(iter(loader))
    print(f"{name} batch → waves: {waves.shape}, lengths: {lengths.shape}")


Train batch → waves: torch.Size([32, 141994]), lengths: torch.Size([32])
Test batch → waves: torch.Size([32, 150186]), lengths: torch.Size([32])


# Task 2 - Raw signal 1d cnn

In [5]:
# Cell 5: Define Raw-Signal 1D CNN
import torch.nn as nn
import torch.nn.functional as F

class RawCNN(nn.Module):
    """
    1D CNN for raw waveform classification (5 accents).
    """
    def __init__(self, num_classes=5, dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
            # Conv stage 1: 1→16 channels, downsample by 4
            nn.Conv1d(1, 16, kernel_size=7, stride=2, padding=3),
            nn.BatchNorm1d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2),  # seq_len / 4 total

            # Conv stage 2: 16→32, downsample by 2
            nn.Conv1d(16, 32, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm1d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2),  # seq_len / 8 total

            # Conv stage 3: 32→64, downsample by 2
            nn.Conv1d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool1d(2),  # seq_len / 16 total

            nn.AdaptiveAvgPool1d(1),  # collapse time dim
            nn.Flatten(),             # → [B, 64]
            nn.Dropout(dropout),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        """
        x: FloatTensor [B, T] raw wave
        """
        # add channel dim
        x = x.unsqueeze(1)  # [B,1,T]
        return self.net(x)


In [6]:
# Cell 6: Device selection & forward pass test (using MPS if available)

import torch

# 1) Select device: MPS (Apple GPU) if available, else CPU
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print("Using device:", device)

# 2) Move model to device
model = RawCNN(num_classes=5, dropout=0.2).to(device)
model.eval()

# 3) Grab one batch, move to device, and test forward pass
with torch.no_grad():
    waves, lengths, accents, genders = next(iter(train_loader))
    waves = waves.to(device)
    logits = model(waves)
    assert logits.shape == (waves.size(0), 5), f"Expected [B,5], got {logits.shape}"
    print(f"✅ RawCNN forward pass on {device} OK: {logits.shape}")


Using device: mps
✅ RawCNN forward pass on mps OK: torch.Size([32, 5])


In [7]:
# Cell 7: Training & Evaluation Functions with MPS Support

import time
import torch
import torch.nn.functional as F

def train_one_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    start = time.time()
    for waves, lengths, accents, _ in loader:
        waves, accents = waves.to(device), accents.to(device)
        optimizer.zero_grad()
        logits = model(waves)
        loss = F.cross_entropy(logits, accents)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        total_loss += loss.item() * waves.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == accents).sum().item()
        total += waves.size(0)
    elapsed = time.time() - start
    return total_loss / total, correct / total, elapsed

@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    for waves, lengths, accents, _ in loader:
        waves, accents = waves.to(device), accents.to(device)
        logits = model(waves)
        loss = F.cross_entropy(logits, accents)

        total_loss += loss.item() * waves.size(0)
        preds = logits.argmax(dim=1)
        correct += (preds == accents).sum().item()
        total += waves.size(0)
    return total_loss / total, correct / total

# Quick function test (one batch)
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
model = RawCNN(num_classes=5, dropout=0.2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

train_loss, train_acc, train_time = train_one_epoch(model, train_loader, optimizer, device)
val_loss,   val_acc   = evaluate(model, test_loader, device)
print(f"Train   → loss: {train_loss:.4f}, acc: {train_acc:.3f}, time: {train_time:.1f}s")
print(f"Validate→ loss: {val_loss:.4f}, acc: {val_acc:.3f}")


Train   → loss: 1.5678, acc: 0.297, time: 17.8s
Validate→ loss: 0.8872, acc: 0.105


In [20]:
def evaluate_and_report(model, loader, device):
    import numpy as np
    from sklearn.metrics import classification_report, confusion_matrix

    model.eval()
    all_preds, all_labels, all_genders = [], [], []
    with torch.no_grad():
        for waves, lengths, accents, genders in loader:
            waves = waves.to(device)
            logits = model(waves)
            preds = logits.argmax(dim=1).cpu().numpy()
            all_preds.append(preds)
            all_labels.append(accents.numpy())
            all_genders.append(genders.numpy())

    all_preds   = np.concatenate(all_preds)
    all_labels  = np.concatenate(all_labels)
    all_genders = np.concatenate(all_genders)

    labels       = [0, 1, 2, 3, 4]
    target_names = [f"Accent {i}" for i in range(1, 6)]

    print("Classification Report (Accents):")
    print(classification_report(
        all_labels,
        all_preds,
        labels=labels,
        target_names=target_names,
        zero_division=0
    ))

In [22]:
# Cell 8: Full Training Loop with Scheduler & Early Stopping (using val_loader)

num_epochs = 5
patience = 2
best_val_loss = float('inf')
epochs_no_improve = 0

# Ensure you're using the RawCNN model and its optimizer
model_raw     = RawCNN(num_classes=5, dropout=0.3).to(device)
optimizer_raw = torch.optim.Adam(model_raw.parameters(), lr=1e-3, weight_decay=1e-3)
scheduler_raw = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_raw, mode='min', factor=0.5, patience=1
)

for epoch in range(1, num_epochs + 1):
    # — Training on train_loader —
    tr_loss, tr_acc, tr_time = train_one_epoch(
        model_raw, train_loader, optimizer_raw, device
    )
    # — Validation on val_loader —
    vl_loss, vl_acc = evaluate(
        model_raw, val_loader, device
    )
    # Scheduler step on validation loss
    scheduler_raw.step(vl_loss)

    print(f"Epoch {epoch:02d} | "
          f"Train Loss: {tr_loss:.4f}, Acc: {tr_acc:.3f} | "
          f"Val   Loss: {vl_loss:.4f}, Acc: {vl_acc:.3f} | "
          f"Time: {tr_time:.1f}s")

    # Early stopping check
    if vl_loss < best_val_loss - 1e-4:
        best_val_loss = vl_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve > patience:
            print(f"⏹ Early stopping at epoch {epoch}")
            break

# After training, run your detailed report on validation set
print("\nFinal Validation Set Metrics:")
evaluate_and_report(model_raw, val_loader, device)

Epoch 01 | Train Loss: 1.5783, Acc: 0.283 | Val   Loss: 1.5457, Acc: 0.329 | Time: 29.9s
Epoch 02 | Train Loss: 1.5294, Acc: 0.352 | Val   Loss: 1.5066, Acc: 0.395 | Time: 36.8s
Epoch 03 | Train Loss: 1.4947, Acc: 0.383 | Val   Loss: 1.4795, Acc: 0.392 | Time: 47.5s
Epoch 04 | Train Loss: 1.4527, Acc: 0.409 | Val   Loss: 1.4466, Acc: 0.414 | Time: 38.2s
Epoch 05 | Train Loss: 1.4241, Acc: 0.421 | Val   Loss: 1.4030, Acc: 0.471 | Time: 35.5s

Final Validation Set Metrics:
Classification Report (Accents):
              precision    recall  f1-score   support

    Accent 1       0.74      0.79      0.76       146
    Accent 2       0.34      0.27      0.30       126
    Accent 3       0.00      0.00      0.00       113
    Accent 4       0.39      0.78      0.52       156
    Accent 5       0.43      0.30      0.36        92

    accuracy                           0.47       633
   macro avg       0.38      0.43      0.39       633
weighted avg       0.40      0.47      0.42       633



In [23]:
# Cell 10: Spectrogram CNN Definition & Forward Test
import torch.nn as nn

class SpectrogramCNN(nn.Module):
    def __init__(self, num_classes=5, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(True),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(2),

            nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten(),
            nn.Dropout(dropout),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        return self.net(x)

# # Forward‐pass test on MPS/CPU
# device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
# model = SpectrogramCNN().to(device)
# with torch.no_grad():
#     specs = specs.to(device)
#     logits = model(specs)
#     print("✅ SpectrogramCNN forward:", logits.shape)


In [24]:
# ── Cell 9 (replace your old Spectrogram DataLoader cell) ──

from torch.utils.data import Subset

# 1) Build the full SpectrogramDataset over your TRAIN_DIR
spec_full_ds = SpectrogramDataset(TRAIN_DIR, mel_transform)

# 2) Carve out the same 80/20 split you used for raw-wave data
#    `train_ds` and `val_ds` are Subset objects over the raw AccentAudioDataset
#    and they carry `.indices` into the file list. We reuse those here.
spec_train_ds = Subset(spec_full_ds, train_ds.indices)
spec_val_ds   = Subset(spec_full_ds, val_ds.indices)

# 3) DataLoaders
spec_train_loader = DataLoader(
    spec_train_ds, batch_size=32, shuffle=True,
    num_workers=0, pin_memory=False,
    collate_fn=spectrogram_collate_fn
)
spec_val_loader = DataLoader(
    spec_val_ds, batch_size=32, shuffle=False,
    num_workers=0, pin_memory=False,
    collate_fn=spectrogram_collate_fn
)

# 4) Sanity check
batch = next(iter(spec_train_loader))
print("SpecTrain batch:", batch[0].shape, batch[1].shape)

SpecTrain batch: torch.Size([32, 1, 64, 486]) torch.Size([32])


In [25]:
# Cell 10: Spectrogram CNN Definition & Forward Test
import torch.nn as nn

class SpectrogramCNN(nn.Module):
    def __init__(self, num_classes=5, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(True),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(True),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(True),
            nn.MaxPool2d(2),

            nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten(),
            nn.Dropout(dropout),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        return self.net(x)

# ── Cell 10 (replace your old SpectrogramCNN one-batch test) ──

model_spec = SpectrogramCNN(num_classes=5, dropout=0.3).to(device)
model_spec.eval()
with torch.no_grad():
    specs, lengths, accents, genders = next(iter(spec_train_loader))
    specs = specs.to(device)
    logits = model_spec(specs)
    assert logits.shape == (specs.size(0), 5), logits.shape
print("✅ SpectrogramCNN forward on spec_train_loader OK:", logits.shape)

✅ SpectrogramCNN forward on spec_train_loader OK: torch.Size([32, 5])


In [11]:
# Cell 11: Train & Eval SpectrogramCNN on MPS/CPU (one‐batch smoke test)

import torch.optim as optim

# 1) Device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print("Using device:", device)

# 2) Instantiate model, optimizer
model_spec = SpectrogramCNN(num_classes=5, dropout=0.3).to(device)
optimizer_spec = optim.Adam(model_spec.parameters(), lr=1e-3)

# 3) Quick one‐batch train & eval
train_loss, train_acc, _ = train_one_epoch(model_spec, spec_train_loader, optimizer_spec, device)
val_loss,   val_acc       = evaluate   (model_spec, spec_test_loader,        device)

print(f"SpectrogramCNN → Train loss: {train_loss:.4f}, acc: {train_acc:.3f}")
print(f"                 Val   loss: {val_loss:.4f}, acc: {val_acc:.3f}")


Using device: mps
SpectrogramCNN → Train loss: 1.5469, acc: 0.315
                 Val   loss: 0.9278, acc: 0.109


In [19]:
# Cell X: Create train/val split for local evaluation
from torch.utils.data import random_split

# Full dataset on the labeled Train folder
full_ds = AccentAudioDataset(TRAIN_DIR)

# 80/20 split
n = len(full_ds)
n_val = int(0.2 * n)
n_train = n - n_val
train_ds, val_ds = random_split(full_ds, [n_train, n_val], generator=torch.Generator().manual_seed(42))

# DataLoaders
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,
                          num_workers=0, pin_memory=False, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False,
                          num_workers=0, pin_memory=False, collate_fn=collate_fn)

print(f"Split {n} samples → {n_train} train, {n_val} val")

Split 3166 samples → 2533 train, 633 val


In [26]:
# ── Cell 11 (replace your old SpectrogramCNN full-training loop) ──

optimizer_spec = torch.optim.Adam(model_spec.parameters(), lr=1e-3, weight_decay=1e-3)
scheduler_spec = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_spec, mode='min', factor=0.5, patience=1
)

best_val, no_improve = float('inf'), 0
for epoch in range(1, 6):
    tr_loss, tr_acc, tr_time = train_one_epoch(
        model_spec, spec_train_loader, optimizer_spec, device
    )
    vl_loss, vl_acc = evaluate(
        model_spec, spec_val_loader, device
    )
    scheduler_spec.step(vl_loss)
    print(f"Epoch {epoch:02d} | "
          f"Train Acc: {tr_acc:.3f}, Val Acc: {vl_acc:.3f} | "
          f"Time: {tr_time:.1f}s")
    if vl_loss < best_val - 1e-4:
        best_val, no_improve = vl_loss, 0
    else:
        no_improve += 1
        if no_improve > 2:
            print("⏹ Early stopping")
            break

print("\nFinal SpectrogramCNN on val set:")
evaluate_and_report(model_spec, spec_val_loader, device)

Epoch 01 | Train Acc: 0.325, Val Acc: 0.259 | Time: 18.2s
Epoch 02 | Train Acc: 0.393, Val Acc: 0.355 | Time: 10.5s
Epoch 03 | Train Acc: 0.450, Val Acc: 0.479 | Time: 12.0s
Epoch 04 | Train Acc: 0.487, Val Acc: 0.482 | Time: 10.2s
Epoch 05 | Train Acc: 0.527, Val Acc: 0.523 | Time: 11.5s

Final SpectrogramCNN on val set:
Classification Report (Accents):
              precision    recall  f1-score   support

    Accent 1       1.00      0.60      0.75       146
    Accent 2       0.49      0.73      0.58       126
    Accent 3       0.79      0.10      0.17       113
    Accent 4       0.43      0.88      0.58       156
    Accent 5       0.12      0.03      0.05        92

    accuracy                           0.52       633
   macro avg       0.56      0.47      0.43       633
weighted avg       0.59      0.52      0.47       633



In [None]:
# Cell 12: Full Training Loop for SpectrogramCNN with Scheduler & Early Stopping

num_epochs = 5
patience = 2
best_val_loss = float('inf')
epochs_no_improve = 0

# Prepare model, optimizer, scheduler
model_raw = RawCNN(num_classes=5, dropout=0.3).to(device)
optimizer_raw = torch.optim.Adam(model_raw.parameters(), lr=1e-3, weight_decay=1e-3)
scheduler_raw = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_raw, mode='min', factor=0.5, patience=1
)

for epoch in range(1, num_epochs + 1):
    # Train on training split
    tr_loss, tr_acc, tr_time = train_one_epoch(model_raw, train_loader, optimizer_raw, device)
    # Validate on validation split
    vl_loss, vl_acc = evaluate(model_raw, val_loader, device)
    # Step scheduler
    scheduler_raw.step(vl_loss)
    print(f"Epoch {epoch:02d} | "
          f"Train Loss: {tr_loss:.4f}, Acc: {tr_acc:.3f} | "
          f"Val   Loss: {vl_loss:.4f}, Acc: {vl_acc:.3f} | "
          f"Time: {tr_time:.1f}s")
    # Early stopping
    if vl_loss < best_val_loss - 1e-4:
        best_val_loss = vl_loss
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
        if epochs_no_improve > patience:
            print(f"⏹ Early stopping at epoch {epoch}")
            break

# After training, print detailed validation report
print("\nValidation Set Results:")
evaluate_and_report(model_raw, val_loader, device)

Epoch 01 | Train Loss: 1.4304, Acc: 0.437 | Val   Loss: 0.9086, Acc: 0.120 | Time: 18.7s
Epoch 02 | Train Loss: 1.3113, Acc: 0.505 | Val   Loss: 0.9614, Acc: 0.118 | Time: 18.1s
Epoch 03 | Train Loss: 1.2123, Acc: 0.543 | Val   Loss: 0.9655, Acc: 0.114 | Time: 13.3s
Epoch 04 | Train Loss: 1.1359, Acc: 0.582 | Val   Loss: 0.9930, Acc: 0.114 | Time: 12.9s
⏹ Early stopping at epoch 4


# Task 3 regularization

In [27]:
# ── Cell 13 (replace your old Spec-regularization sweep) ──

import itertools
import torch.optim as optim

results = []
for do, wd in itertools.product([0.1,0.3,0.5], [0.0,1e-4,1e-3]):
    # instantiate
    m = SpectrogramCNN(num_classes=5, dropout=do).to(device)
    opt = optim.Adam(m.parameters(), lr=1e-3, weight_decay=wd)
    sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=1)
    # train w/ early stop
    best, no_imp = float('inf'), 0
    for _ in range(5):
        train_one_epoch(m, spec_train_loader, opt, device)
        vl_loss, vl_acc = evaluate(m, spec_val_loader, device)
        sched.step(vl_loss)
        if vl_loss < best - 1e-4:
            best, no_imp = vl_loss, 0
        else:
            no_imp += 1
            if no_imp > 2: break
    results.append({'dropout':do, 'wd':wd, 'val_acc':vl_acc})
    print(f"do={do}, wd={wd} → val_acc={vl_acc:.3f}")

import pandas as pd
print(pd.DataFrame(results))

do=0.1, wd=0.0 → val_acc=0.622
do=0.1, wd=0.0001 → val_acc=0.608
do=0.1, wd=0.001 → val_acc=0.485
do=0.3, wd=0.0 → val_acc=0.532
do=0.3, wd=0.0001 → val_acc=0.551
do=0.3, wd=0.001 → val_acc=0.551
do=0.5, wd=0.0 → val_acc=0.474
do=0.5, wd=0.0001 → val_acc=0.539
do=0.5, wd=0.001 → val_acc=0.458
   dropout      wd   val_acc
0      0.1  0.0000  0.622433
1      0.1  0.0001  0.608215
2      0.1  0.0010  0.484992
3      0.3  0.0000  0.532385
4      0.3  0.0001  0.551343
5      0.3  0.0010  0.551343
6      0.5  0.0000  0.473934
7      0.5  0.0001  0.538705
8      0.5  0.0010  0.458136


In [29]:
# Cell 14 (updated): Train RawCNN with Best Regularization & Early Stopping
# — now evaluating on val_loader —

import torch.optim as optim

device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# Hyperparams from SpectrogramCNN grid: dropout=0.1, weight_decay=0.0 (best val_acc=0.622)
model_raw     = RawCNN(num_classes=5, dropout=0.1).to(device)
optimizer_raw = optim.Adam(model_raw.parameters(), lr=1e-3, weight_decay=0.0)
scheduler_raw = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_raw, mode='min', factor=0.5, patience=1
)

num_epochs   = 5
patience     = 2
best_val     = float('inf')
no_improve   = 0

for epoch in range(1, num_epochs + 1):
    t0 = time.time()
    # Train on the training split
    tr_loss, tr_acc, tr_time = train_one_epoch(
        model_raw, train_loader, optimizer_raw, device
    )
    # Validate on the validation split
    vl_loss, vl_acc = evaluate(
        model_raw, val_loader, device
    )
    scheduler_raw.step(vl_loss)
    dt = time.time() - t0

    print(f"[RawCNN] Epoch {epoch:02d} | "
          f"Train Loss: {tr_loss:.4f}, Acc: {tr_acc:.3f} | "
          f"Val   Loss: {vl_loss:.4f}, Acc: {vl_acc:.3f} | "
          f"Time: {dt:.1f}s")

    # Early stopping
    if vl_loss < best_val - 1e-4:
        best_val, no_improve = vl_loss, 0
    else:
        no_improve += 1
        if no_improve > patience:
            print(f"⏹ Early stopping at epoch {epoch}")
            break

print(f"✔️ Finished RawCNN training: best val_acc = {vl_acc:.3f}")

# Final detailed report on the validation set
print("\nValidation Set Metrics for RawCNN:")
evaluate_and_report(model_raw, val_loader, device)

[RawCNN] Epoch 01 | Train Loss: 1.5761, Acc: 0.292 | Val   Loss: 1.5535, Acc: 0.344 | Time: 57.1s
[RawCNN] Epoch 02 | Train Loss: 1.5136, Acc: 0.379 | Val   Loss: 1.4898, Acc: 0.423 | Time: 36.8s
[RawCNN] Epoch 03 | Train Loss: 1.4669, Acc: 0.415 | Val   Loss: 1.4558, Acc: 0.414 | Time: 45.7s
[RawCNN] Epoch 04 | Train Loss: 1.4131, Acc: 0.438 | Val   Loss: 1.4052, Acc: 0.415 | Time: 39.0s
[RawCNN] Epoch 05 | Train Loss: 1.3682, Acc: 0.460 | Val   Loss: 1.3428, Acc: 0.482 | Time: 33.9s
✔️ Finished RawCNN training: best val_acc = 0.482

Validation Set Metrics for RawCNN:
Classification Report (Accents):
              precision    recall  f1-score   support

    Accent 1       0.60      0.96      0.74       146
    Accent 2       0.45      0.20      0.27       126
    Accent 3       0.75      0.03      0.05       113
    Accent 4       0.39      0.79      0.52       156
    Accent 5       0.61      0.15      0.24        92

    accuracy                           0.48       633
   macro av

In [None]:
# # Cell 15: Detailed Performance Evaluation by Accent & Gender (fully self-contained)

# def evaluate_and_report(model, loader, device):
#     import numpy as np
#     from sklearn.metrics import classification_report, confusion_matrix

#     model.eval()
#     all_preds, all_labels, all_genders = [], [], []
#     with torch.no_grad():
#         for waves, lengths, accents, genders in loader:
#             waves = waves.to(device)
#             logits = model(waves)
#             preds = logits.argmax(dim=1).cpu().numpy()
#             all_preds.append(preds)
#             all_labels.append(accents.numpy())
#             all_genders.append(genders.numpy())

#     all_preds   = np.concatenate(all_preds)
#     all_labels  = np.concatenate(all_labels)
#     all_genders = np.concatenate(all_genders)

#     labels       = [0, 1, 2, 3, 4]
#     target_names = [f"Accent {i}" for i in range(1, 6)]

#     print("Classification Report (Accents):")
#     print(classification_report(
#         all_labels,
#         all_preds,
#         labels=labels,
#         target_names=target_names,
#         zero_division=0
#     ))

#     print("\nConfusion Matrix:")
#     print(confusion_matrix(all_labels, all_preds, labels=labels))

#     print("\nGender-specific accuracy:")
#     for val, name in zip([0, 1], ["Male", "Female"]):
#         idxs = all_genders == val
#         count = idxs.sum()
#         if count > 0:
#             acc = (all_preds[idxs] == all_labels[idxs]).mean()
#             print(f"  {name}: {acc:.3f} ({count} samples)")
#         else:
#             print(f"  {name}: N/A ({count} samples)")

# # Run it:
# evaluate_and_report(model_raw, test_loader, device)

ValueError: Unrecognized gender '0'

In [31]:
# Cell 16: Define SpecAugment masks
from torchaudio.transforms import FrequencyMasking, TimeMasking

freq_mask = FrequencyMasking(freq_mask_param=15)  # mask up to 15 mel bins
time_mask = TimeMasking(time_mask_param=35)       # mask up to 35 time frames

In [32]:
# Cell 17: Build SpecAugmented train loader (val loader stays un-augmented)
class SpecAugmentedDataset(SpectrogramDataset):
    def __init__(self, data_dir, mel_transform, masks):
        super().__init__(data_dir, mel_transform)
        self.masks = masks
    def __getitem__(self, idx):
        spec, accent, gender = super().__getitem__(idx)
        for m in self.masks:
            spec = m(spec)
        return spec, accent, gender

# Create dataset & loader on the TRAIN split indices
from torch.utils.data import Subset
spec_full_ds      = SpectrogramDataset(TRAIN_DIR, mel_transform)
spec_aug_train_ds = Subset(spec_full_ds, train_ds.indices)
spec_aug_train_ds.dataset = SpecAugmentedDataset(TRAIN_DIR, mel_transform, [freq_mask, time_mask])
spec_aug_train_loader = DataLoader(
    spec_aug_train_ds, batch_size=16, shuffle=True,
    num_workers=0, pin_memory=False, collate_fn=spectrogram_collate_fn
)

# Sanity check
batch = next(iter(spec_aug_train_loader))
print("Augmented spec batch:", batch[0].shape, "lengths:", batch[1].shape)

Augmented spec batch: torch.Size([16, 1, 64, 550]) lengths: torch.Size([16])


In [34]:
# Cell 18 (fixed): One‐batch train/eval with SpecAugment

model_sa = SpectrogramCNN(num_classes=5, dropout=0.1).to(device)
opt_sa   = torch.optim.Adam(model_sa.parameters(), lr=1e-3, weight_decay=0.0)

# Train on SpecAugmented train split
train_loss_sa, train_acc_sa, _ = train_one_epoch(model_sa, spec_aug_train_loader, opt_sa, device)
# Validate on SpecAugmented val split
val_loss_sa,   val_acc_sa       = evaluate(model_sa, spec_val_loader, device)

print(f"SpecAugment → train_acc: {train_acc_sa:.3f}, val_acc: {val_acc_sa:.3f}")

SpecAugment → train_acc: 0.308, val_acc: 0.340


In [35]:
# Cell 19 (fixed): Full 5‐epoch loop w/ early stopping (SpecAugment → spec_val_loader)

scheduler_sa = torch.optim.lr_scheduler.ReduceLROnPlateau(opt_sa, mode='min', factor=0.5, patience=1)
best_val, no_improve = float('inf'), 0

for ep in range(1, 6):
    tr_loss, tr_acc, _ = train_one_epoch(model_sa, spec_aug_train_loader, opt_sa, device)
    vl_loss, vl_acc    = evaluate(model_sa, spec_val_loader, device)
    scheduler_sa.step(vl_loss)
    print(f"Epoch {ep:02d} | train_acc: {tr_acc:.3f} | val_acc: {vl_acc:.3f}")
    if vl_loss < best_val - 1e-4:
        best_val, no_improve = vl_loss, 0
    else:
        no_improve += 1
        if no_improve > 2:
            print("⏹ Early stopping")
            break

print("\nValidation Set Metrics for SpecAugment:")
evaluate_and_report(model_sa, spec_val_loader, device)

Epoch 01 | train_acc: 0.379 | val_acc: 0.336
Epoch 02 | train_acc: 0.437 | val_acc: 0.401
Epoch 03 | train_acc: 0.465 | val_acc: 0.419
Epoch 04 | train_acc: 0.485 | val_acc: 0.414
Epoch 05 | train_acc: 0.510 | val_acc: 0.529

Validation Set Metrics for SpecAugment:
Classification Report (Accents):
              precision    recall  f1-score   support

    Accent 1       0.94      0.80      0.86       146
    Accent 2       0.32      0.97      0.48       126
    Accent 3       0.00      0.00      0.00       113
    Accent 4       0.94      0.53      0.67       156
    Accent 5       0.33      0.15      0.21        92

    accuracy                           0.53       633
   macro avg       0.51      0.49      0.45       633
weighted avg       0.56      0.53      0.49       633


Confusion Matrix:
[[117  23   0   0   6]
 [  2 122   0   0   2]
 [  1  98   0   0  14]
 [  0  67   0  82   7]
 [  5  68   0   5  14]]

Gender-specific accuracy:
  Male: 0.475 (295 samples)
  Female: 0.577 (338 s

In [48]:
# Cell 20a: Which classes does RawCNN predict on the raw‐wave validation set?

import numpy as np

model_raw.eval()
raw_preds = []

with torch.no_grad():
    for waves, lengths, accents, genders in val_loader:
        waves = waves.to(device)
        logits = model_raw(waves)         # RawCNN expects [B, T]
        preds = logits.argmax(dim=1).cpu().numpy()
        raw_preds.extend(preds)

raw_preds = np.array(raw_preds)
print("RawCNN predicted classes (0-indexed):", np.unique(raw_preds))
print("Counts:", np.bincount(raw_preds, minlength=5))

RawCNN predicted classes (0-indexed): [0 1 2 3 4]
Counts: [234  56   4 316  23]


In [49]:
# Cell 20b: Which classes does SpectrogramCNN predict on the spec‐val set?

model_sa.eval()
spec_preds = []

with torch.no_grad():
    for specs, lengths, accents, genders in spec_val_loader:
        specs = specs.to(device)
        logits = model_sa(specs)          # SpecCNN expects [B,1,n_mels,T]
        preds = logits.argmax(dim=1).cpu().numpy()
        spec_preds.extend(preds)

spec_preds = np.array(spec_preds)
print("SpecCNN predicted classes (0-indexed):", np.unique(spec_preds))
print("Counts:", np.bincount(spec_preds, minlength=5))

SpecCNN predicted classes (0-indexed): [0 1 3 4]
Counts: [125 378   0  87  43]


---------

# Competition submission

In [40]:
# Cell X: Define a custom collate_fn for the Test set
from torch.nn.utils.rnn import pad_sequence
import torch

def test_collate_fn(batch):
    """
    Pads a batch of variable‐length waveforms and returns:
      - padded_waves: Tensor [B, T_max]
      - lengths      : LongTensor [B]
      - filenames    : list[str] of length B
    """
    waves, fnames = zip(*batch)
    lengths = torch.tensor([w.shape[0] for w in waves], dtype=torch.long)
    padded = pad_sequence(waves, batch_first=True)  # pads with zeros
    return padded, lengths, fnames

# Re-create your TestAudioDataset (no labels) and DataLoader with the new collate_fn
test_ds     = TestAudioDataset(TEST_DIR)
test_loader = DataLoader(
    test_ds,
    batch_size=32,
    shuffle=False,
    num_workers=0,
    pin_memory=False,
    collate_fn=test_collate_fn
)

In [43]:

# Cell Y: Generate submission.csv from SpectrogramCNN

model_final = model_sa  # your SpecAugment‐trained SpectrogramCNN
model_final.eval()

preds, files = [], []
with torch.no_grad():
    for specs, filenames in spec_test_loader:
        specs = specs.to(device)             # [B,1,n_mels,T]
        logits = model_final(specs)          # OK: 4D input
        batch_preds = logits.argmax(dim=1).cpu().tolist()
        preds.extend([p+1 for p in batch_preds])  # convert to 1–5
        files.extend(filenames)

import pandas as pd
df = pd.DataFrame({"filename": files, "accent": preds})
df.to_csv("submission.csv", index=False)
print(f"✅ submission.csv created with {len(df)} entries.")

ValueError: too many values to unpack (expected 2)

In [45]:
# Cell Z-1: SpectrogramTestDataset & correct collate_fn

import os
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio

class SpectrogramTestDataset(Dataset):
    """
    Loads .wav files and returns log-mel-spectrogram + filename.
    """
    def __init__(self, data_dir, mel_transform):
        self.paths = sorted(
            os.path.join(data_dir, f)
            for f in os.listdir(data_dir) if f.endswith('.wav')
        )
        self.mel = mel_transform

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        path = self.paths[idx]
        wav, sr = torchaudio.load(path)
        if sr != 16000:
            wav = torchaudio.functional.resample(wav, sr, 16000)
        wav = (wav - wav.mean()) / (wav.std() + 1e-9)
        spec = self.mel(wav)             # [1, n_mels, T]
        spec = torch.log(spec + 1e-9)
        return spec, os.path.basename(path)

def spec_test_collate_fn(batch):
    """
    Pads spectrograms in time to the max length in this batch.
    Returns:
      - padded_specs: Tensor [B,1,n_mels,T_max]
      - filenames    : list[str]
    """
    specs, fnames = zip(*batch)
    # find max time dim
    T_max = max(s.size(-1) for s in specs)
    # pad each spec to T_max
    padded = [
        torch.nn.functional.pad(s, (0, T_max - s.size(-1)))
        for s in specs
    ]
    padded = torch.stack(padded, dim=0)
    return padded, fnames

# instantiate loader
spec_test_ds     = SpectrogramTestDataset(TEST_DIR, mel_transform)
spec_test_loader = DataLoader(
    spec_test_ds,
    batch_size=32,
    shuffle=False,
    num_workers=0,
    pin_memory=False,
    collate_fn=spec_test_collate_fn
)
print("SpecTest batch:", next(iter(spec_test_loader))[0].shape)  # should be [B,1,n_mels,T]

SpecTest batch: torch.Size([32, 1, 64, 587])


In [46]:
# Cell Z-2: Generate submission.csv from SpectrogramCNN

import csv

model_final = model_sa  # or model_raw
model_final.eval()

files, accents = [], []
with torch.no_grad():
    for specs, fnames in spec_test_loader:
        specs = specs.to(device)
        logits = model_final(specs)            # shape [B,5]
        preds = logits.argmax(dim=1).cpu().tolist()
        # convert to 1–5
        accents.extend([p+1 for p in preds])
        files.extend(fnames)

# write CSV
with open("submission.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "accent"])
    for fn, ac in zip(files, accents):
        writer.writerow([fn, ac])

print(f"✅ submission.csv written: {len(accents)} rows")

✅ submission.csv written: 551 rows
