# Complete Fixed Notebook
This notebook fully implements Tasks 1–9 with all fixes.

## Imports & Helpers

In [1]:
import os, time, random, csv
from glob import glob
import torch
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as T
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import classification_report, confusion_matrix


## Task 1: Data Preparation

In [2]:
# Directories
TRAIN_DIR = "Train"
TEST_DIR  = "Test"

# AccentAudioDataset
class AccentAudioDataset(Dataset):
    def __init__(self, data_dir):
        self.paths = sorted(glob(os.path.join(data_dir,'*.wav')))
        if not self.paths: raise RuntimeError(f"No .wav in {data_dir}")
    def __len__(self): return len(self.paths)
    def __getitem__(self, idx):
        p = self.paths[idx]
        w, sr = torchaudio.load(p)
        if sr!=16000: w = torchaudio.functional.resample(w, sr, 16000)
        w = (w - w.mean())/(w.std()+1e-9)
        fname = os.path.basename(p)
        accent = int(fname[0]) - 1
        gender = 0 if fname[1].lower()=='m' else 1
        return w.squeeze(0), accent, gender

def collate_fn(batch):
    waves, accents, genders = zip(*batch)
    lengths = torch.tensor([w.size(0) for w in waves])
    padded = pad_sequence(waves, batch_first=True)
    return padded, lengths, torch.tensor(accents), torch.tensor(genders)

# Split
full_ds = AccentAudioDataset(TRAIN_DIR)
n_tot = len(full_ds)
n_val = int(0.2*n_tot); n_train = n_tot - n_val
train_ds, val_ds = random_split(full_ds, [n_train, n_val], generator=torch.Generator().manual_seed(42))
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True,  num_workers=0, pin_memory=False, collate_fn=collate_fn)
val_loader   = DataLoader(val_ds,   batch_size=32, shuffle=False, num_workers=0, pin_memory=False, collate_fn=collate_fn)

print("Train/Val:", n_train, n_val)

Train/Val: 2533 633


## Task 2: RawCNNv2 Model

In [3]:
class RawCNNv2(nn.Module):
    def __init__(self,num_classes=5,dropout=0.2):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv1d(1,32,7,stride=2,padding=3),
            nn.BatchNorm1d(32), nn.ReLU(True), nn.MaxPool1d(2),
            nn.Conv1d(32,64,5,stride=2,padding=2),
            nn.BatchNorm1d(64), nn.ReLU(True), nn.MaxPool1d(2),
            nn.Conv1d(64,128,3,stride=2,padding=1),
            nn.BatchNorm1d(128), nn.ReLU(True), nn.MaxPool1d(2),
            nn.AdaptiveAvgPool1d(1), nn.Flatten(),
            nn.Dropout(dropout), nn.Linear(128,num_classes)
        )
    def forward(self,x):
        return self.net(x.unsqueeze(1))

## Task 3: Waveform Augmentation

In [4]:
class AddNoise:
    def __call__(self,w):
        rms=w.pow(2).mean().sqrt(); return w+torch.randn_like(w)*rms*0.05
class RandomShift:
    def __call__(self,w):
        return torch.roll(w, shifts=random.randint(-1000,1000))
class WaveAugDataset(AccentAudioDataset):
    def __init__(self,d,augs): super().__init__(d); self.augs=augs
    def __getitem__(self,i):
        w,a,g=super().__getitem__(i)
        for aug in self.augs: w=aug(w)
        w=(w-w.mean())/(w.std()+1e-9)
        return w,a,g

aug_loader = DataLoader(WaveAugDataset(TRAIN_DIR,[AddNoise(),RandomShift()]),
    batch_size=32,shuffle=True,num_workers=0,pin_memory=False,collate_fn=collate_fn)
print("Aug loader:", len(aug_loader.dataset))

Aug loader: 3166


## Task 4: Train RawCNNv2 with OneCycleLR & Checkpointing

In [5]:
# Cell X: Redefine helper functions without re-importing

def train_one_epoch(model, loader, optimizer, device):
    model.train()
    total_loss = 0.0
    correct = 0
    total = 0
    start_time = time.time()
    
    for inputs, lengths, accents, genders in loader:
        inputs = inputs.to(device)
        accents = accents.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = F.cross_entropy(outputs, accents)
        loss.backward()
        optimizer.step()
        
        batch = inputs.size(0)
        total_loss += loss.item() * batch
        preds = outputs.argmax(dim=1)
        correct += (preds == accents).sum().item()
        total += batch
    
    return total_loss/total, correct/total, time.time() - start_time

def evaluate(model, loader, device):
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for inputs, lengths, accents, genders in loader:
            inputs = inputs.to(device)
            accents = accents.to(device)
            outputs = model(inputs)
            loss = F.cross_entropy(outputs, accents)
            
            batch = inputs.size(0)
            total_loss += loss.item() * batch
            preds = outputs.argmax(dim=1)
            correct += (preds == accents).sum().item()
            total += batch
    
    return total_loss/total, correct/total

print("✅ Helper functions redefined (no imports).")

✅ Helper functions redefined (no imports).


In [6]:
device = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
model_raw = RawCNNv2().to(device)
opt_raw = torch.optim.Adam(model_raw.parameters(), lr=1e-3, weight_decay=1e-4)
sched_raw = torch.optim.lr_scheduler.OneCycleLR(opt_raw, max_lr=1e-3, total_steps=50, pct_start=0.3)

best_acc, best_ep = 0,0
for ep in range(1,51):
    tr_loss,tr_acc,_=train_one_epoch(model_raw,aug_loader,opt_raw,device)
    vl_loss,vl_acc  =evaluate(model_raw,val_loader,device)
    sched_raw.step()
    if vl_acc>best_acc:
        best_acc,best_ep=vl_acc,ep; torch.save(model_raw.state_dict(),'best_rawcnn.pt')
    print(f"Epoch {ep:02d} | tr_acc {tr_acc:.3f} | val_acc {vl_acc:.3f}")
print("Best RawCNN2:",best_acc,"@ ep",best_ep)

Epoch 01 | tr_acc 0.210 | val_acc 0.264
Epoch 02 | tr_acc 0.260 | val_acc 0.302
Epoch 03 | tr_acc 0.317 | val_acc 0.303
Epoch 04 | tr_acc 0.369 | val_acc 0.362
Epoch 05 | tr_acc 0.401 | val_acc 0.445
Epoch 06 | tr_acc 0.423 | val_acc 0.346
Epoch 07 | tr_acc 0.444 | val_acc 0.442
Epoch 08 | tr_acc 0.457 | val_acc 0.409
Epoch 09 | tr_acc 0.472 | val_acc 0.392
Epoch 10 | tr_acc 0.474 | val_acc 0.374
Epoch 11 | tr_acc 0.489 | val_acc 0.472
Epoch 12 | tr_acc 0.494 | val_acc 0.547
Epoch 13 | tr_acc 0.513 | val_acc 0.561
Epoch 14 | tr_acc 0.529 | val_acc 0.415
Epoch 15 | tr_acc 0.534 | val_acc 0.370
Epoch 16 | tr_acc 0.546 | val_acc 0.455
Epoch 17 | tr_acc 0.538 | val_acc 0.534
Epoch 18 | tr_acc 0.562 | val_acc 0.532
Epoch 19 | tr_acc 0.563 | val_acc 0.384
Epoch 20 | tr_acc 0.559 | val_acc 0.477
Epoch 21 | tr_acc 0.569 | val_acc 0.518
Epoch 22 | tr_acc 0.572 | val_acc 0.479
Epoch 23 | tr_acc 0.580 | val_acc 0.483
Epoch 24 | tr_acc 0.591 | val_acc 0.379
Epoch 25 | tr_acc 0.603 | val_acc 0.624


## Task 5: Evaluate RawCNNv2

In [7]:
model_raw.load_state_dict(torch.load('best_rawcnn.pt'))
evaluate(model_raw,val_loader,device)

(0.8114912832894408, 0.7251184834123223)

## Task 6: Spectrogram Dataset and Collate

In [None]:
# SpectrogramDataset & collate (3-tuple)
class SpectrogramDataset(Dataset):
    def __init__(self,d,tf): self.paths=sorted(glob(os.path.join(d,'*.wav'))); self.tf=tf
    def __len__(self): return len(self.paths)
    def __getitem__(self,i):
        p=self.paths[i]; w,sr=torchaudio.load(p)
        if sr!=16000: w=torchaudio.functional.resample(w,sr,16000)
        w=(w-w.mean())/(w.std()+1e-9)
        spec=self.tf(w); spec=torch.log(spec+1e-9)
        fname=os.path.basename(p); a=int(fname[0])-1; g=0 if fname[1].lower()=='m' else 1
        return spec,a,g

def spec_collate(batch):
    specs,accs,gens=zip(*batch)
    T_max=max(s.size(2) for s in specs)
    padded=[torch.nn.functional.pad(s,(0,T_max-s.size(2))) for s in specs]
    return torch.stack(padded,0), torch.tensor(accs), torch.tensor(gens)



mel_tf = T.MelSpectrogram(sample_rate=16000, n_mels=64)
spec_full  = SpectrogramDataset(TRAIN_DIR, mel_tf)
spec_train = Subset(spec_full, train_ds.indices)
spec_val   = Subset(spec_full, val_ds.indices)

spec_train_loader = DataLoader(
    spec_train,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    pin_memory=False,
    collate_fn=spec_collate
)
spec_val_loader = DataLoader(
    spec_val,
    batch_size=32,
    shuffle=False,
    num_workers=0,
    pin_memory=False,
    collate_fn=spec_collate
)

print("Spec loaders:", len(spec_train), len(spec_val))

Spec loaders: 2533 633


## Task 7: SpectrogramCNN & SpecAugment Training

## Task 8: Generate Submission CSV

In [11]:
# ─── Cell PREP-0: Define SpecAugDataset ───

from torchaudio.transforms import FrequencyMasking, TimeMasking

# 1) Create the two masks once
freq_mask = FrequencyMasking(freq_mask_param=15)
time_mask = TimeMasking(time_mask_param=35)

# 2) Inherit from your existing SpectrogramDataset
class SpecAugDataset(SpectrogramDataset):
    """
    Same as SpectrogramDataset but applies SpecAugment (freq & time masking).
    """
    def __init__(self, data_dir, transform):
        super().__init__(data_dir, transform)
        self.freq_mask = freq_mask
        self.time_mask = time_mask

    def __getitem__(self, idx):
        spec, accent, gender = super().__getitem__(idx)
        # apply masking in-place
        spec = self.freq_mask(spec)
        spec = self.time_mask(spec)
        return spec, accent, gender

print("✅ SpecAugDataset defined.")

✅ SpecAugDataset defined.


In [13]:
# ─── Cell PREP-1: Define SpectrogramCNN ───

import torch
import torch.nn as nn

class SpectrogramCNN(nn.Module):
    """
    2D CNN for log‐Mel spectrograms (5 accents).
    """
    def __init__(self, num_classes=5, dropout=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),

            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2),

            nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten(),
            nn.Dropout(dropout),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        # x: [B, 1, F, T]
        return self.net(x)

print("✅ SpectrogramCNN defined.")

✅ SpectrogramCNN defined.


In [14]:
# Cell PREP: Redefine Missing Variables & Functions for SpectrogramCNN Pipeline (Fixed)

import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchaudio.transforms import FrequencyMasking, TimeMasking
from torch.utils.data import DataLoader, Subset, WeightedRandomSampler
import numpy as np

# Device
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")

# 1) Build your augmented‐spectrogram training subset
#    (SpecAugDataset, TRAIN_DIR, mel_tf and train_ds must already be defined)
spec_aug_full  = SpecAugDataset(TRAIN_DIR, mel_tf)
spec_aug_train = Subset(spec_aug_full, train_ds.indices)

# 2) Compute per‐class counts & inverse‐frequency weights
train_labels   = [label for _, label, _ in spec_aug_train]
train_counts   = np.bincount(train_labels, minlength=5)
class_weights  = 1.0 / train_counts

# 3) Normalize and move to device
weights = torch.tensor(class_weights, dtype=torch.float, device=device)
weights = weights / weights.sum() * len(weights)

# 4) Create weighted CrossEntropyLoss
criterion = nn.CrossEntropyLoss(weight=weights)

# 5) Spec‐specific train/eval helpers
def train_one_epoch_spec(model, loader, optimizer, device):
    model.train()
    total_loss = correct = total = 0
    start = time.time()
    for specs, accents, genders in loader:
        specs, accents = specs.to(device), accents.to(device)
        optimizer.zero_grad()
        logits = model(specs)
        loss   = criterion(logits, accents)   # uses our weighted loss
        loss.backward()
        optimizer.step()
        b = specs.size(0)
        total_loss += loss.item() * b
        correct    += (logits.argmax(1) == accents).sum().item()
        total      += b
    return total_loss/total, correct/total, time.time() - start

def evaluate_spec(model, loader, device):
    model.eval()
    total_loss = correct = total = 0
    with torch.no_grad():
        for specs, accents, genders in loader:
            specs, accents = specs.to(device), accents.to(device)
            logits = model(specs)
            loss   = F.cross_entropy(logits, accents)
            b = specs.size(0)
            total_loss += loss.item() * b
            correct    += (logits.argmax(1) == accents).sum().item()
            total      += b
    return total_loss/total, correct/total

# 6) Build your oversampled SpecAug DataLoader
example_weights = [class_weights[l] for l in train_labels]
sampler = WeightedRandomSampler(
    weights     = example_weights,
    num_samples = len(example_weights),
    replacement = True
)
spec_aug_loader = DataLoader(
    spec_aug_train,
    batch_size   = 32,
    sampler      = sampler,        # no shuffle
    num_workers  = 0,
    pin_memory   = False,
    collate_fn   = spec_collate
)

# 7) Validation loader
spec_val_full   = SpectrogramDataset(TRAIN_DIR, mel_tf)
spec_val_subset = Subset(spec_val_full, val_ds.indices)
spec_val_loader = DataLoader(
    spec_val_subset,
    batch_size   = 32,
    shuffle      = False,
    num_workers  = 0,
    pin_memory   = False,
    collate_fn   = spec_collate
)

# 8) Model + Optimizer + Scheduler setup
model_spec     = SpectrogramCNN(num_classes=5, dropout=0.1).to(device)
optimizer_spec = optim.Adam(model_spec.parameters(), lr=1e-3, weight_decay=1e-3)
scheduler_spec = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer_spec, mode='min', factor=0.5, patience=1
)

print("✅ PREP complete: class_counts defined, weighted loss, loaders, model, optimizer & scheduler ready.")

✅ PREP complete: class_counts defined, weighted loss, loaders, model, optimizer & scheduler ready.


In [16]:

import numpy as np
import torch.nn as nn

# 1) Extract all accent labels from your train_ds
#    If you only have a DataLoader, you can iterate it instead.
train_labels = [label for _, label, _ in train_ds]  
# train_labels is now a list of ints in {0,…,4}

# 2) Compute counts per class
train_counts = np.bincount(train_labels, minlength=5)
print("Train‐split counts (support) per accent:", train_counts)

# 3) Build inverse‐frequency weights (use float32 on MPS)
weights = torch.tensor(1.0 / train_counts, dtype=torch.float32, device=device)
# Normalize so average weight ≈ 1
weights = weights / weights.sum() * len(weights)
print("Computed class weights:", weights)

# 4) Create weighted loss
criterion = nn.CrossEntropyLoss(weight=weights)
print("✅ criterion ready with weighted classes")

Train‐split counts (support) per accent: [594 500 451 598 390]
Computed class weights: tensor([0.8304, 0.9865, 1.0936, 0.8248, 1.2647], device='mps:0')
✅ criterion ready with weighted classes


In [17]:
# … after you have spec_aug_loader and spec_val_loader defined …

import torch.nn as nn
import numpy as np

# Compute true class counts on your **train** split, not val.
# Replace these with your train-split support values.
train_counts = np.array([146, 126, 113, 156,  92])  

# Inverse-frequency weights

weights = weights / weights.sum() * len(weights)  

# Create a weighted cross-entropy loss
criterion = nn.CrossEntropyLoss(weight=weights)

In [18]:
# Cell X: Train+Checkpoint → Load Best → Eval → Submission

import os, csv
import torch
import torchaudio
from glob import glob
from torch.utils.data import Dataset, DataLoader, Subset
from torchaudio.transforms import FrequencyMasking, TimeMasking

best_val_acc = 0.0
ckpt_path    = "best_speccnn.pt"

for epoch in range(1, 51):
    tr_loss, tr_acc, _ = train_one_epoch_spec(
        model_spec, spec_aug_loader, optimizer_spec, device
    )
    vl_loss, vl_acc    = evaluate_spec(
        model_spec, spec_val_loader, device
    )
    scheduler_spec.step(vl_loss)

    # checkpoint on improvement
    if vl_acc > best_val_acc + 1e-4:
        best_val_acc = vl_acc
        torch.save(model_spec.state_dict(), ckpt_path)

    print(f"[SpecAug] Epoch {epoch:02d} | "
          f"Train Acc: {tr_acc:.3f} | Val Acc: {vl_acc:.3f}")

print(f"\n✔️ Best val_acc = {best_val_acc:.3f}, saved to {ckpt_path}")

############################################
# 2) Load best weights & final validation
############################################
model_spec.load_state_dict(torch.load(ckpt_path, map_location=device))
model_spec.eval()

final_loss, final_acc = evaluate_spec(model_spec, spec_val_loader, device)
print(f"\nFinal Validation → Loss: {final_loss:.4f}, Acc: {final_acc:.3f}")

############################################
# 3) Build test loader and write submission.csv
############################################
class SpecTestDataset(Dataset):
    def __init__(self, data_dir, transform):
        self.paths = sorted(glob(os.path.join(data_dir, "*.wav")))
        self.transform = transform
    def __len__(self):
        return len(self.paths)
    def __getitem__(self, idx):
        path = self.paths[idx]
        wav, sr = torchaudio.load(path)
        if sr != 16000:
            wav = torchaudio.functional.resample(wav, sr, 16000)
        wav = (wav - wav.mean())/(wav.std()+1e-9)
        spec = self.transform(wav)
        spec = torch.log(spec + 1e-9)
        return spec, os.path.basename(path)

def test_collate(batch):
    specs, fnames = zip(*batch)
    T_max = max(s.size(2) for s in specs)
    padded = [torch.nn.functional.pad(s, (0, T_max-s.size(2))) for s in specs]
    return torch.stack(padded, 0), list(fnames)

# instantiate & run
test_ds     = SpecTestDataset(TEST_DIR, mel_tf)
test_loader = DataLoader(test_ds, batch_size=32, shuffle=False,
                         num_workers=0, pin_memory=False,
                         collate_fn=test_collate)

filenames, accents = [], []
with torch.no_grad():
    for specs, fnames in test_loader:
        specs = specs.to(device)
        preds = model_spec(specs).argmax(dim=1).cpu().tolist()
        filenames.extend(fnames)
        accents.extend([p+1 for p in preds])

# write CSV
with open("submission.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["filename", "accent"])
    writer.writerows(zip(filenames, accents))

print(f"✅ submission.csv created with {len(accents)} rows")

[SpecAug] Epoch 01 | Train Acc: 0.276 | Val Acc: 0.185
[SpecAug] Epoch 02 | Train Acc: 0.302 | Val Acc: 0.327
[SpecAug] Epoch 03 | Train Acc: 0.335 | Val Acc: 0.243
[SpecAug] Epoch 04 | Train Acc: 0.384 | Val Acc: 0.485
[SpecAug] Epoch 05 | Train Acc: 0.403 | Val Acc: 0.363
[SpecAug] Epoch 06 | Train Acc: 0.468 | Val Acc: 0.359
[SpecAug] Epoch 07 | Train Acc: 0.443 | Val Acc: 0.398
[SpecAug] Epoch 08 | Train Acc: 0.490 | Val Acc: 0.510
[SpecAug] Epoch 09 | Train Acc: 0.498 | Val Acc: 0.280
[SpecAug] Epoch 10 | Train Acc: 0.504 | Val Acc: 0.477
[SpecAug] Epoch 11 | Train Acc: 0.521 | Val Acc: 0.621
[SpecAug] Epoch 12 | Train Acc: 0.539 | Val Acc: 0.550
[SpecAug] Epoch 13 | Train Acc: 0.549 | Val Acc: 0.605
[SpecAug] Epoch 14 | Train Acc: 0.554 | Val Acc: 0.499
[SpecAug] Epoch 15 | Train Acc: 0.565 | Val Acc: 0.545
[SpecAug] Epoch 16 | Train Acc: 0.587 | Val Acc: 0.607
[SpecAug] Epoch 17 | Train Acc: 0.585 | Val Acc: 0.664
[SpecAug] Epoch 18 | Train Acc: 0.593 | Val Acc: 0.643
[SpecAug] 

In [19]:
# Cell: Detailed per‐accent breakdown

from sklearn.metrics import classification_report, confusion_matrix

# Make sure evaluate_and_report is defined; if not, re‐define it as:
def evaluate_and_report(model, loader, device):
    import numpy as np
    from sklearn.metrics import classification_report, confusion_matrix

    model.eval()
    all_preds, all_labels, all_genders = [], [], []
    with torch.no_grad():
        for specs, accents, genders in loader:
            specs = specs.to(device)
            logits = model(specs)
            preds = logits.argmax(dim=1).cpu().numpy()
            all_preds.append(preds)
            all_labels.append(accents.numpy())
            all_genders.append(genders.numpy())

    all_preds  = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    print("Classification Report (Accents):")
    print(classification_report(
        all_labels,
        all_preds,
        labels=[0,1,2,3,4],
        target_names=[f"Accent {i}" for i in range(1,6)],
        zero_division=0
    ))
    print("\nConfusion Matrix:")
    print(confusion_matrix(all_labels, all_preds, labels=[0,1,2,3,4]))

# Run it:
evaluate_and_report(model_spec, spec_val_loader, device)

Classification Report (Accents):
              precision    recall  f1-score   support

    Accent 1       0.90      0.90      0.90       146
    Accent 2       0.82      0.71      0.76       126
    Accent 3       0.66      0.67      0.66       113
    Accent 4       0.93      0.66      0.77       156
    Accent 5       0.35      0.58      0.44        92

    accuracy                           0.72       633
   macro avg       0.73      0.70      0.71       633
weighted avg       0.77      0.72      0.73       633


Confusion Matrix:
[[132   1   1   0  12]
 [  1  89   5   0  31]
 [  7   3  76   0  27]
 [  1   3  21 103  28]
 [  6  12  13   8  53]]
