# Deep Learning Project

## Task 1: Raw-Signal Model Comparison

### 1. Dataset Definition



In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import os


In [10]:
# 1. Select device
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: mps


In [11]:
class AccentWaveformDataset(Dataset):
    def __init__(self, tensor_path, file_list, max_len=16000):
        """
        tensor_path: path to audio_tensors.pt (dict: filename -> tensor)
        file_list: list of filenames (with extensions) from Train/ or Test set/
        max_len: fixed waveform length for batching
        """
        self.data = torch.load(tensor_path)  # {filename: Tensor([L])}
        self.file_list = file_list
        self.max_len = max_len

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        fn = self.file_list[idx]
        waveform = self.data[fn]
        # Pad or trim to max_len
        if waveform.size(0) < self.max_len:
            pad = self.max_len - waveform.size(0)
            waveform = F.pad(waveform, (0, pad))
        else:
            waveform = waveform[:self.max_len]
        # Standardize
        waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-6)

        # Parse label and gender: filename like "3f_speakerX.wav"
        accent = int(fn[0]) - 1      # accents 0–4
        gender = 1 if fn[1] == 'm' else 0

        return waveform.unsqueeze(0), accent, gender

### 2. Dataloader setup


In [12]:
from torch.utils.data import DataLoader

def make_loaders(tensor_path, train_dir="Train", test_dir="Test set", batch_size=32):
    # List only .wav (or tensor) files in each folder
    train_files = [f for f in os.listdir(train_dir) if f.endswith(".wav") or f.endswith(".pt")]
    test_files  = [f for f in os.listdir(test_dir) if f.endswith(".wav") or f.endswith(".pt")]

    train_ds = AccentWaveformDataset(tensor_path, train_files)
    test_ds  = AccentWaveformDataset(tensor_path, test_files)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)
    test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, num_workers=4)

    return train_loader, test_loader

# Example:
# train_loader, test_loader = make_loaders("audio_tensors.pt")


### 3. 1D CNN

In [13]:
import torch.nn as nn
import torch.nn.functional as F

class CNN1D(nn.Module):
    def __init__(self, num_classes=5):
        super().__init__()
        self.conv_block1 = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=9, padding=4),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.MaxPool1d(4),
        )
        self.conv_block2 = nn.Sequential(
            nn.Conv1d(16, 32, kernel_size=7, padding=3),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(4),
        )
        self.conv_block3 = nn.Sequential(
            nn.Conv1d(32, 64, kernel_size=5, padding=2),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(4),
        )
        self.flatten_dim = (16000 // (4*4*4)) * 64
        self.fc1 = nn.Linear(self.flatten_dim, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.conv_block1(x)
        x = self.conv_block2(x)
        x = self.conv_block3(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)


### 4. GRU model definition

In [14]:
class GRUModel(nn.Module):
    def __init__(self, num_classes=5, hidden_size=128, n_layers=1):
        super().__init__()
        self.frontend = nn.Conv1d(1, 16, kernel_size=9, padding=4, stride=4)
        self.gru = nn.GRU(input_size=16, hidden_size=hidden_size,
                          num_layers=n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = self.frontend(x)           # (batch,16,L/4)
        x = x.permute(0, 2, 1)         # (batch,seq,feat)
        _, h_n = self.gru(x)
        return self.fc(h_n[-1])


In [15]:
# 2. Instantiate and move models to the selected device
cnn = CNN1D().to(device)
rnn = GRUModel().to(device)


### 6. Verifying instantiation

In [16]:
if __name__ == "__main__":
    dummy = torch.randn(8, 1, 16000).to(device)  # move dummy to device too
    assert cnn(dummy).shape == (8, 5)
    assert rnn(dummy).shape == (8, 5)
    print("✅ Models load and run correctly on", device)


✅ Models load and run correctly on mps


### 7. Train and eval functions

In [17]:
import torch.optim as optim
from sklearn.metrics import accuracy_score

def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss, preds, targets = 0, [], []
    for wave, accent, _ in loader:
        wave, accent = wave.to(device), accent.to(device)
        optimizer.zero_grad()
        logits = model(wave)
        loss   = criterion(logits, accent)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * wave.size(0)
        preds.extend(logits.argmax(dim=1).cpu().tolist())
        targets.extend(accent.cpu().tolist())

    avg_loss = total_loss / len(loader.dataset)
    acc      = accuracy_score(targets, preds)
    return avg_loss, acc

def eval_epoch(model, loader, criterion):
    model.eval()
    total_loss, preds, targets = 0, [], []
    with torch.no_grad():
        for wave, accent, _ in loader:
            wave, accent = wave.to(device), accent.to(device)
            logits = model(wave)
            loss   = criterion(logits, accent)

            total_loss += loss.item() * wave.size(0)
            preds.extend(logits.argmax(dim=1).cpu().tolist())
            targets.extend(accent.cpu().tolist())

    avg_loss = total_loss / len(loader.dataset)
    acc      = accuracy_score(targets, preds)
    return avg_loss, acc


### 8. Training loop

In [18]:
# Hyperparameters
lr         = 1e-3
weight_dec = 1e-4
epochs     = 10
batch_size = 32

# Data
train_loader, test_loader = make_loaders("audio_tensors.pt", batch_size=batch_size)

# Criterion & Optimizers
criterion = nn.CrossEntropyLoss()
opt_cnn   = optim.Adam(cnn.parameters(),   lr=lr, weight_decay=weight_dec)
opt_rnn   = optim.Adam(rnn.parameters(),   lr=lr, weight_decay=weight_dec)

# Run training
for name, model, optimizer in [("CNN1D", cnn, opt_cnn), ("GRUModel", rnn, opt_rnn)]:
    print(f"\n=== Training {name} ===")
    for epoch in range(1, epochs+1):
        tr_loss, tr_acc = train_epoch(model, train_loader, optimizer, criterion)
        va_loss, va_acc = eval_epoch(model, test_loader,  criterion)
        print(f"Epoch {epoch:02d} | Train loss {tr_loss:.3f}, acc {tr_acc:.3f} | "
              f"Test loss {va_loss:.3f}, acc {va_acc:.3f}")



=== Training CNN1D ===


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/bramdewaal/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bramdewaal/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'AccentWaveformDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [19]:
# Right after you create train_loader:
print("Number of training samples:", len(train_loader.dataset))


Number of training samples: 3166


In [20]:
import time

# Grab one batch
wave, accent, _ = next(iter(train_loader))
wave, accent = wave.to(device), accent.to(device)

# Time one training update
t0 = time.time()
optimizer.zero_grad()
logits = cnn(wave)               # or rnn(wave) to test the other model
loss   = criterion(logits, accent)
loss.backward()
optimizer.step()
print("One batch update took", time.time() - t0, "seconds")


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/bramdewaal/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bramdewaal/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'AccentWaveformDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 

In [None]:
import time
t0 = time.time()
tr_loss, tr_acc = train_epoch(cnn, train_loader, opt_cnn, criterion)  # or rnn
print("Epoch time:", time.time() - t0, "seconds")


In [22]:
import os, time
from torch.utils.data import DataLoader

# Recreate a minimal loader: no workers, tiny batch
train_files = [f for f in os.listdir("Train") if f.endswith((".wav", ".pt"))]
debug_loader = DataLoader(
    AccentWaveformDataset("audio_tensors.pt", train_files),
    batch_size=8,
    shuffle=True,
    num_workers=0  # important!
)

# Grab one batch
wave, accent, _ = next(iter(debug_loader))
wave, accent = wave.to(device), accent.to(device)

# Time one forward+backward+step
t0 = time.time()
opt_cnn.zero_grad()
logits = cnn(wave)
loss   = criterion(logits, accent)
loss.backward()
opt_cnn.step()
print("One batch update took", time.time() - t0, "seconds")


TypeError: list indices must be integers or slices, not str