In [1]:
# 1) all the imports you need up front
import os
import pandas as pd
import torchaudio
import torch
import torchaudio.transforms as T
from torch.utils.data import Dataset, DataLoader

# 2) Root directory of the unzipped assignment data (the Train/ folder).
#    Change this path if the data lives somewhere else.
DATA_DIR = "./Train"

def build_metadata(data_dir: str) -> pd.DataFrame:
    """
    Walk through data_dir and collect file paths, accents, and genders.

    File names are expected to start with an accent digit followed by a
    gender letter (e.g. ``1f_1018.wav``).  ``.wav`` files whose first
    character is not a digit are skipped instead of aborting the whole scan.

    Args:
        data_dir: Directory that is searched recursively for ``.wav`` files
            (the extension check is case-insensitive).

    Returns:
        DataFrame with one row per accepted file and the columns ``path``
        (joined root + file name), ``accent`` (int parsed from the first
        character) and ``gender`` (lower-cased second character).  The three
        columns are present even when no file was found.
    """
    columns = ["path", "accent", "gender"]
    records = []
    for root, dirs, files in os.walk(data_dir):
        # Sorting in place fixes the order in which os.walk descends into
        # sub-directories, so the row order is reproducible across runs.
        dirs.sort()
        for fname in sorted(files):
            if not fname.lower().endswith(".wav"):
                continue
            accent_char = fname[0]
            # Only plain ASCII digits are safe to hand to int(); anything
            # else (e.g. "notes.wav") is not part of the labelled data.
            if not ("0" <= accent_char <= "9"):
                continue
            records.append({
                "path":   os.path.join(root, fname),
                "accent": int(accent_char),      # '1'–'5'
                "gender": fname[1].lower(),      # 'm' or 'f'
            })
    # Passing `columns` keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=columns)

# Quick check: index the data once and fail fast when nothing usable was
# found, so a wrong DATA_DIR is reported here instead of as a KeyError below.
df_meta = build_metadata(DATA_DIR)
if df_meta.empty:
    raise FileNotFoundError(
        f"No usable .wav files found under {DATA_DIR!r}; check DATA_DIR."
    )
print(f"Found {len(df_meta)} files; accents = {df_meta['accent'].unique()}")
df_meta.head()

Found 3166 files; accents = [1 2 3 4 5]


Unnamed: 0,path,accent,gender
0,./Train/1f_1018.wav,1,f
1,./Train/1f_1026.wav,1,f
2,./Train/1f_1031.wav,1,f
3,./Train/1f_1070.wav,1,f
4,./Train/1f_1075.wav,1,f


In [2]:
# Audio settings shared by every clip: the sample rate all files are
# resampled to, and the fixed clip length (in samples) they are padded or
# trimmed to.
TARGET_SR = 16_000
MAX_LEN = TARGET_SR * 3   # three seconds' worth of samples at TARGET_SR

# Mel-spectrogram front end used by the "mel" feature mode.
mel_transform = T.MelSpectrogram(
    n_mels=64,
    n_fft=1024,
    hop_length=256,
    sample_rate=TARGET_SR,
)

def load_and_preprocess(path: str, mode: str="raw"):
    """Load a .wav file and turn it into a fixed-size feature tensor.

    Steps: load the file, down-mix to a single channel, resample to
    TARGET_SR, then zero-pad (at the end) or trim to exactly MAX_LEN samples.

    Args:
        path: Location of the audio file.
        mode: "raw" returns the waveform, "mel" returns the log-scaled
            mel-spectrogram computed by `mel_transform`.

    Returns:
        Raw waveform [1×MAX_LEN] for mode="raw", or mel-spec [1×n_mels×T]
        (passed through log1p) for mode="mel".

    Raises:
        ValueError: If `mode` is neither "raw" nor "mel".  Checked before
            any file I/O so a typo is reported immediately.
    """
    if mode not in ("raw", "mel"):
        raise ValueError("mode must be 'raw' or 'mel'")

    wav, sr = torchaudio.load(path)           # wav: [channels, L]
    # Multi-channel recordings are averaged down to mono; otherwise the
    # result would be [channels×MAX_LEN] and could not be batched together
    # with mono clips or fed to the single-input-channel models.
    if wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)
    if sr != TARGET_SR:
        wav = torchaudio.functional.resample(wav, sr, TARGET_SR)

    # pad/trim to a fixed number of samples
    n_samples = wav.size(1)
    if n_samples < MAX_LEN:
        wav = torch.nn.functional.pad(wav, (0, MAX_LEN - n_samples))
    else:
        wav = wav[:, :MAX_LEN]

    if mode == "raw":
        return wav
    # mode == "mel": log1p compresses the dynamic range of the spectrogram
    return torch.log1p(mel_transform(wav))    # [1, n_mels, T]

In [3]:
class AccentDataset(Dataset):
    """Dataset yielding ``(features, label_index)`` pairs for accent clips.

    Features are produced lazily, one file at a time, by
    `load_and_preprocess`; labels are the accent values of the metadata
    frame translated through `accent_map`.
    """

    # Feature modes accepted by load_and_preprocess.
    _VALID_MODES = ("raw", "mel")

    def __init__(self, meta_df: pd.DataFrame, mode="raw", accent_map=None):
        """
        meta_df: frame with at least the columns "path" and "accent"
        mode: "raw" or "mel" selects which features to compute
        accent_map: dict mapping accents to 0…4 indices (optional); when
            omitted it is derived from the sorted unique accents in meta_df

        Raises ValueError for an unknown mode, so a typo is reported at
        construction time rather than on the first item fetched inside a
        DataLoader loop.
        """
        if mode not in self._VALID_MODES:
            raise ValueError("mode must be 'raw' or 'mel'")
        self.df   = meta_df.reset_index(drop=True)
        self.mode = mode
        # allow arbitrary accent labels 1–5 => contiguous 0–4
        if accent_map is None:
            labels = sorted(self.df["accent"].unique())
            self.accent_map = {lab: i for i, lab in enumerate(labels)}
        else:
            self.accent_map = accent_map

    def __len__(self):
        """Number of clips (rows of the metadata frame)."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(features, label_index)`` for the clip at position idx."""
        # Index the two needed columns directly instead of materialising a
        # whole mixed-dtype row for every sample.
        path   = self.df["path"].iloc[idx]
        accent = self.df["accent"].iloc[idx]
        x = load_and_preprocess(path, mode=self.mode)
        y = self.accent_map[accent]
        return x, y

In [4]:
# Split into train/val (80/20).  The split uses an explicitly seeded
# generator so that re-running the notebook yields the same partition and
# validation numbers stay comparable between runs.
from torch.utils.data import random_split

SPLIT_SEED = 42

full_ds = AccentDataset(df_meta, mode="raw")
n_train = int(0.8 * len(full_ds))
n_val   = len(full_ds) - n_train
train_ds, val_ds = random_split(
    full_ds,
    [n_train, n_val],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=32)

In [5]:
import torch.nn as nn

class RawCNN1D(nn.Module):
    """1-D CNN that classifies raw waveforms.

    Three Conv1d → BatchNorm1d → ReLU stages (kernel 9, padding 4) with
    1→16→32→64 channels.  The first two stages end in MaxPool1d(4), the last
    one in a global average pool; a dropout + linear head then produces
    `n_classes` logits.
    """

    def __init__(self, n_classes=5):
        super().__init__()
        # (in_channels, out_channels) per convolution stage
        stage_channels = [(1, 16), (16, 32), (32, 64)]
        last_stage = len(stage_channels) - 1

        layers = []
        for stage, (c_in, c_out) in enumerate(stage_channels):
            layers.append(nn.Conv1d(c_in, c_out, kernel_size=9, stride=1, padding=4))
            layers.append(nn.BatchNorm1d(c_out))
            layers.append(nn.ReLU())
            # the final stage collapses the time axis instead of down-sampling
            if stage == last_stage:
                layers.append(nn.AdaptiveAvgPool1d(1))
            else:
                layers.append(nn.MaxPool1d(4))

        # classification head
        layers.extend([nn.Flatten(), nn.Dropout(0.3), nn.Linear(64, n_classes)])

        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # expects x.shape = [B, 1, MAX_LEN]; returns logits [B, n_classes]
        logits = self.net(x)
        return logits

In [6]:
class MelCNN2D(nn.Module):
    """2-D CNN that classifies mel-spectrogram inputs.

    Three Conv2d → BatchNorm2d → ReLU stages (3×3 kernels, padding 1) with
    1→16→32→64 channels.  The first two stages are followed by 2×2 max
    pooling, the last by a global average pool; a dropout + linear head then
    produces `n_classes` logits.
    """

    def __init__(self, n_classes=5):
        super().__init__()

        def conv_stage(c_in, c_out, pool):
            # one Conv → BN → ReLU unit followed by the given pooling layer
            return [
                nn.Conv2d(c_in, c_out, kernel_size=3, padding=1),
                nn.BatchNorm2d(c_out),
                nn.ReLU(),
                pool,
            ]

        self.net = nn.Sequential(
            *conv_stage(1, 16, nn.MaxPool2d((2, 2))),
            *conv_stage(16, 32, nn.MaxPool2d((2, 2))),
            *conv_stage(32, 64, nn.AdaptiveAvgPool2d((1, 1))),
            # classification head
            nn.Flatten(),
            nn.Dropout(0.3),
            nn.Linear(64, n_classes),
        )

    def forward(self, x):
        # expects x.shape = [B, 1, n_mels, T]; returns logits [B, n_classes]
        logits = self.net(x)
        return logits

In [7]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def _train_one_epoch(model, loader, crit, opt, device):
    """Run one optimisation pass over `loader`; return the mean per-sample loss."""
    model.train()
    loss_sum, n_seen = 0.0, 0
    for x, y in loader:
        x, y = x.to(device), y.to(device)
        opt.zero_grad()
        loss = crit(model(x), y)
        loss.backward()
        opt.step()
        # crit averages over the batch, so weight by batch size to get a sum
        loss_sum += loss.item() * x.size(0)
        n_seen += x.size(0)
    if n_seen == 0:
        raise ValueError("train_loader yielded no samples")
    return loss_sum / n_seen


def _evaluate(model, loader, crit, device):
    """Score `model` on `loader` without gradients.

    Returns (mean per-sample loss, predicted class list, target class list).
    """
    model.eval()
    loss_sum, n_seen = 0.0, 0
    all_preds, all_targets = [], []
    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            logits = model(x)
            loss_sum += crit(logits, y).item() * x.size(0)
            n_seen += x.size(0)
            all_preds.extend(logits.argmax(dim=1).cpu().tolist())
            all_targets.extend(y.cpu().tolist())
    if n_seen == 0:
        raise ValueError("val_loader yielded no samples")
    return loss_sum / n_seen, all_preds, all_targets


def run_experiment(model, train_loader, val_loader, epochs=10, lr=1e-3, weight_decay=1e-4):
    """Train `model` with Adam + cross-entropy and validate after every epoch.

    Losses are averaged over the samples actually yielded by each loader
    (not over ``len(loader.dataset)``), so the numbers stay correct when a
    loader drops the last batch or uses a sampler.

    Args:
        model: Network mapping a batch of inputs to class logits.
        train_loader: Loader yielding ``(inputs, integer targets)`` batches
            used for optimisation.
        val_loader: Loader yielding batches used for evaluation only.
        epochs: Number of passes over `train_loader`.
        lr: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.

    Returns:
        ``(model, history)`` where `model` has been moved to the selected
        device (CUDA when available, else CPU) and `history` is a dict with
        per-epoch lists under "train_loss", "val_loss" and "val_acc".

    Raises:
        ValueError: If either loader yields no samples.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model  = model.to(device)
    opt    = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    crit   = nn.CrossEntropyLoss()

    history = {"train_loss": [], "val_loss": [], "val_acc": []}

    for epoch in range(1, epochs+1):
        # ——— train ———
        train_loss = _train_one_epoch(model, train_loader, crit, opt, device)
        history["train_loss"].append(train_loss)

        # ——— eval ———
        val_loss, all_preds, all_targets = _evaluate(model, val_loader, crit, device)
        val_acc = accuracy_score(all_targets, all_preds)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        print(f"Epoch {epoch:02d} | Train loss: {train_loss:.4f} "
              f"| Val loss: {val_loss:.4f} | Val acc: {val_acc:.4f}")

    return model, history

In [9]:
# Raw-waveform experiment
# The train/val split is drawn from an explicitly seeded generator so the
# reported validation metrics are reproducible across kernel restarts.
raw_ds  = AccentDataset(df_meta, mode="raw")
n_train = int(0.8*len(raw_ds))
train_ds, val_ds = random_split(
    raw_ds,
    [n_train, len(raw_ds)-n_train],
    generator=torch.Generator().manual_seed(42),
)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=32)

raw_model = RawCNN1D(n_classes=5)
raw_model, raw_hist = run_experiment(raw_model, train_loader, val_loader,
                                     epochs=5, lr=1e-3)

# Mel-spectrogram experiment (currently disabled; uncomment to run).
# NOTE: it reuses n_train and raw_ds.accent_map from the raw experiment above.
# mel_ds = AccentDataset(df_meta, mode="mel", accent_map=raw_ds.accent_map)
# train_ds, val_ds = random_split(mel_ds, [n_train, len(mel_ds)-n_train])
# train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
# val_loader   = DataLoader(val_ds,   batch_size=32)

# mel_model, mel_hist = run_experiment(MelCNN2D(5), train_loader, val_loader,
#                                      epochs=15, lr=1e-3)

Epoch 01 | Train loss: 1.5311 | Val loss: 1.4905 | Val acc: 0.3454
Epoch 02 | Train loss: 1.4048 | Val loss: 1.5587 | Val acc: 0.2918


KeyboardInterrupt: 