# Group assignment DL - Accent classification

## Data pre-processing

In [4]:
# %pip install torch
# %pip install -r requirements.txt 
import os
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


In [5]:
def build_metadata(data_dir: str):
    """
    Analyzing and collecting all metadata from the audio files (gender, male or female)
    """
    records = []
    for root, _, files in os.walk(data_dir):
        for fname in files:
            if fname.lower().endswith(".wav"):
                path = os.path.join(root, fname)
                accent = int(fname[0])          # '1'–'5'
                gender = fname[1].lower()       # 'm' or 'f'
                records.append({"path": path, "accent": accent, "gender": gender})
    return pd.DataFrame(records)


class AccentDataset(Dataset):
    """
    PyTorch Dataset for loading, preprocessing, and feature-extracting audio.
    """
    def __init__(
        self,
        metadata_df: pd.DataFrame,
        approach: str = "raw",  # "raw" or "mel"
        max_length: int = 16000 * 5,  # 5 seconds
        sample_rate: int = 16000,
        transform: torch.nn.Module = None,
        target_transform = None
    ):
        self.df = metadata_df.reset_index(drop=True)
        self.approach = approach
        self.max_length = max_length
        self.sample_rate = sample_rate
        self.transform = transform
        self.target_transform = target_transform

        # Silence trimming (VAD)
        self.vad = torchaudio.transforms.Vad(sample_rate=sample_rate)

        # Feature transforms (for 'mel' approach)
        self.mel_spectrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_mels=64,
            n_fft=1024,
            hop_length=512
        )
        self.db_transform = torchaudio.transforms.AmplitudeToDB()

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        waveform, sr = torchaudio.load(row["path"])

        # Resample if needed
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(sr, self.sample_rate)(waveform)

        # Convert to mono
        if waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Trim leading/trailing silence
        waveform = self.vad(waveform)

        # Pad or truncate to fixed length
        length = waveform.size(1)
        if length < self.max_length:
            pad_amt = self.max_length - length
            waveform = F.pad(waveform, (0, pad_amt))
        else:
            waveform = waveform[:, :self.max_length]

        # Per-sample normalization
        waveform = (waveform - waveform.mean()) / (waveform.std() + 1e-9)

        # Optional augmentations
        if self.transform is not None:
            waveform = self.transform(waveform)

        # Feature extraction
        if self.approach == "raw":
            features = waveform  # shape: [1, max_length]
        elif self.approach == "mel":
            mel_spec = self.mel_spectrogram(waveform)
            features = self.db_transform(mel_spec)  # shape: [1, n_mels, time_steps]
        else:
            raise ValueError("approach must be 'raw' or 'mel'")

        label = row["accent"]
        if self.target_transform is not None:
            label = self.target_transform(label)

        return features, label



In [11]:
df = build_metadata( "Train" ) # Training dataframe based on accent & gender metadata
raw_ds = AccentDataset(df, approach="raw",  max_length=16000*5)
mel_ds = AccentDataset(df, approach="mel",  max_length=16000*5)

# Dataloaders
batch_size = 32
raw_loader = DataLoader(raw_ds, batch_size=batch_size, shuffle=True, num_workers=4)
mel_loader = DataLoader(mel_ds, batch_size=batch_size, shuffle=True, num_workers=4)

## 1.2a: Raw input signal -> analyze as 1D signal -> standardize

In [14]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split


# Splitting into train/val (80/20) with stratification on accent, so they each appear ~ in the same proportion in train/validation set
val_fraction = 0.2
df_train, df_val = train_test_split(
    df,
    test_size=val_fraction,
    random_state=42,
    stratify=df["accent"]
)
df_train = df_train.reset_index(drop=True)
df_val   = df_val.reset_index(drop=True)



max_length = 16000 * 5  # 5 seconds
batch_size = 32
num_workers = 4

# Raw waveform datasets and loaders
train_ds = AccentDataset(
    metadata_df=df_train,
    approach="raw",       # raw 1D signal
    max_length=max_length,
)
val_ds   = AccentDataset(
    metadata_df=df_val,
    approach="raw",
    max_length=max_length,
)

train_loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    num_workers=num_workers
)
val_loader = DataLoader(
    val_ds,
    batch_size=batch_size,
    shuffle=False,
    num_workers=num_workers
)




## Raw models

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report



class RawRNN1D(nn.Module):
    def __init__(self, input_size=1, hidden_size=128, num_layers=2,
                 num_classes=5, p_dropout=0.3):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=p_dropout,
            nonlinearity='tanh'
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p_dropout),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        # x: [B, 1, T] -> [B, T, 1]
        x = x.transpose(1, 2)
        # rnn_out: [B, T, hidden]; h_n: [num_layers, B, hidden]
        rnn_out, h_n = self.rnn(x)
        # use last hidden state from top layer
        last_h = h_n[-1]               # [B, hidden]
        return self.classifier(last_h)


class RawLSTM1D(nn.Module):
    def __init__(self, input_size=1, hidden_size=128, num_layers=2,
                 num_classes=5, p_dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=p_dropout
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p_dropout),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        # x: [B, 1, T] -> [B, T, 1]
        x = x.transpose(1, 2)
        # lstm_out: [B, T, hidden]; (h_n, c_n)
        _, (h_n, _) = self.lstm(x)
        # last hidden state
        last_h = h_n[-1]               # [B, hidden]
        return self.classifier(last_h)


## Training models

In [None]:
# %conda create -n dl-dml python=3.10
# %conda activate dl-dml


ValueError: The python kernel does not appear to be a conda environment.  Please use ``%pip install`` instead.

In [29]:
!pip install --upgrade pip




In [27]:
!pip install https://files.pythonhosted.org/packages/84/8b/00528e6c75e030cc5f1fc1d08c58c46ecdbec9cd406b1dfd03023e3af4aa/torch_directml-0.2.5.dev240914-cp311-cp311-win_amd64.whl  

Collecting torch-directml==0.2.5.dev240914
  Using cached torch_directml-0.2.5.dev240914-cp311-cp311-win_amd64.whl (9.0 MB)


In [28]:

try:
    import torch_directml
    device = torch_directml.device()
    print("Using DirectML on AMD GPU:", device)
except ImportError:
    import torch
    device = torch.device("cpu")
    print("DirectML unavailable—using CPU:", device)


DirectML unavailable—using CPU: cpu


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report
import torch_directml                # ← add this

# --- 1. Define RNN and LSTM models ---

class RawRNN1D(nn.Module):
    def __init__(self, input_size=1, hidden_size=128, num_layers=2,
                 num_classes=5, p_dropout=0.3):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=p_dropout,
            nonlinearity='tanh'
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p_dropout),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        x = x.transpose(1, 2)        # [B,1,T] → [B,T,1]
        rnn_out, h_n = self.rnn(x)
        last_h = h_n[-1]             # [B, hidden]
        return self.classifier(last_h)


class RawLSTM1D(nn.Module):
    def __init__(self, input_size=1, hidden_size=128, num_layers=2,
                 num_classes=5, p_dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=p_dropout
        )
        self.classifier = nn.Sequential(
            nn.Dropout(p_dropout),
            nn.Linear(hidden_size, num_classes)
        )

    def forward(self, x):
        x = x.transpose(1, 2)        # [B,1,T] → [B,T,1]
        _, (h_n, _) = self.lstm(x)
        last_h = h_n[-1]             # [B, hidden]
        return self.classifier(last_h)


# --- 2. Instantiate, optimize, and train both models ---

device = torch_directml.device()  # ← use DirectML on your AMD GPU

criterion = nn.CrossEntropyLoss()

def run_experiment(model_cls, train_loader, val_loader, **model_kwargs):
    model = model_cls(**model_kwargs).to(device)
    optimizer = optim.Adam(
        model.parameters(),
        lr=1e-3,
        weight_decay=1e-4
    )

    history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}
    for epoch in range(1, 11):
        # --- Training ---
        model.train()
        total_loss, correct, count = 0, 0, 0
        for x, y in train_loader:
            x, y = x.to(device), (y-1).to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * x.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds == y).sum().item()
            count += x.size(0)
        train_loss = total_loss / count
        train_acc = correct / count

        # --- Validation ---
        model.eval()
        total_loss, correct, count = 0, 0, 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), (y-1).to(device)
                logits = model(x)
                loss = criterion(logits, y)
                total_loss += loss.item() * x.size(0)
                preds = logits.argmax(dim=1)
                correct += (preds == y).sum().item()
                count += x.size(0)
        val_loss = total_loss / count
        val_acc = correct / count

        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        print(f"{model.__class__.__name__} "
              f"Epoch {epoch:02d}  "
              f"Train: {train_loss:.3f}, {train_acc:.3f} | "
              f"Val: {val_loss:.3f}, {val_acc:.3f}")

    return model, history

# …then call run_experiment(RawRNN1D, …) and run_experiment(RawLSTM1D, …) as before.


In [30]:


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

def run_experiment(model_cls, train_loader, val_loader, **model_kwargs):
    model = model_cls(**model_kwargs).to(device)
    optimizer = optim.Adam(
        model.parameters(),
        lr=1e-3,
        weight_decay=1e-4    # L2 regularization
    )

    history = {"train_loss": [], "train_acc": [], "val_loss": [], "val_acc": []}
    for epoch in range(1, 11):
        # Train
        model.train()
        total_loss, correct, count = 0, 0, 0
        for x, y in train_loader:
            x, y = x.to(device), (y-1).to(device)
            optimizer.zero_grad()
            logits = model(x)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * x.size(0)
            preds = logits.argmax(dim=1)
            correct += (preds==y).sum().item()
            count   += x.size(0)
        train_loss = total_loss/count
        train_acc  = correct/count

        # Validate
        model.eval()
        total_loss, correct, count = 0, 0, 0
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.to(device), (y-1).to(device)
                logits = model(x)
                loss = criterion(logits, y)
                total_loss += loss.item() * x.size(0)
                preds = logits.argmax(dim=1)
                correct += (preds==y).sum().item()
                count   += x.size(0)
        val_loss = total_loss/count
        val_acc  = correct/count

        history["train_loss"].append(train_loss)
        history["train_acc"].append(train_acc)
        history["val_loss"].append(val_loss)
        history["val_acc"].append(val_acc)

        print(f"{model.__class__.__name__} Epoch {epoch:02d}  "
              f"Train: {train_loss:.3f}, {train_acc:.3f} | "
              f"Val: {val_loss:.3f}, {val_acc:.3f}")

    return model, history

# Run on your DataLoaders (raw_loader defined earlier)
rnn_model, rnn_hist  = run_experiment(
    RawRNN1D, train_loader=raw_loader, val_loader=val_loader,
    input_size=1, hidden_size=128, num_layers=2, p_dropout=0.3, num_classes=5
)
lstm_model, lstm_hist = run_experiment(
    RawLSTM1D, train_loader=raw_loader, val_loader=val_loader,
    input_size=1, hidden_size=128, num_layers=2, p_dropout=0.3, num_classes=5
)


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/Users/bramdewaal/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bramdewaal/anaconda3/lib/python3.11/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'AccentDataset' on <module '__main__' (built-in)>


KeyboardInterrupt: 