# Group assignment DL - Accent classification

## Data pre-processing

In [1]:
# %pip install torch
# %pip install -r requirements.txt # uncomment when all required libraries are defined

import os
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


In [None]:
# Demo on a single file: load it, then build both candidate RNN inputs
# (raw waveform chunks and MFCC features).
waveform, sample_rate = torchaudio.load('path/to/your/file.wav')

# Convert to mono if stereo by averaging the channels -> shape (1, num_samples)
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Option 1: Using Raw Waveform
# Split into fixed-length chunks of 16000 samples. This is 1 second only when
# the file is sampled at 16 kHz; no resampling is done in this cell.
chunk_length = 16000
chunks = [waveform[:, i:i+chunk_length] for i in range(0, waveform.shape[1], chunk_length)]

# Each chunk has shape (1, chunk_length), except the last one, which is shorter
# whenever the number of samples is not a multiple of chunk_length.
print("Raw waveform chunks:", [chunk.shape for chunk in chunks])

# Option 2: Using MFCC Features
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=40)
mfcc = mfcc_transform(waveform)  # Shape: (1, 40, num_frames) i.e. (channel, n_mfcc, time)

# Reshape for RNN: (num_frames, batch_size=1, 40).
# The time axis is the LAST axis of the MFCC output, so it has to be moved to
# the front; permute(1, 0, 2) would produce (40, 1, num_frames) instead.
mfcc_input = mfcc.permute(2, 0, 1)
print("MFCC shape for RNN:", mfcc_input.shape)

# Example: If you want to use the entire waveform without splitting (for short files)
# Add batch dimension: (batch_size=1, 1, num_samples)
raw_input = waveform.unsqueeze(0)
print("Raw waveform full sequence shape:", raw_input.shape)

In [None]:
# Set target sample rate (e.g., 16000 Hz)
target_sr = 16000

# Directory containing the WAV files
wav_dir = 'path/to/wav_files'  # Replace with your directory path

# Get list of WAV files in the directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the list (and everything saved from it) in a reproducible order that
# can be matched against labels later. The extension check is
# case-insensitive, like the one in build_metadata.
wav_files = [
    os.path.join(wav_dir, f)
    for f in sorted(os.listdir(wav_dir))
    if f.lower().endswith('.wav')
]

# Function to load and preprocess a single WAV file
def load_and_preprocess(wav_path, target_sr):
    """
    Load one WAV file as a mono waveform at the requested sample rate.

    Args:
        wav_path: Path of the WAV file to read.
        target_sr: Sample rate the returned waveform should have.
    Returns:
        The waveform with its channel dimension squeezed out
        (shape (num_samples,) for a single-channel result).
    """
    signal, source_sr = torchaudio.load(wav_path)

    # Only resample when the file is not already at the target rate
    needs_resampling = source_sr != target_sr
    if needs_resampling:
        signal = torchaudio.transforms.Resample(source_sr, target_sr)(signal)

    # Collapse multi-channel audio to one channel by averaging
    num_channels = signal.shape[0]
    if num_channels > 1:
        signal = signal.mean(dim=0, keepdim=True)

    return signal.squeeze(0)

# Run the preprocessing over every listed WAV file, keeping the order of wav_files
audio_tensors = [
    load_and_preprocess(path, target_sr)
    for path in wav_files
]

# Persist the whole list in a single file
output_file = 'audio_tensors.pt'
torch.save(audio_tensors, output_file)

# Optional: the list can be restored later with
# audio_tensors = torch.load('audio_tensors.pt')

print(f"Loaded {len(audio_tensors)} audio samples and saved to '{output_file}'")

In [None]:
import os
import torchaudio
import torch
import torchaudio.transforms as T

# Set target sample rate (e.g., 16000 Hz)
target_sr = 16000

# Directory containing the WAV files
wav_dir = 'path/to/wav_files'  # Replace with your directory path

# Get list of WAV files in the directory.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the list (and the MFCC list saved from it) in a reproducible order that
# can be matched against labels later. The extension check is
# case-insensitive, like the one in build_metadata.
wav_files = [
    os.path.join(wav_dir, f)
    for f in sorted(os.listdir(wav_dir))
    if f.lower().endswith('.wav')
]

# Define MFCC transform (40 coefficients, computed at the target sample rate)
mfcc_transform = T.MFCC(sample_rate=target_sr, n_mfcc=40)

# Function to load and preprocess a single WAV file into MFCCs
def load_and_preprocess(wav_path, target_sr, mfcc_transform):
    """
    Load one WAV file and turn it into a frame-major MFCC matrix.

    Args:
        wav_path: Path of the WAV file to read.
        target_sr: Sample rate the audio is brought to before feature extraction.
        mfcc_transform: Callable applied to the (1, num_samples) waveform.
    Returns:
        MFCC tensor of shape (num_frames, num_mfcc).
    """
    signal, source_sr = torchaudio.load(wav_path)

    # Bring the audio to the target rate when the file uses a different one
    if source_sr != target_sr:
        signal = T.Resample(source_sr, target_sr)(signal)

    # Average the channels of multi-channel audio; mono input is left untouched
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    features = mfcc_transform(signal)  # (1, num_mfcc, num_frames)

    # Drop the channel axis, then put frames first
    return features.squeeze(0).T

# Extract MFCCs for every listed WAV file, keeping the order of wav_files
audio_mfccs = [
    load_and_preprocess(path, target_sr, mfcc_transform)
    for path in wav_files
]

# Persist the whole list of MFCC tensors in a single file
output_file = 'audio_mfccs.pt'
torch.save(audio_mfccs, output_file)

print(f"Loaded {len(audio_mfccs)} audio samples' MFCCs and saved to '{output_file}'")

In [2]:
def build_metadata(data_dir: str) -> pd.DataFrame:
    """
    Walk through data_dir and collect file paths, accents, and genders.

    File names are expected to start with the accent digit followed by the
    gender letter (e.g. "2f_xxx.wav").

    Args:
        data_dir: Root directory, searched recursively for files whose
            extension is ".wav" (case-insensitive).
    Returns:
        DataFrame with the columns "path", "accent" (int) and "gender"
        (lower-cased letter), one row per WAV file. Directories and files are
        visited in sorted order so the row order is reproducible. The three
        columns are present even when no WAV file is found.
    Raises:
        ValueError: If a WAV file name does not start with a digit.
    """
    columns = ["path", "accent", "gender"]
    records = []
    for root, dirs, files in os.walk(data_dir):
        # os.walk yields entries in arbitrary order; sorting in place fixes the
        # traversal order so a seeded train/val split is reproducible.
        dirs.sort()
        for fname in sorted(files):
            if not fname.lower().endswith(".wav"):
                continue
            path = os.path.join(root, fname)
            try:
                accent = int(fname[0])          # '1'–'5'
            except ValueError as err:
                raise ValueError(
                    f"Cannot read accent from file name {path!r}: "
                    "expected it to start with a digit"
                ) from err
            gender = fname[1].lower()           # 'm' or 'f'
            records.append({"path": path, "accent": accent, "gender": gender})
    # Explicit columns keep the schema stable when records is empty.
    return pd.DataFrame(records, columns=columns)

In [None]:
import torch
from torch.utils.data import Dataset

class MFCCDataset(Dataset):
    """Dataset that pairs variable-length MFCC sequences with their labels."""

    def __init__(self, audio_mfcc, labels):
        """
        Args:
            audio_mfcc (list): List of MFCC tensors, each [num_frames, n_mfcc].
            labels (list or tensor): List or tensor of labels, one per audio sample.
        Raises:
            ValueError: If the number of labels differs from the number of
                MFCC tensors.
        """
        # Fail at construction time: a mismatch would otherwise only show up
        # as an IndexError mid-epoch, or silently ignore surplus labels.
        if len(audio_mfcc) != len(labels):
            raise ValueError(
                f"Got {len(audio_mfcc)} MFCC tensors but {len(labels)} labels; "
                "expected one label per audio sample"
            )
        self.audio_mfcc = audio_mfcc
        self.labels = labels

    def __len__(self):
        """Return the total number of samples."""
        return len(self.audio_mfcc)

    def __getitem__(self, idx):
        """Return the MFCC tensor, label, and sequence length for the sample at idx."""
        mfcc = self.audio_mfcc[idx]  # Shape: [num_frames, n_mfcc]
        label = self.labels[idx]     # Scalar label
        length = mfcc.size(0)        # Number of frames (sequence length)
        return mfcc, label, length

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """
    Assemble variable-length MFCC samples into a single padded batch.

    The samples are reordered by sequence length, longest first, before
    padding; labels and lengths follow the same order.

    Args:
        batch (list): List of tuples (mfcc, label, length) from the dataset.
    Returns:
        tuple of three tensors:
            padded MFCCs [batch_size, max_seq_len, n_mfcc],
            labels [batch_size] (torch.long),
            original sequence lengths [batch_size] (torch.long).
    """
    # Longest sequence first
    ordered = sorted(batch, key=lambda sample: sample[2], reverse=True)

    # Transpose the list of samples into three parallel tuples
    features, targets, frame_counts = zip(*ordered)

    return (
        # Shorter sequences are padded up to the longest one in the batch
        pad_sequence(features, batch_first=True),
        torch.tensor(targets, dtype=torch.long),
        torch.tensor(frame_counts, dtype=torch.long),
    )

In [None]:
from torch.utils.data import DataLoader

# One label per WAV file, in the same order as wav_files (and therefore as
# audio_mfccs, which was built from that list). The accent id is read from
# the first character of the file name, the convention build_metadata uses.
# NOTE(review): assumes the files in wav_dir follow that naming convention,
# and keeps the ids as written in the names ('1'–'5'); shift to zero-based
# classes if the loss function requires it — confirm.
labels = [int(os.path.basename(path)[0]) for path in wav_files]

# Create the dataset from the MFCC list built earlier (its name is
# `audio_mfccs`; `audio_mfcc` is not defined anywhere in the notebook)
dataset = MFCCDataset(audio_mfccs, labels)

# Create the DataLoader; collate_fn pads each batch to its longest sequence
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

In [None]:
# Training dataframe based on accent & gender metadata
df = build_metadata("/Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Train")

# One dataset per input representation ("raw" first, then "mel"),
# both created with max_length=16000*5
raw_ds, mel_ds = (
    AccentDataset(df, approach=kind, max_length=16000*5)
    for kind in ("raw", "mel")
)

# Dataloaders: same batch size, shuffling and worker count for both datasets
batch_size = 32
raw_loader, mel_loader = (
    DataLoader(ds, batch_size=batch_size, shuffle=True, num_workers=4)
    for ds in (raw_ds, mel_ds)
)


In [None]:
import os
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split


# 80/20 train/validation split, stratified on accent so that every accent
# appears in roughly the same proportion in both parts
val_fraction = 0.2

# Both parts get a fresh 0..n-1 index right after the split
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df,
        test_size=val_fraction,
        random_state=42,
        stratify=df["accent"],
    )
)



max_length = 16000 * 5  # 5 seconds
batch_size = 32
num_workers = 4

# Raw waveform datasets (raw 1D signal): training split first, then validation
train_ds, val_ds = (
    AccentDataset(metadata_df=frame, approach="raw", max_length=max_length)
    for frame in (df_train, df_val)
)

# Only the training loader shuffles; validation keeps the dataset order
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=num_workers)


