# Group assignment DL - Accent classification

## Data pre-processing

In [1]:
# %pip install torch
# %pip install -r requirements.txt # uncomment when all required libraries are defined

import os
import pandas as pd
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F


In [3]:
# Load the WAV file
waveform, sample_rate = torchaudio.load('/Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Train')

# Convert to mono if stereo
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Option 1: Using Raw Waveform
# Split into fixed-length chunks (e.g., 1 second at 16kHz)
chunk_length = 16000
chunks = [waveform[:, i:i+chunk_length] for i in range(0, waveform.shape[1], chunk_length)]

# Each chunk is a tensor of shape (1, chunk_length), ready for RNN input
# For a single file, you can use these chunks directly
print("Raw waveform chunks:", [chunk.shape for chunk in chunks])

# Option 2: Using MFCC Features
mfcc_transform = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=40)
mfcc = mfcc_transform(waveform)  # Shape: (1, num_frames, 40)

# Reshape for RNN: (num_frames, batch_size=1, 40)
mfcc_input = mfcc.permute(1, 0, 2)
print("MFCC shape for RNN:", mfcc_input.shape)

# Example: If you want to use the entire waveform without splitting (for short files)
# Add batch dimension: (batch_size=1, 1, num_frames)
raw_input = waveform.unsqueeze(0)
print("Raw waveform full sequence shape:", raw_input.shape)

LibsndfileError: Error opening '/Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Train': Format not recognised.

In [2]:
import os
import torchaudio
import torch
import torchaudio.transforms as T

# Set target sample rate (e.g., 16000 Hz)
target_sr = 16000

# Directory containing the WAV files
wav_dir = '/Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Train'  # Replace with your directory path

# Get list of WAV files in the directory
wav_files = [os.path.join(wav_dir, f) for f in os.listdir(wav_dir) if f.endswith('.wav')]

# Define MFCC transform
mfcc_transform = T.MFCC(sample_rate=target_sr, n_mfcc=40)

# Function to load and preprocess a single WAV file into MFCCs
def load_and_preprocess(wav_path, target_sr, mfcc_transform):
    # Load the WAV file
    waveform, original_sr = torchaudio.load(wav_path)
    
    # Resample if the original sample rate differs from the target
    if original_sr != target_sr:
        resampler = T.Resample(original_sr, target_sr)
        waveform = resampler(waveform)
    
    # Convert stereo to mono by averaging the channels
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # else, it's already (1, num_samples)
    
    # Apply MFCC transform
    mfcc = mfcc_transform(waveform)  # shape: (1, num_mfcc, num_frames)
    
    # Squeeze the channel dimension and transpose
    mfcc = mfcc.squeeze(0).T  # shape: (num_frames, num_mfcc)
    
    return mfcc

# Load and preprocess all WAV files into a list of MFCC tensors
audio_mfccs = [load_and_preprocess(wav_file, target_sr, mfcc_transform) for wav_file in wav_files]

# Save the list of MFCC tensors to a file
output_file = 'audio_mfccs.pt'
torch.save(audio_mfccs, output_file)

print(f"Loaded {len(audio_mfccs)} audio samples' MFCCs and saved to '{output_file}'")



Loaded 3166 audio samples' MFCCs and saved to 'audio_mfccs.pt'


In [1]:
def build_metadata(data_dir: str):
    """
    Walk through data_dir and collect file paths, accents, and genders.
    """
    records = []
    for root, _, files in os.walk(data_dir):
        for fname in files:
            if fname.lower().endswith(".wav"):
                path = os.path.join(root, fname)
                accent = int(fname[0])          # '1'–'5'
                gender = fname[1].lower()       # 'm' or 'f'
                records.append({"path": path, "accent": accent, "gender": gender})
    return pd.DataFrame(records)

In [6]:
df = build_metadata("/Users/bramdewaal/Desktop/Uni/VSC/Deep Learning/Assignment/Train") # Training dataframe based on accent & gender metadata
