

## Folder structure

The following folder structure will be used in this project:

VoiceSimilarityAnalysis-code
├── data
│   ├── download/              # Contains downloaded zip files (e.g. ABI-1_Corpus.zip)
│   ├── raw/                   # Unzipped original dataset (14 accent folders)
│   └── cleansed/              # Contains only the filtered "shortpassage" .wav files
│
├── reports/                  # Drafts and final version of the report
│
├── results/                  # Output files: embeddings, similarity scores, matrices, plots
│
├── appendix/                 # Generative AI chat logs for submission


In [8]:
# all imports go here
import os
import zipfile
import gdown
import shutil
import re

## Step 1 - download the dataset

To fully automate the process, the dataset is downloaded as a zip file into `data/download` and then unzipped into `data/raw` using the following code.


In [6]:
# 7 minutes on Costa wifi -- execute and have a coffee
# (the download is skipped when a valid archive is already on disk)

# gdrive id
file_id = "18FWBn4B6gQifOtf1C9JCQv4Lrs8C1uvu"

# folder that holds the downloaded archive
download_folder = os.path.join("data", "download", "ABI-1_Corpus")
os.makedirs(download_folder, exist_ok=True)  # exist_ok makes a prior existence check unnecessary

download_path = os.path.join(download_folder, "ABI-1_Corpus.zip")

# download the file only when no valid zip is present;
# is_zipfile() is False for a missing file and for a file that is not a readable zip
if zipfile.is_zipfile(download_path):
    print(f"Archive already present, skipping download: {download_path}")
else:
    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", download_path, quiet=False)
    # fail loudly here rather than with a confusing error while unzipping
    if downloaded is None or not zipfile.is_zipfile(download_path):
        raise RuntimeError(f"Download failed or archive is corrupt: {download_path}")

# open the zip and extract it into the raw data folder
raw_folder = os.path.join("data", "raw", "ABI-1_Corpus")
os.makedirs(raw_folder, exist_ok=True)

with zipfile.ZipFile(download_path, 'r') as zip_ref:
    zip_ref.extractall(raw_folder)

print("Raw data downloaded...")


Downloading...
From (original): https://drive.google.com/uc?id=18FWBn4B6gQifOtf1C9JCQv4Lrs8C1uvu
From (redirected): https://drive.google.com/uc?id=18FWBn4B6gQifOtf1C9JCQv4Lrs8C1uvu&confirm=t&uuid=e299ebfa-0e41-4c1f-8280-7b9ba24e9779
To: /Users/carmelgafa/Documents/my-work/ari5121-project/VoiceSimilarityAnalysis-code/data/download/ABI-1_Corpus/ABI-1_Corpus.zip
100%|██████████| 2.82G/2.82G [04:27<00:00, 10.5MB/s]


Raw data downloaded...


## Step 2 - Cleanse the dataset

We will keep only the "shortpassage*.wav" files of each speaker, for every accent in the dataset.

In [None]:
raw_data_folder = "data/raw/ABI-1_Corpus/ABI-1 Corpus/accents"
cleansed_dir = "data/cleansed"


def _visible_subdirs(parent):
    """Return the names of the non-hidden sub-directories of `parent`.

    Entries whose name starts with "." are skipped, and so are plain files,
    so that a stray file next to the folders cannot break the traversal
    (os.listdir on a file raises NotADirectoryError).
    """
    return [
        name
        for name in os.listdir(parent)
        if not name.startswith(".") and os.path.isdir(os.path.join(parent, name))
    ]


# list accent folders, removing hidden entries and stray files
accents = _visible_subdirs(raw_data_folder)
# go through all accents
for accent in accents:
    accent_path = os.path.join(raw_data_folder, accent)
    # list all genders in each accent
    genders = _visible_subdirs(accent_path)
    # go through all genders in each accent
    for gender in genders:
        gender_folder = os.path.join(accent_path, gender)
        # list all speakers in each gender and go through them
        speakers = _visible_subdirs(gender_folder)
        for speaker in speakers:
            speaker_path = os.path.join(gender_folder, speaker)

            # store resulting data in cleansed/accent/gender/speaker
            dest_path = os.path.join(cleansed_dir, accent, gender, speaker)
            os.makedirs(dest_path, exist_ok=True)

            # go through all files and copy only those named shortpassage*.wav
            for filename in os.listdir(speaker_path):
                if re.fullmatch(r"shortpassage.*\.wav", filename):

                    src_file = os.path.join(speaker_path, filename)
                    dst_file = os.path.join(dest_path, filename)
                    # copy2 also copies file metadata along with the content
                    shutil.copy2(src_file, dst_file)

print("Cleansing completed...")


Cleansing completed...


In [None]:
from transformers import Wav2Vec2FeatureExtractor, AutoModel


# model id on the Hugging Face hub and the local folder used as download cache
model_name = "microsoft/wavlm-base-plus-sv"
local_dir = "model"

# empty the model folder so the download below starts from a clean cache
if os.path.exists(local_dir):
    print(f"🧹 Removing existing model folder: {local_dir}")
    shutil.rmtree(local_dir)



# download the feature extractor and the model into local_dir
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name, cache_dir=local_dir)
model = AutoModel.from_pretrained(model_name, cache_dir=local_dir)

# plain string: the message has no placeholders, so no f-string is needed
print("Model downloaded...")


🧹 Removing existing model folder: model
Model downloaded...


In [None]:
Just testing the example code from Hugging Face on a single file.

In [30]:
import torch
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, AutoModel

# Sampling rate (Hz) the audio is resampled to and that is passed to the feature extractor
target_sample_rate = 16000

# Load model and feature extractor.
# cache_dir="model" reuses the local folder filled by the model-download cell
# instead of fetching the weights a second time into the default cache.
# NOTE(review): this checkpoint is a speaker-verification model; confirm that
# AutoModel + mean pooling (rather than a dedicated x-vector head) is intended.
model_name = "microsoft/wavlm-base-plus-sv"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name, cache_dir="model")
model = AutoModel.from_pretrained(model_name, cache_dir="model")
model.eval()  # Inference mode

# Load a .wav file
file_path = "data/cleansed/BRM/male/spk01/shortpassage_001.wav"  # <- Change this as needed
# Fail with a clear message when the path is wrong, instead of torchaudio's
# generic "Couldn't find appropriate backend" error.
if not os.path.isfile(file_path):
    raise FileNotFoundError(f"Audio file not found: {file_path}")
# NOTE(review): torchaudio.load presumably needs an audio I/O backend installed
# in the environment; if the file exists and this still fails, verify that.
waveform, sample_rate = torchaudio.load(file_path)

# Resample to the target rate if needed
if sample_rate != target_sample_rate:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
    waveform = resampler(waveform)

# Convert stereo to mono if needed (average over the channel dimension)
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Extract input values
inputs = feature_extractor(waveform.squeeze().numpy(), sampling_rate=target_sample_rate, return_tensors="pt")

# Forward pass through model to get embeddings (no gradients needed for inference)
with torch.no_grad():
    outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state  # (batch_size, time_steps, feature_dim)

# Aggregate embeddings (e.g., mean pooling across time)
embedding = last_hidden_state.mean(dim=1).squeeze()  # shape: (feature_dim,)

print(f"✅ Extracted embedding shape: {embedding.shape}")


RuntimeError: Couldn't find appropriate backend to handle uri data/cleansed/BRM/male/spk01/shortpassage_001.wav and format None.