

## Folder structure

The following folder structure will be used in this project:

VoiceSimilarityAnalysis-code
├── data
│   ├── download/             # Contains downloaded zip files (e.g. ABI-1_Corpus.zip)
│   ├── raw/                   # Unzipped original dataset (14 accent folders)
│   └── cleansed/              # Contains only the filtered "shortpassage" .wav files
│
├── reports/                  # Drafts and final version of the report
│
├── results/                  # Output files: embeddings, similarity scores, matrices, plots
│
└── appendix/                 # Generative AI chat logs for submission


In [3]:
# all imports go here
import os
import zipfile
import gdown
import shutil
import re

## Step 1 - download the dataset

To fully automate the process, the following code downloads the dataset archive into the `data/download` folder and unzips it into the `data/raw` folder.


In [4]:
# 7 minutes on Costa wifi -- execute and have a coffee

# gdrive id
file_id = "18FWBn4B6gQifOtf1C9JCQv4Lrs8C1uvu"

# folder that holds the downloaded archive
# (makedirs with exist_ok=True is already a no-op when the folder exists,
# so no separate existence check is needed)
download_folder = os.path.join("data", "download", "ABI-1_Corpus")
os.makedirs(download_folder, exist_ok=True)

# download the file -- the archive is large (~2.8 GB), so it is fetched only
# when no usable copy is on disk; is_zipfile() is False both for a missing
# file and for a file that is not a ZIP archive
download_path = os.path.join(download_folder, "ABI-1_Corpus.zip")
if zipfile.is_zipfile(download_path):
    print(f"Archive already present, skipping download: {download_path}")
else:
    downloaded = gdown.download(f"https://drive.google.com/uc?id={file_id}", download_path, quiet=False)
    # gdown may signal a failed download by returning None; stop here rather
    # than let the unzip step fail with a less helpful error
    if downloaded is None:
        raise RuntimeError(f"Download failed for Google Drive file id: {file_id}")

# open the zip
raw_folder = os.path.join("data", "raw", "ABI-1_Corpus")
os.makedirs(raw_folder, exist_ok=True)

with zipfile.ZipFile(download_path, 'r') as zip_ref:
    zip_ref.extractall(raw_folder)

print("Raw data downloaded...")


Downloading...
From (original): https://drive.google.com/uc?id=18FWBn4B6gQifOtf1C9JCQv4Lrs8C1uvu
From (redirected): https://drive.google.com/uc?id=18FWBn4B6gQifOtf1C9JCQv4Lrs8C1uvu&confirm=t&uuid=d960d3e7-f87b-4e67-a75c-4db67582f7ff
To: f:\work\masters-ai\ari5121-project\VoiceSimilarityAnalysis-code\data\download\ABI-1_Corpus\ABI-1_Corpus.zip
100%|██████████| 2.82G/2.82G [04:04<00:00, 11.5MB/s]


Raw data downloaded...


## Step 2 - Cleanse the dataset

We will only keep the `shortpassage*.wav` files of each speaker, for every accent and gender in the dataset.

In [5]:
raw_data_folder = "data/raw/ABI-1_Corpus/ABI-1 Corpus/accents"
cleansed_dir = "data/cleansed"

# compiled once, outside the loops; fullmatch() below requires the whole
# filename to match (it must start with "shortpassage" and end with ".wav")
shortpassage_pattern = re.compile(r"shortpassage.*\.wav")


def list_subfolders(parent):
    """Return the names of the non-hidden sub-directories of `parent`.

    Plain files are left out so that the os.listdir() call made on each
    returned entry one level further down never hits a non-directory
    (which would raise NotADirectoryError).
    """
    return [
        name
        for name in os.listdir(parent)
        if not name.startswith(".") and os.path.isdir(os.path.join(parent, name))
    ]


# list accent folders, removing the annoying hidden entries
accents = list_subfolders(raw_data_folder)
# go through all accents
for accent in accents:
    accent_path = os.path.join(raw_data_folder, accent)
    # list all genders in each accent
    genders = list_subfolders(accent_path)
    # go through all genders in each accent
    for gender in genders:
        gender_folder = os.path.join(accent_path, gender)
        # list all speakers in each gender
        speakers = list_subfolders(gender_folder)
        # go through each speaker in each gender
        for speaker in speakers:
            speaker_path = os.path.join(gender_folder, speaker)

            # store resulting data in cleansed/accent/gender/speaker
            dest_path = os.path.join(cleansed_dir, accent, gender, speaker)
            os.makedirs(dest_path, exist_ok=True)

            # go through all files and copy only those named shortpassage*.wav
            for filename in os.listdir(speaker_path):
                if shortpassage_pattern.fullmatch(filename):
                    src_file = os.path.join(speaker_path, filename)
                    dst_file = os.path.join(dest_path, filename)
                    # copy2 also carries over the file metadata (e.g. timestamps)
                    shutil.copy2(src_file, dst_file)

print("Cleansing completed...")


Cleansing completed...


In [6]:
from transformers import Wav2Vec2FeatureExtractor, AutoModel


# Hugging Face repository id of the checkpoint, and the local folder used as its cache
model_name = "microsoft/wavlm-base-plus-sv"
local_dir = "model"

# start from a clean slate: drop any previously cached copy of the model
if os.path.exists(local_dir):
    print(f"Removing existing model folder: {local_dir}")
    shutil.rmtree(local_dir)

# download the model: both loaders share the same cache location, so the
# feature extractor and the model end up under local_dir
cache_options = {"cache_dir": local_dir}
processor = Wav2Vec2FeatureExtractor.from_pretrained(model_name, **cache_options)
model = AutoModel.from_pretrained(model_name, **cache_options)

print("Model downloaded...")


  from .autonotebook import tqdm as notebook_tqdm


Removing existing model folder: model


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Model downloaded...


In [7]:
Just testing the code on huggingface

SyntaxError: invalid syntax (3571725436.py, line 1)

In [17]:
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
import torch
import librosa
import numpy as np


# https://stackoverflow.com/questions/77064579/module-numpy-has-no-attribute-no-nep50-warning
# def dummy_npwarn_decorator_factory():
#   def npwarn_decorator(x):
#     return x
#   return npwarn_decorator
# np._no_nep50_warning = getattr(np, '_no_nep50_warning', dummy_npwarn_decorator_factory)
# https://stackoverflow.com/questions/51912284/how-to-downgrade-numpy

print(np.__version__)

# torchaudio.set_audio_backend("soundfile")

# NOTE: the snapshot hash is hard-coded, so this path only resolves while the
# downloaded snapshot folder under model/ carries exactly this name
local_model_path = os.path.join("model", "models--microsoft--wavlm-base-plus-sv", "snapshots", "feb593a6c23c1cc3d9510425c29b0a14d2b07b1e")


# load the feature extractor and the x-vector model from the local snapshot
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(local_model_path)
model = WavLMForXVector.from_pretrained(local_model_path)


print("--->", str(torchaudio.list_audio_backends()))

# Path to your .wav file
wav_path = os.path.join("data", "cleansed", "brm_001", "male", "ajh001", "shortpassagea_CT.wav")
if not os.path.exists(wav_path):
    raise FileNotFoundError(f"WAV file not found at: {wav_path}")


# Load audio
waveform, original_sr = torchaudio.load(wav_path)

# Resample to 16kHz if needed
target_sr = 16000
if original_sr != target_sr:
    resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=target_sr)
    waveform = resampler(waveform)

# Convert stereo to mono if needed: without this, a multi-channel file would
# reach the feature extractor as a 2D array instead of a single 1D signal
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Convert to 1D numpy array (drop the single remaining channel dimension)
audio_array = waveform.squeeze(0).numpy()

# Extract features
inputs = feature_extractor([audio_array], sampling_rate=target_sr, return_tensors="pt", padding=True)
# inference only: no gradients are needed
with torch.no_grad():
    embeddings = model(**inputs).embeddings
    # scale each embedding to unit length
    embeddings = torch.nn.functional.normalize(embeddings, dim=-1)

print(embeddings.shape)  # Should be [1, 512]
print(embeddings)


2.1.0
---> ['soundfile']




torch.Size([1, 512])
tensor([[-1.7097e-02, -1.5231e-02, -1.8405e-02,  1.5202e-02, -9.5779e-03,
          1.1019e-02,  1.7085e-03, -1.1638e-02, -3.3138e-02, -1.4006e-02,
         -1.7701e-02, -2.1907e-02, -1.9460e-02, -2.0179e-02, -9.9428e-03,
         -2.4083e-02, -1.6532e-02, -1.4741e-02, -1.8193e-02, -9.3101e-02,
         -1.8302e-02, -2.0148e-02, -1.4272e-02, -1.7597e-02, -1.8309e-02,
         -1.4506e-02, -1.6477e-02, -3.1048e-02, -1.4618e-02,  3.1411e-02,
         -1.7857e-02, -1.4265e-02, -1.6262e-02, -2.8616e-02, -1.7755e-02,
         -2.2684e-02,  3.0300e-02, -1.3263e-02, -7.4688e-03, -1.8852e-02,
         -2.5600e-02, -2.3731e-02, -1.6097e-02, -2.1447e-02, -2.4125e-02,
         -2.1679e-02, -1.9913e-02, -1.1094e-02, -2.7766e-02, -1.6725e-02,
         -1.7442e-02, -2.0374e-02, -6.5349e-02, -1.2781e-02, -1.0318e-01,
         -2.0891e-02, -1.7226e-01, -1.2774e-02, -1.8611e-02, -2.0249e-02,
         -1.6760e-02,  7.5732e-03, -5.6147e-02, -1.6838e-02, -1.0475e-01,
         -1.8159e

In [None]:
from transformers import Wav2Vec2FeatureExtractor, WavLMForXVector
from datasets import load_dataset
import torch

dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('microsoft/wavlm-base-plus-sv')
model = WavLMForXVector.from_pretrained('microsoft/wavlm-base-plus-sv')

# audio files are decoded on the fly
samples = dataset[:2]["audio"]
audio = [x["array"] for x in samples]
# pass the sampling rate of the decoded audio explicitly instead of leaving
# the extractor to assume that the audio already has the expected rate
inputs = feature_extractor(audio, sampling_rate=samples[0]["sampling_rate"], padding=True, return_tensors="pt")
# inference only: no gradients are needed for the forward pass
with torch.no_grad():
    embeddings = model(**inputs).embeddings
embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()

# the resulting embeddings can be used for cosine similarity-based retrieval
cosine_sim = torch.nn.CosineSimilarity(dim=-1)
similarity = cosine_sim(embeddings[0], embeddings[1])
threshold = 0.86  # the optimal threshold is dataset-dependent
if similarity < threshold:
    print("Speakers are not the same!")


In [None]:
import torch
import torchaudio
from transformers import Wav2Vec2FeatureExtractor, AutoModel

# Checkpoint used for both the feature extractor and the model
model_name = "microsoft/wavlm-base-plus-sv"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()  # switch to inference mode

# Read the audio file from disk
file_path = "data/cleansed/BRM/male/spk01/shortpassage_001.wav"  # <- Change this as needed
waveform, sample_rate = torchaudio.load(file_path)

# Bring the audio to 16kHz when it was recorded at another rate
target_rate = 16000
if sample_rate != target_rate:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_rate)
    waveform = resampler(waveform)

# Average the channels when there is more than one (stereo -> mono)
channel_count = waveform.shape[0]
if channel_count > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Turn the waveform into model inputs
mono_samples = waveform.squeeze().numpy()
inputs = feature_extractor(mono_samples, sampling_rate=target_rate, return_tensors="pt")

# Run the model without tracking gradients
with torch.no_grad():
    outputs = model(**inputs)
    last_hidden_state = outputs.last_hidden_state  # (batch_size, time_steps, feature_dim)

# Mean-pool over the time axis to get a single vector
pooled = last_hidden_state.mean(dim=1)
embedding = pooled.squeeze()  # shape: (feature_dim,)

print(f"Extracted embedding shape: {embedding.shape}")
