# Evaluating Model Performance
This notebook evaluates the performance and fairness of a speaker verification model trained on LibriSpeech. It includes DET curve plotting, EER calculation, confusion matrix summaries, and group-based bias evaluation based on gender and speaker representation (minutes spoken).


In [6]:
import sys
!{sys.executable} -m pip install speechbrain torchaudio soundfile librosa tqdm

Collecting speechbrain
  Using cached speechbrain-1.0.3-py3-none-any.whl.metadata (24 kB)
Collecting torchaudio
  Using cached torchaudio-2.8.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (7.2 kB)
Collecting tqdm
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting hyperpyyaml (from speechbrain)
  Using cached HyperPyYAML-1.2.2-py3-none-any.whl.metadata (7.6 kB)
Collecting sentencepiece (from speechbrain)
  Using cached sentencepiece-0.2.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (10 kB)
Collecting torch>=1.9 (from speechbrain)
  Using cached torch-2.8.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting huggingface_hub (from speechbrain)
  Using cached huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from torch>=1.9->speechbrain)
  Using cached filelock-3.19.1-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch>=1.9->speechbrain)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting netwo

## Step 1: Load Score File
We begin by loading the `librispeech_sv_scores.csv` file, which contains similarity scores and metadata for speaker pairs.

In [42]:
import os
from pathlib import Path
import csv
import torch
import torch.nn.functional as F
import soundfile as sf
from tqdm.auto import tqdm
from speechbrain.inference import SpeakerRecognition


In [73]:
LIBRI_ROOT = Path("./data/train-clean-100/")
OUT_CSV = Path("./result/librispeech_sv_scores.csv")
EMB_CACHE = Path(".result/librispeech_embeddings.pt")

# Reproducibility
random.seed(42)
torch.manual_seed(42)

# Ensure output dirs exist
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
EMB_CACHE.parent.mkdir(parents=True, exist_ok=True)

In [74]:
from pathlib import Path
import random
from itertools import combinations

def list_flacs(root: Path):
    # Resolve both root and files to avoid relative_to issues
    root = root.resolve()
    return sorted([p.resolve() for p in root.glob("**/*.flac")])

def speaker_id_from_path(p: Path, root: Path):
    """
    Robust speaker parsing:
    1) Try folder under root
    2) Fallback to filename prefix: <spk>-<chapter>-<utt>.flac
    """
    p = p.resolve()
    root = root.resolve()
    try:
        return p.relative_to(root).parts[0]
    except Exception:
        spk = p.stem.split("-")[0]
        return spk

flacs = list_flacs(LIBRI_ROOT)
len(flacs), flacs[:3]

(28539,
 [PosixPath('/Users/abc/Documents/study/dga/data/train-clean-100/103/1240/103-1240-0000.flac'),
  PosixPath('/Users/abc/Documents/study/dga/data/train-clean-100/103/1240/103-1240-0001.flac'),
  PosixPath('/Users/abc/Documents/study/dga/data/train-clean-100/103/1240/103-1240-0002.flac')])

## Step 2: Training

In [76]:
!{sys.executable} -m pip install torch torchvision

import torch
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
    # output expected:
    # tensor([1.], device='mps:0')
else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [54]:
import torch

device = "mps" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

Using device: cpu


In [56]:
verifier = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": device}
)
verifier

SpeakerRecognition(
  (mods): ModuleDict(
    (compute_features): Fbank(
      (compute_STFT): STFT()
      (compute_fbanks): Filterbank()
      (compute_deltas): Deltas()
      (context_window): ContextWindow()
    )
    (mean_var_norm): InputNormalization()
    (embedding_model): ECAPA_TDNN(
      (blocks): ModuleList(
        (0): TDNNBlock(
          (conv): Conv1d(
            (conv): Conv1d(80, 1024, kernel_size=(5,), stride=(1,))
          )
          (activation): ReLU()
          (norm): BatchNorm1d(
            (norm): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          )
          (dropout): Dropout1d(p=0.0, inplace=False)
        )
        (1): SERes2NetBlock(
          (tdnn1): TDNNBlock(
            (conv): Conv1d(
              (conv): Conv1d(1024, 1024, kernel_size=(1,), stride=(1,))
            )
            (activation): ReLU()
            (norm): BatchNorm1d(
              (norm): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affi

In [61]:
import soundfile as sf

def load_audio_mono_16k(path: str):
    sig, sr = sf.read(path, dtype="float32")
    if sig.ndim > 1:
        sig = sig.mean(axis=1)
    wav = torch.from_numpy(sig).unsqueeze(0)       # [1, T]
    wav_lens = torch.tensor([1.0])                 # full length fraction
    return wav, wav_lens

@torch.no_grad()
def embed_file(path: str):
    wav, wav_lens = load_audio_mono_16k(path)
    wav = wav.to(device)
    wav_lens = wav_lens.to(device)
    emb = verifier.encode_batch(wav, wav_lens=wav_lens)  # correct arg name
    return emb.squeeze().cpu()

In [65]:
def build_trials(flac_paths, root, max_pos_per_spk=20, max_neg_per_spk=40):
    from itertools import combinations
    import random

    by_spk = {}
    for p in flac_paths:
        spk = speaker_id_from_path(p, root)
        by_spk.setdefault(spk, []).append(p)

    for spk in by_spk:
        by_spk[spk] = sorted(by_spk[spk])

    trials = []
    speakers = [s for s in by_spk if len(by_spk[s]) > 0]

    for spk in speakers:
        utts = by_spk[spk]

        if len(utts) >= 2:
            pos_pairs = list(combinations(utts, 2))
            random.shuffle(pos_pairs)
            pos_pairs = pos_pairs[:max_pos_per_spk]
            for a, b in pos_pairs:
                trials.append((a, b, 1))

        other_spks = [s for s in speakers if s != spk and len(by_spk[s]) > 0]
        random.shuffle(other_spks)
        pool_a = [random.choice(utts)] if len(utts) == 1 else random.sample(utts, k=min(5, len(utts)))
        neg_pairs = []
        for o in other_spks:
            a = random.choice(pool_a)
            b = random.choice(by_spk[o])
            neg_pairs.append((a, b, 0))
            if len(neg_pairs) >= max_neg_per_spk:
                break
        trials.extend(neg_pairs)

    uniq = {}
    for a, b, y in trials:
        key = tuple(sorted([str(a), str(b)]))
        uniq[key] = (str(a), str(b), y)

    trials = list(uniq.values())
    random.shuffle(trials)
    return trials

def build_embedding_cache(trials, cache_path: Path):
    cache = torch.load(cache_path) if cache_path.exists() else {}
    uniq_files = sorted({str(a) for a,_,_ in trials} | {str(b) for _,b,_ in trials})
    for path in tqdm(uniq_files, desc="Embedding"):
        if path not in cache:
            try:
                cache[path] = embed_file(path)
            except Exception as e:
                print(f"Failed on {path}: {e}")
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    torch.save(cache, cache_path)
    return cache

In [72]:
def cosine_sim(ea, eb):
    return F.cosine_similarity(ea.unsqueeze(0), eb.unsqueeze(0)).item()

def fix_path(str1):
    file_name = str1.split("/")[-1].split("-")[0]
    return str(file_name)

def score_trials(trials, emb_cache):
    rows = []
    for a, b, y in tqdm(trials, desc="Scoring"):
        ea = emb_cache.get(str(a)); eb = emb_cache.get(str(b))
        if ea is None or eb is None:
            continue
        s = cosine_sim(ea, eb)
        rows.append((str(a), str(b), s, y, fix_path(str(a)), fix_path(str(b))))
    return rows

emb_cache = build_embedding_cache(trials, EMB_CACHE)
rows = score_trials(trials, emb_cache)

with OUT_CSV.open("w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["utt1", "utt2", "score", "label", "clean_utt1", "clean_utt2"])
    writer.writerows(rows)


Embedding: 100%|█████████████████████| 15040/15040 [00:00<00:00, 7460067.66it/s]
Scoring: 100%|████████████████████████| 15060/15060 [00:00<00:00, 130853.15it/s]
