<a href="https://colab.research.google.com/github/ananya1331/SER-notebooks/blob/main/ECAPA_TDNN_Demo_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# --- clean slate ---
!pip uninstall -y torchaudio torch torchvision torchtext -q

# --- stable torch + audio ---
!pip install -q torch==2.1.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118

# --- audio + ML stack ---
!pip install -q librosa soundfile numpy scipy scikit-learn pandas tqdm

# --- speaker embeddings ---
!pip install -q speechbrain==0.5.16

[0m[31mERROR: Could not find a version that satisfies the requirement torch==2.1.2 (from versions: 2.2.0+cu118, 2.2.1+cu118, 2.2.2+cu118, 2.3.0+cu118, 2.3.1+cu118, 2.4.0+cu118, 2.4.1+cu118, 2.5.0+cu118, 2.5.1+cu118, 2.6.0+cu118, 2.7.0+cu118, 2.7.1+cu118)[0m[31m
[0m[31mERROR: No matching distribution found for torch==2.1.2[0m[31m
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
fastai 2.8.6 requires torchvision>=0.11, which is not installed.
timm 1.0.24 requires torchvision, which is not installed.[0m[31m
[0m

In [7]:
import json
import librosa
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm import tqdm

import torch
from speechbrain.pretrained import EncoderClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize



In [8]:
# Use the GPU when torch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained "spkrec-ecapa-voxceleb" speaker encoder and place it on
# the selected device.
classifier = EncoderClassifier.from_hparams(
    run_opts={"device": device},
    source="speechbrain/spkrec-ecapa-voxceleb",
)

hyperparams.yaml: 0.00B [00:00, ?B/s]

embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt: 0.00B [00:00, ?B/s]

In [10]:
# Input recording and the JSON transcript that describes its segments.
AUDIO_PATH = "/content/test2.wav"
TRANSCRIPT_JSON = "/content/test2.json"

# Decode the recording at a 16 kHz sample rate; `sr` is reused by later cells
# to convert segment times (seconds) into sample indices.
audio, sr = librosa.load(path=AUDIO_PATH, sr=16000)

In [11]:
# Load the transcript segments; later cells read the "start", "end" and
# "raw_transcript" keys of each entry. JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's default encoding.
with open(TRANSCRIPT_JSON, "r", encoding="utf-8") as f:
    segments = json.load(f)

Extract speaker embeddings per segment

In [12]:

def extract_embedding(start, end, min_duration=0.5):
    """Return the speaker embedding for one time span of the loaded recording.

    Reads the notebook-level ``audio`` samples and sample rate ``sr``, and runs
    the notebook-level ``classifier`` on ``device``.

    Args:
        start: Span start in seconds. Values below 0 are clamped to 0 so they
            cannot wrap around to the end of the recording through negative
            slice indexing.
        end: Span end in seconds. An end before the start yields an empty span.
        min_duration: Minimum span length in seconds; shorter spans are
            skipped. Defaults to 0.5.

    Returns:
        The output of ``classifier.encode_batch`` as a NumPy array with
        singleton dimensions squeezed out, or ``None`` when the span is
        shorter than ``min_duration``.
    """
    start_sample = max(int(start * sr), 0)
    end_sample = max(int(end * sr), start_sample)
    segment_audio = audio[start_sample:end_sample]

    if len(segment_audio) < sr * min_duration:
        return None  # ignore ultra-short junk

    # encode_batch is given a batch dimension: (1, num_samples).
    wav = torch.tensor(segment_audio).unsqueeze(0).to(device)

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        emb = classifier.encode_batch(wav)

    return emb.squeeze().cpu().numpy()

In [18]:
from tqdm.auto import tqdm

# Embed every transcript segment. Segments for which extract_embedding returns
# None are dropped, and the two lists stay index-aligned.
embeddings, valid_segments = [], []

for segment in tqdm(segments, desc="Extracting embeddings"):
    vector = extract_embedding(segment["start"], segment["end"])
    if vector is None:
        continue
    embeddings.append(vector)
    valid_segments.append(segment)

print(f"Total segments: {len(segments)}")
print(f"Valid segments used: {len(valid_segments)}")

Extracting embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

Total segments: 3
Valid segments used: 3


Cluster into 3 Speakers

In [19]:
# Stack the per-segment embeddings into one row per segment and scale each row
# to unit L2 norm.
X = normalize(np.vstack(embeddings), norm="l2", axis=1)

# Average-linkage agglomerative clustering on cosine distance, fixed at three
# clusters (one per expected speaker).
clustering = AgglomerativeClustering(linkage="average", metric="cosine", n_clusters=3)

# One integer cluster label per row of X, in the same order as valid_segments.
labels = clustering.fit_predict(X)

In [20]:
# Sanity check: size of the embedding matrix and which cluster ids were used.
for caption, value in (
    ("Embedding matrix shape:", X.shape),
    ("Unique speaker labels:", np.unique(labels)),
):
    print(caption, value)

Embedding matrix shape: (3, 192)
Unique speaker labels: [0 1 2]


In [21]:
# Write each cluster label back onto its segment dict as "speaker_<label>".
# This mutates the segment dicts in place.
for cluster_id, segment in zip(labels, valid_segments):
    segment["speaker_id"] = f"speaker_{cluster_id}"

Diarization Results (Preview)

In [22]:
# Show up to the first five diarized segments: time span, speaker id, and the
# first 80 characters of the transcript.
for segment in valid_segments[:5]:
    time_span = f"[{segment['start']:.2f}–{segment['end']:.2f}] "
    speaker = segment['speaker_id']
    snippet = segment['raw_transcript'][:80]
    print(f"{time_span}{speaker} :: {snippet}")

[0.13–7.86] speaker_2 :: Yeah, I guess. I just thought Aristotle thought flames kept going up and up and 
[8.49–46.97] speaker_1 :: No, Aristotle thought the flames would stop at some point at their natural place
[46.00–55.59] speaker_0 :: yes, Susie. Sorry, I... I'm confused again. I thought nobody knew the earth was 


Verify clustering by playing one segment per speaker

In [25]:
# Play the first segment assigned to each speaker id.
# NOTE(review): play_segment is not defined in any cell visible in this
# notebook; confirm it is defined before this cell runs on a fresh kernel.
for spk in ["speaker_0", "speaker_1", "speaker_2"]:
    print(f"\nPlaying {spk}")
    first_match = next(
        (seg for seg in valid_segments if seg["speaker_id"] == spk),
        None,
    )
    if first_match is not None:
        play_segment(first_match)


Playing speaker_0



Playing speaker_1



Playing speaker_2


In [16]:
# Persist the diarized segments (including their speaker ids) as indented JSON.
Path("diarized_output.json").write_text(json.dumps(valid_segments, indent=2))

In [17]:
# Same segments as a flat table: one row per segment, one column per key.
df = pd.DataFrame(data=valid_segments)
df.to_csv(path_or_buf="diarized_output.csv", index=False)

Compute pitch per segment

In [26]:
def estimate_pitch(segment):
    """Return the median fundamental frequency of ``segment`` in Hz.

    Runs ``librosa.pyin`` with a 50-300 Hz search range at the notebook-level
    sample rate ``sr`` and takes the median over the frames whose f0 estimate
    is not NaN.

    Returns:
        The median f0, or ``None`` when every frame's estimate is NaN.
    """
    pitch_track, _voiced, _probs = librosa.pyin(segment, sr=sr, fmin=50, fmax=300)
    # Keep only the frames that carry an f0 estimate.
    voiced_f0 = pitch_track[np.logical_not(np.isnan(pitch_track))]
    return np.median(voiced_f0) if voiced_f0.size else None

Attach pitch + gender label

In [27]:
# Annotate every valid segment with its median pitch and a coarse label derived
# from a fixed 165 Hz cut-off. This mutates the segment dicts in place.
for seg in valid_segments:
    first_sample, last_sample = int(seg["start"] * sr), int(seg["end"] * sr)
    pitch = estimate_pitch(audio[first_sample:last_sample])
    seg["median_pitch_hz"] = pitch

    # No pitch estimate at all -> "unknown"; otherwise split at 165 Hz.
    if pitch is None:
        label = "unknown"
    elif pitch < 165:
        label = "male_sounding"
    else:
        label = "female_sounding"
    seg["gender_estimate"] = label

Aggregate per speaker

In [28]:
from collections import defaultdict

# Collect the per-segment median pitches under each speaker id, skipping
# segments that have no pitch estimate.
speaker_pitch = defaultdict(list)

for seg in valid_segments:
    pitch_hz = seg["median_pitch_hz"]
    if pitch_hz is None:
        continue
    speaker_pitch[seg["speaker_id"]].append(pitch_hz)

# Report one median per speaker, labelled with the same 165 Hz cut-off.
for spk, pitches in speaker_pitch.items():
    median_pitch = np.median(pitches)
    if median_pitch < 165:
        gender = "male_sounding"
    else:
        gender = "female_sounding"
    print(f"{spk}: {median_pitch:.1f} Hz → {gender}")

speaker_2: 139.0 Hz → male_sounding
speaker_1: 119.6 Hz → male_sounding
speaker_0: 191.0 Hz → female_sounding


Group embeddings by speaker

In [29]:
from collections import defaultdict

# Bucket each segment embedding under the speaker id assigned to its segment.
# `embeddings` and `valid_segments` are index-aligned.
speaker_embeddings = defaultdict(list)

for emb, seg in zip(embeddings, valid_segments):
    speaker_embeddings[seg["speaker_id"]].append(emb)

Average the embeddings of each speaker

In [30]:
# One vector per speaker: the element-wise mean of that speaker's segment
# embeddings.
speaker_vectors = {
    spk: np.vstack(embs).mean(axis=0)
    for spk, embs in speaker_embeddings.items()
}

In [31]:
# Confirm that every speaker has a single vector of the expected shape.
for spk in speaker_vectors:
    print(spk, speaker_vectors[spk].shape)

speaker_2 (192,)
speaker_1 (192,)
speaker_0 (192,)


192-dimensional speaker embedding vectors

In [32]:
# Print the vectors with 4 decimals and no scientific notation.
np.set_printoptions(suppress=True, precision=4)

for spk in speaker_vectors:
    vec = speaker_vectors[spk]
    print(f"\n{spk} embedding:\n{vec}")


speaker_2 embedding:
[ 33.9873   4.525   21.9577 -21.5127  29.5772 -15.2986   4.4707  30.4073
   0.0433 -36.1201  -4.3328  66.9687   7.5181 -15.915    9.3145 -35.3913
  24.2174 -16.031   14.9153 -23.0442  -6.8946 -17.6593 -38.2619  -6.9787
  12.2113   7.9942 -17.8777  20.4196  14.3117 -30.5637   8.5727  52.9676
  10.376   -3.0108 -13.7574  -5.558    4.1729 -33.837  -15.3958   2.6227
  31.1654 -14.3594  -6.7551   4.982   21.0068  -7.8555   5.7667   5.8389
  19.8501  -4.716  -17.8624  28.016  -55.6317   9.6481  12.277   -7.562
   6.4378  -1.6892  14.7244  26.122  -18.5488   1.9895  -1.1096  11.2523
  20.6427 -30.8775  -7.8844 -42.9356 -10.7245 -44.0734 -38.335   11.7652
 -12.0904 -15.8043   3.3664  -8.6727  29.055  -23.8725  39.1929  -2.1185
  39.3509 -13.8929  -2.2194 -30.4517  35.7955  -0.4179  17.0247 -21.2594
 -24.1312   5.8625  -8.1723  36.1565  15.0523  -8.7317  -5.6739  21.5691
  40.9821  -4.7373 -57.3025   8.3952  -6.5127  65.4797   4.153   -9.6378
   6.6713 -17.9085 -24.0966  2

In [33]:
# One row per speaker (speaker id as the index), one column per embedding
# dimension, written to CSV with the index included.
df = pd.DataFrame.from_dict(data=speaker_vectors, orient="index")
df.to_csv(path_or_buf="speaker_embeddings.csv")