# Speaker Diarization

In [2]:
from pathlib import Path
import os, json, torch
from pyannote.audio import Pipeline
import torch

# --- Configuration ---------------------------------------------------------
AUDIO = Path("AUG_17_CLIF.wav")   # FILE GOES HERE
OUTDIR = Path("out")

# Validate the input before the (slow) model load so a bad path fails fast.
# An explicit raise is used instead of `assert`, which `python -O` strips.
if not AUDIO.is_file():
    raise FileNotFoundError(f"Audio not found: {AUDIO}")
OUTDIR.mkdir(parents=True, exist_ok=True)

# Point Hugging Face at a local cache and forbid network access / telemetry.
# NOTE(review): these variables are assigned after `pyannote.audio` has been
# imported at the top of the file. huggingface_hub appears to read HF_HOME and
# HF_HUB_OFFLINE at import time, so in a fresh kernel they may not take
# effect -- confirm, and if so move these assignments above the imports.
os.environ["HF_HOME"] = "/home/jovyan/Oracle_local/pyannote_hf_cache"
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# Opt in to TF32 kernels for matmul and cuDNN.
# NOTE(review): pyannote seems to switch TF32 back off when the pipeline runs
# and emits a warning about it (pyannote-audio issue #1370) -- confirm whether
# these two lines have any effect on the actual inference.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# --- Model -----------------------------------------------------------------
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
if pipeline is None:
    # from_pretrained can return None instead of raising when the model
    # cannot be obtained; fail here with a clear message rather than with an
    # AttributeError on the `.to(device)` call below.
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'; "
        "check the local Hugging Face cache and offline settings."
    )

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
pipeline = pipeline.to(device)

if device.type == "cuda":
    device_label = "GPU: " + torch.cuda.get_device_name(device)
else:
    device_label = "CPU"
print("Device:", device_label)

# Run the diarization pipeline on the audio file.
# The plain path string is tried first; if the pipeline raises TypeError for
# that call, the same path is passed again wrapped in an {"audio": ...} mapping.
audio_path = str(AUDIO)
try:
    diar = pipeline(audio_path)
except TypeError:
    diar = pipeline({"audio": audio_path})

# Largest gap (in the units of segment.start/end) between two consecutive
# turns of the same speaker that is still bridged into one merged segment.
MERGE_GAP_S = 0.25


def _collect_segments(annotation):
    """Flatten diarization tracks into plain dicts with renamed speakers.

    Iterates ``annotation.itertracks(yield_label=True)`` and, for each track,
    records ``start``/``end`` as floats plus a ``speaker`` name of the form
    ``Speaker_<n>``, where ``n`` is the order in which the label was first
    encountered.

    Returns:
        A ``(segments, spk_map)`` tuple: the list of segment dicts and the
        mapping from original label string to ``Speaker_<n>`` name.
    """
    spk_map = {}
    segments = []
    for segment, _, label in annotation.itertracks(yield_label=True):
        # The default is built before insertion, so len(spk_map) is the
        # index of the next unseen speaker.
        name = spk_map.setdefault(str(label), f"Speaker_{len(spk_map)}")
        segments.append(
            {
                "start": float(segment.start),
                "end": float(segment.end),
                "speaker": name,
            }
        )
    return segments, spk_map


def _merge_adjacent(segments, max_gap=MERGE_GAP_S):
    """Join consecutive same-speaker segments separated by at most *max_gap*.

    *segments* must already be ordered by ``start``. A segment is folded into
    the previous merged entry only when both have the same speaker and the
    gap between them does not exceed *max_gap*; otherwise it starts a new
    entry. The input dicts are copied, so *segments* is left unmodified.

    Returns:
        A new list of merged segment dicts.
    """
    merged = []
    for seg in segments:
        prev = merged[-1] if merged else None
        if (
            prev is not None
            and prev["speaker"] == seg["speaker"]
            and seg["start"] - prev["end"] <= max_gap
        ):
            # Extend rather than overwrite: the new segment may end earlier
            # than the one it is merged into.
            prev["end"] = max(prev["end"], seg["end"])
        else:
            # Copy so later extensions do not alter the raw segment list.
            merged.append(dict(seg))
    return merged


segments, spk_map = _collect_segments(diar)
segments.sort(key=lambda s: s["start"])
merged = _merge_adjacent(segments)

# Serialise the merged segments to <OUTDIR>/segments.json as indented JSON.
out_path = OUTDIR.joinpath("segments.json")
payload = json.dumps(merged, indent=2)
out_path.write_text(payload, encoding="utf-8")

# Summary: distinct speaker names (sorted as strings), output location, count.
speakers = sorted({entry["speaker"] for entry in merged})
print("Speakers inferred:", speakers)
print("Segments written to:", out_path.resolve())
print("Total segments:", len(merged))

Device: GPU: NVIDIA RTX 6000 Ada Generation


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)


Speakers inferred: ['Speaker_0', 'Speaker_1', 'Speaker_2', 'Speaker_3', 'Speaker_4', 'Speaker_5', 'Speaker_6', 'Speaker_7', 'Speaker_8', 'Speaker_9']
Segments written to: /home/jovyan/Oracle_local/out/segments.json
Total segments: 360
