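# [REDACTED] A Hugging Face access token was pasted here in plain text. Revoke the exposed token and supply a new one through the HUGGINGFACE_TOKEN environment variable instead.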

In [None]:
# ============================================================
# WORKING COLAB: YouTube -> robust downloader -> WhisperX + Pyannote diarization
# Single cell (paste & run in Colab; set Runtime -> GPU)
# ============================================================

# ----------------- Installs -----------------
# Lines starting with "!" are notebook shell magics, not Python statements.
# Upgrade pip (optional), then install the latest yt-dlp, WhisperX (from its
# GitHub repository), pyannote.audio, ffmpeg-python and faster_whisper.
!pip install -q --upgrade pip
!pip install -q -U yt-dlp
!pip install -q git+https://github.com/m-bain/whisperX.git
!pip install -q "pyannote.audio>=2.1" ffmpeg-python faster_whisper

# Make sure the ffmpeg binary is available: the conversion and segmentation
# steps below invoke `ffmpeg` as an external command.
!apt-get update -qq && apt-get install -y -qq ffmpeg

# ----------------- Imports & config -----------------
import os, shlex, subprocess, json, csv, math
from getpass import getpass
import torch

# -------------- USER CONFIG --------------
# Video to transcribe (edit as needed).
YOUTUBE_URL = "https://www.youtube.com/watch?v=LJiUDxj-2ZE"
# Whisper checkpoint name; "tiny" is the recommended size for Colab.
WHISPER_MODEL = "tiny"
# Every download, intermediate file and result is written under this folder.
OUT_DIR = "content"
os.makedirs(OUT_DIR, exist_ok=True)
# Run on the GPU when the runtime exposes one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
# -----------------------------------------

# -------------- Hugging Face token --------------
# Reuse a token already exported in the environment; otherwise prompt for it
# without echo and store it in the environment for the rest of the session.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
if HUGGINGFACE_TOKEN is None:
    print("Paste your Hugging Face token (it will be hidden):")
    HUGGINGFACE_TOKEN = getpass("HUGGINGFACE_TOKEN: ")
    os.environ["HUGGINGFACE_TOKEN"] = HUGGINGFACE_TOKEN

# ----------------- robust downloader -----------------
# Working files, all kept inside OUT_DIR:
#   video.mp4     - container fetched by the fallback download path
#   audio_raw.wav - WAV written by direct extraction
#   audio.wav     - final 16k mono file used by the later steps
video_path, wav_path_raw, audio_16k = (
    os.path.join(OUT_DIR, leaf)
    for leaf in ("video.mp4", "audio_raw.wav", "audio.wav")
)

print("üîΩ Updating yt-dlp and attempting to download audio...")

def run(cmd):
    """Run an external command (no shell) and return the completed process.

    Args:
        cmd: A shell-style command string, split with ``shlex.split`` so that
            quoted arguments stay intact, or an already-split sequence of
            arguments.

    Returns:
        The ``subprocess.CompletedProcess``, with stdout/stderr captured as text.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero. The
            captured output is printed and also attached to the exception so
            callers can inspect ``.output`` / ``.stderr``.
    """
    print(">", cmd)
    argv = shlex.split(cmd) if isinstance(cmd, str) else list(cmd)
    proc = subprocess.run(argv, capture_output=True, text=True)
    if proc.returncode != 0:
        print("Command failed with return code", proc.returncode)
        print("STDOUT:", proc.stdout)
        print("STDERR:", proc.stderr)
        raise subprocess.CalledProcessError(
            proc.returncode, cmd, output=proc.stdout, stderr=proc.stderr
        )
    return proc

# Attempt direct audio extraction to WAV (preferred). If yt-dlp cannot do that,
# download the best audio stream as-is and let ffmpeg convert it. Either way
# the result is re-encoded to 16 kHz mono 16-bit PCM at `audio_16k`.
try:
    # try extracting & re-encoding to wav in one step
    cmd = f"yt-dlp -f bestaudio -x --audio-format wav -o '{wav_path_raw}' '{YOUTUBE_URL}'"
    run(cmd)
except subprocess.CalledProcessError:
    direct_extraction_ok = False
    print("‚ö†Ô∏è Direct extraction failed ‚Äî falling back to download then ffmpeg convert.")
else:
    direct_extraction_ok = True
    print("‚úÖ Direct audio extraction succeeded:", wav_path_raw)

if direct_extraction_ok:
    # Direct extraction produced wav_path_raw; convert it to 16k mono pcm_s16le for consistency.
    try:
        ffmpeg_cmd = f"ffmpeg -y -i '{wav_path_raw}' -ar 16000 -ac 1 -vn -acodec pcm_s16le '{audio_16k}'"
        run(ffmpeg_cmd)
    except subprocess.CalledProcessError as e:
        # Chain the underlying failure, as the fallback branch below does.
        raise RuntimeError("Failed to re-encode extracted audio to 16k mono.") from e
    print("‚úÖ Re-encoded extracted wav to 16k mono:", audio_16k)
else:
    try:
        # download bestaudio into a container (mp4 or webm)
        cmd = f"yt-dlp -f bestaudio -o '{video_path}' '{YOUTUBE_URL}'"
        run(cmd)
        print("‚úÖ Video/audio downloaded to:", video_path)
        # convert to 16k mono WAV PCM
        ffmpeg_cmd = f"ffmpeg -y -i '{video_path}' -ar 16000 -ac 1 -vn -acodec pcm_s16le '{audio_16k}'"
        run(ffmpeg_cmd)
    except subprocess.CalledProcessError as e:
        raise RuntimeError("Failed to download or convert the YouTube video. See error above.") from e
    print("‚úÖ Converted to 16k mono WAV:", audio_16k)

# ----------------- split long audio into segments (10 min) -----------------
print("‚úÇÔ∏è Splitting audio into 10-minute segments to avoid memory pressure...")
seg_dir = os.path.join(OUT_DIR, "segments")
os.makedirs(seg_dir, exist_ok=True)
# Remove segments left over from an earlier run (longer video or a different
# SEGMENT_TIME); otherwise they would be picked up below and transcribed as
# if they belonged to this audio.
for stale in os.listdir(seg_dir):
    if stale.startswith("segment_"):
        os.remove(os.path.join(seg_dir, stale))
# segment_time seconds (600 = 10 minutes). Reduce if necessary.
SEGMENT_TIME = 600
# Quoted like the other paths so the pattern stays one argument after shlex.split.
segment_pattern = os.path.join(seg_dir, "segment_%03d.wav")
split_cmd = f"ffmpeg -y -hide_banner -loglevel error -i '{audio_16k}' -f segment -segment_time {SEGMENT_TIME} -c copy '{segment_pattern}'"
# If ffmpeg can't copy segments for pcm, we fall back to re-encoding each segment
try:
    run(split_cmd)
except subprocess.CalledProcessError:
    # fallback: create segments by re-encoding
    print("‚ö†Ô∏è fallback segmentation (re-encoding per segment)")
    run(f"ffmpeg -y -i '{audio_16k}' -f segment -segment_time {SEGMENT_TIME} -ar 16000 -ac 1 '{segment_pattern}'")
# Zero-padded names sort in playback order.
segments = sorted(os.path.join(seg_dir, f) for f in os.listdir(seg_dir) if f.startswith("segment_"))
if not segments:
    # if segmentation produced nothing, just use the full file
    segments = [audio_16k]
print(f"üîé {len(segments)} segment(s) to process.")

# ----------------- WhisperX transcription (per-segment) -----------------
import whisperx
import torch
print(f"üéß Loading WhisperX model '{WHISPER_MODEL}' on {DEVICE} ...")
model = whisperx.load_model(WHISPER_MODEL, device=DEVICE)   # tiny recommended
all_segments = []
detected_language = None

# whisperx.load_audio() decodes to 16 kHz mono, so len(audio) / 16000 is the
# chunk duration in seconds.
WHISPERX_SAMPLE_RATE = 16000
# Alignment models are cached per language instead of being reloaded for every chunk.
align_models = {}
# Start of the current chunk on the full recording's timeline, in seconds.
segment_offset = 0.0


def _shift_times(item, offset):
    """Add `offset` seconds to the "start"/"end" entries of `item`, when present."""
    for key in ("start", "end"):
        if item.get(key) is not None:
            item[key] = float(item[key]) + offset


for idx, segfile in enumerate(segments):
    print(f"\n--- Transcribing segment {idx+1}/{len(segments)}: {segfile} ---")
    # Decode once and reuse the samples for transcription, alignment and the
    # duration bookkeeping.
    audio = whisperx.load_audio(segfile)
    segment_duration = len(audio) / WHISPERX_SAMPLE_RATE
    res = model.transcribe(audio, batch_size=16)
    language = res.get("language")
    if detected_language is None:
        detected_language = language
    if not res["segments"]:
        # Nothing was transcribed for this chunk: skip alignment but keep the timeline moving.
        segment_offset += segment_duration
        continue
    # align this segment's segments to word-level
    print("‚è±Ô∏è Running alignment for word-level timestamps...")
    if language not in align_models:
        align_models[language] = whisperx.load_align_model(language_code=language, device=DEVICE)
    model_a, metadata = align_models[language]
    aligned = whisperx.align(res["segments"], model_a, metadata, audio, device=DEVICE, return_char_alignments=False)
    # WhisperX timestamps are relative to the start of each chunk; shift them to
    # absolute time so they can be compared with diarization of the full recording.
    for seg in aligned["segments"]:
        _shift_times(seg, segment_offset)
        for w in seg.get("words", []):
            _shift_times(w, segment_offset)
    all_segments.extend(aligned["segments"])
    segment_offset += segment_duration

print("\n‚úÖ Completed WhisperX transcription + alignment for all segments.")
# Save intermediate whisperx JSON
whisper_json = os.path.join(OUT_DIR, "whisperx_transcript.json")
with open(whisper_json, "w", encoding="utf-8") as f:
    json.dump({"language": detected_language, "segments": all_segments}, f, ensure_ascii=False, indent=2)
print("Saved WhisperX aligned segments:", whisper_json)

# ----------------- Pyannote diarization -----------------
print("\nüë• Loading pyannote speaker-diarization pipeline...")
import wave
from pyannote.audio import Pipeline
try:
    # Pipeline.from_pretrained() takes no `device` argument. Passing one raised
    # a TypeError that was then reported as a token problem.
    pipeline = Pipeline.from_pretrained(
        "pyannote/speaker-diarization",
        use_auth_token=HUGGINGFACE_TOKEN,
    )
except Exception as e:
    raise RuntimeError("Failed to load pyannote pipeline. Ensure your Hugging Face token is correct and has access.") from e
if pipeline is None:
    # Some pyannote.audio releases return None instead of raising when the
    # gated model cannot be downloaded.
    raise RuntimeError("Failed to load pyannote pipeline. Ensure your Hugging Face token is correct and has access.")
if hasattr(pipeline, "to"):
    # Force CPU for diarization on releases that support moving the pipeline.
    pipeline.to(torch.device("cpu"))


def _wav_duration(path, default):
    """Return the length of a PCM WAV file in seconds, or `default` if it cannot be read."""
    try:
        with wave.open(path, "rb") as wav_file:
            return wav_file.getnframes() / float(wav_file.getframerate())
    except (wave.Error, EOFError, OSError, ZeroDivisionError):
        return float(default)


print("üïí Running diarization on full audio (this can take a bit)...")
try:
    diarization = pipeline(audio_16k)
except Exception as e:
    # If diarization fails (e.g. due to memory), diarize per segment and stitch (best-effort)
    print("‚ö†Ô∏è Diarization on full audio failed, attempting per-segment diarization as fallback:", e)
    from pyannote.core import Annotation, Segment as PSegment
    combined = Annotation()
    # Start of the current chunk on the full recording's timeline, in seconds.
    chunk_offset = 0.0
    for i, segfile in enumerate(segments):
        print(f"  diarizing segment {i+1}/{len(segments)}: {segfile}")
        chunk_duration = _wav_duration(segfile, SEGMENT_TIME)
        try:
            ann = pipeline(segfile)
        except Exception as e2:
            print("    failed for this segment, skipping:", e2)
        else:
            for turn, _, label in ann.itertracks(yield_label=True):
                # Prefix labels with the chunk index: speaker identities are
                # not linked across chunks. Shift the turn to absolute time.
                new_label = f"S{i:02d}_{label}"
                shifted = PSegment(turn.start + chunk_offset, turn.end + chunk_offset)
                combined[shifted] = new_label
        # Advance even when a chunk was skipped so later chunks stay in place.
        chunk_offset += chunk_duration
    diarization = combined
    print("‚ö†Ô∏è Per-segment diarization completed (labels may be per-segment).")

# Save RTTM
rttm_path = os.path.join(OUT_DIR, "diarization.rttm")
with open(rttm_path, "w", encoding="utf-8") as f:
    diarization.write_rttm(f)
print("Saved diarization RTTM:", rttm_path)

# ----------------- Merge WhisperX words (word-level) with diarization -----------------
# Flatten the aligned transcript into one list of {"word", "start", "end"} entries.
words = []
for seg in all_segments:
    seg_start = seg.get("start", 0.0)
    seg_end = seg.get("end", 0.0)
    if "words" not in seg:
        # No word-level alignment for this segment: keep its text as a single entry.
        words.append({"word": seg.get("text", "").strip(), "start": float(seg_start), "end": float(seg_end)})
        continue
    for w in seg["words"]:
        # Words without their own timestamps inherit the enclosing segment's bounds.
        words.append({
            "word": w.get("word") or w.get("text") or "",
            "start": float(w.get("start", seg_start)),
            "end": float(w.get("end", seg_end)),
        })

# Diarization turns as plain dicts: {"start", "end", "speaker"}.
dia_segs = [
    {"start": float(turn.start), "end": float(turn.end), "speaker": label}
    for turn, _, label in diarization.itertracks(yield_label=True)
]
# Speaker lookup: pick the diarization turn sharing the most time with a word.
def assign_speaker_for_word(w_start, w_end, dia_segs):
    """Return the speaker whose turn overlaps [w_start, w_end] the longest.

    `dia_segs` is a list of dicts with "start", "end" and "speaker" keys.
    The first turn wins a tie; "SPEAKER_UNKNOWN" is returned when no turn
    has a strictly positive overlap with the word.
    """
    winner, widest = "SPEAKER_UNKNOWN", 0.0
    for turn in dia_segs:
        shared = min(w_end, turn["end"]) - max(w_start, turn["start"])
        # A non-positive intersection can never beat the running best (>= 0).
        if shared > widest:
            winner, widest = turn["speaker"], shared
    return winner

# Tag every word with the diarization speaker it overlaps most.
for entry in words:
    entry["speaker"] = assign_speaker_for_word(entry["start"], entry["end"], dia_segs)

# Save merged outputs
merged_json = os.path.join(OUT_DIR, "merged_words_with_speakers.json")
with open(merged_json, "w", encoding="utf-8") as f:
    f.write(json.dumps(words, ensure_ascii=False, indent=2))
print("Saved merged words with speakers:", merged_json)

# Same data as a flat CSV with a fixed column order.
csv_path = os.path.join(OUT_DIR, "merged_words_with_speakers.csv")
csv_columns = ["start", "end", "speaker", "word"]
with open(csv_path, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows({column: entry[column] for column in csv_columns} for entry in words)
print("Saved merged CSV:", csv_path)

# ----------------- Merge words into readable speaker segments and SRT -----------------
# Consecutive words join the previous segment when the speaker is unchanged
# and the gap to the previous word is at most one second.
merged_segments = []
for w in words:
    previous = merged_segments[-1] if merged_segments else None
    continues_turn = (
        previous is not None
        and previous["speaker"] == w["speaker"]
        and w["start"] - previous["end"] <= 1.0
    )
    if continues_turn:
        previous["end"] = w["end"]
        previous["text"] += " " + w["word"]
    else:
        merged_segments.append(
            {"speaker": w["speaker"], "start": w["start"], "end": w["end"], "text": w["word"]}
        )

seg_json = os.path.join(OUT_DIR, "speaker_segments.json")
with open(seg_json, "w", encoding="utf-8") as f:
    f.write(json.dumps(merged_segments, ensure_ascii=False, indent=2))

def format_srt_time(sec):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields, so float representation error cannot drop a millisecond
    (e.g. 1.001 -> "00:00:01,001"). Negative inputs are clamped to zero.
    """
    total_ms = max(0, int(round(float(sec) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

# One numbered SRT cue per merged speaker segment, text prefixed with the speaker label.
srt_path = os.path.join(OUT_DIR, "speaker_transcript.srt")
with open(srt_path, "w", encoding="utf-8") as f:
    f.writelines(
        f"{number}\n"
        f"{format_srt_time(cue['start'])} --> {format_srt_time(cue['end'])}\n"
        f"{cue['speaker']}: {cue['text']}\n\n"
        for number, cue in enumerate(merged_segments, start=1)
    )

print("Saved readable segments JSON:", seg_json)
print("Saved SRT subtitles:", srt_path)

# ----------------- Show outputs -----------------
print("\nüìÇ OUTPUT FILES:")
for produced in sorted(os.listdir(OUT_DIR)):
    print(" -", produced)
print("\nDone üéâ  ‚Äî download outputs from the left sidebar (folder icon) or use Colab file browser.")


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Paste your Hugging Face token (it will be hidden):
HUGGINGFACE_TOKEN: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
üîΩ Updating yt-dlp and attempting to download audio...
> yt-dlp -f bestaudio -x --audio-format wav -o 'content/audio_raw.wav' 'https://www.youtube.com/watch?v=LJiUDxj-2ZE'
‚úÖ Direct audio extraction succeeded: content/audio_raw.wav
> ffmpeg -y -i 'content/audio_raw.wav' -ar 16000 -ac 1 -vn -acodec pcm_s16le 'content/audio.wav'
‚úÖ Re-encoded extracted wav to 16k mono: content/audio.wav
‚úÇÔ∏è Splitting audio into 10-minute segments to avoid memory pressure...
> ffmpeg -hide_banner -loglevel error -i 'content/audio.wav' -f segment -segment_t

  torchaudio.list_audio_backends()
  available_backends = torchaudio.list_audio_backends()
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


2025-10-15 16:30:01 - whisperx.asr - INFO - No language specified, language will be detected for each audio file (increases inference time)
2025-10-15 16:30:01 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../usr/local/lib/python3.12/dist-packages/whisperx/assets/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu126. Bad things might happen unless you revert torch to 1.x.

--- Transcribing segment 1/5: content/segments/segment_000.wav ---


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.



In [None]:
# --- Safe audio download from YouTube ---
!pip install -U yt-dlp > /dev/null
import subprocess, shlex, os

YOUTUBE_URL = "https://www.youtube.com/watch?v=2Vv-BfVoq4g"   # <-- put your URL
os.makedirs("content", exist_ok=True)
AUDIO_PATH = "content/audio.wav"
VIDEO_PATH = "content/video.mp4"

print("üîΩ Downloading YouTube audio...")
# Preferred route: let yt-dlp extract the audio straight to WAV.
direct_extract = [
    "yt-dlp", "-f", "bestaudio", "-x", "--audio-format", "wav",
    "-o", AUDIO_PATH, YOUTUBE_URL,
]
try:
    subprocess.run(direct_extract, check=True)
except subprocess.CalledProcessError:
    # Fallback route: fetch the stream as-is, then convert it to 16 kHz mono with ffmpeg.
    print("‚ö†Ô∏è  Direct WAV extraction failed, retrying with ffmpeg...")
    fallback_steps = (
        ["yt-dlp", "-f", "bestaudio", "-o", VIDEO_PATH, YOUTUBE_URL],
        ["ffmpeg", "-y", "-i", VIDEO_PATH, "-ar", "16000", "-ac", "1", AUDIO_PATH],
    )
    for argv in fallback_steps:
        subprocess.run(argv, check=True)

print(f"‚úÖ Audio saved at {AUDIO_PATH}")


üîΩ Downloading YouTube audio...
‚úÖ Audio saved at content/audio.wav


In [None]:
# ============================================================
# COLAB SAFE PIPELINE: YouTube -> WhisperX + Pyannote (per-segment)
# Handles long audio without GPU crashes
# ============================================================

# ----------------- Installs -----------------
# Lines starting with "!" are notebook shell magics, not Python statements.
# Upgrade pip, install the latest yt-dlp, WhisperX (from its GitHub
# repository), pyannote.audio, ffmpeg-python and faster_whisper, then make
# sure the ffmpeg binary used by the steps below is present.
!pip install -q --upgrade pip
!pip install -q -U yt-dlp
!pip install -q git+https://github.com/m-bain/whisperX.git
!pip install -q "pyannote.audio>=2.1" ffmpeg-python faster_whisper
!apt-get update -qq && apt-get install -y -qq ffmpeg

# ----------------- Imports -----------------
import os, shlex, subprocess, json, csv, torch
from getpass import getpass
from pyannote.audio import Pipeline
import whisperx

# ----------------- Config -----------------
# Video to process (change if desired).
YOUTUBE_URL = "https://www.youtube.com/watch?v=LJiUDxj-2ZE"
# "tiny" is the safe choice for the Colab free GPU.
WHISPER_MODEL = "tiny"
# Length of each audio chunk in seconds (300 = 5 min).
SEGMENT_TIME = 300
# Every download, intermediate file and result is written under this folder.
OUT_DIR = "content"
os.makedirs(OUT_DIR, exist_ok=True)
# Run on the GPU when the runtime exposes one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# ----------------- Hugging Face token -----------------
# Reuse a token already exported in the environment; otherwise prompt for it
# without echo and store it in the environment for the rest of the session.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
if HUGGINGFACE_TOKEN is None:
    print("Paste your Hugging Face token (it will be hidden):")
    HUGGINGFACE_TOKEN = getpass("HUGGINGFACE_TOKEN: ")
    os.environ["HUGGINGFACE_TOKEN"] = HUGGINGFACE_TOKEN

# ----------------- Helper -----------------
def run(cmd):
    """Run an external command (no shell) and return the completed process.

    Args:
        cmd: A shell-style command string, split with ``shlex.split`` so that
            quoted arguments stay intact, or an already-split sequence of
            arguments.

    Returns:
        The ``subprocess.CompletedProcess``, with stdout/stderr captured as text.

    Raises:
        subprocess.CalledProcessError: If the command exits non-zero. The
            captured output is printed and also attached to the exception so
            callers can inspect ``.output`` / ``.stderr``.
    """
    print(">", cmd)
    argv = shlex.split(cmd) if isinstance(cmd, str) else list(cmd)
    proc = subprocess.run(argv, capture_output=True, text=True)
    if proc.returncode != 0:
        print("STDOUT:", proc.stdout)
        print("STDERR:", proc.stderr)
        raise subprocess.CalledProcessError(
            proc.returncode, cmd, output=proc.stdout, stderr=proc.stderr
        )
    return proc

# ----------------- Download & convert audio -----------------
# Working files: fallback container, directly extracted WAV, final 16 kHz mono WAV.
video_path = os.path.join(OUT_DIR, "video.mp4")
wav_raw = os.path.join(OUT_DIR, "audio_raw.wav")
audio_16k = os.path.join(OUT_DIR, "audio.wav")

# Prefer direct WAV extraction; if yt-dlp fails, download the stream as-is.
# Whichever file was obtained is then converted to 16 kHz mono 16-bit PCM.
try:
    cmd = f"yt-dlp -f bestaudio -x --audio-format wav -o '{wav_raw}' '{YOUTUBE_URL}'"
    run(cmd)
    conversion_source = wav_raw
except subprocess.CalledProcessError:
    cmd = f"yt-dlp -f bestaudio -o '{video_path}' '{YOUTUBE_URL}'"
    run(cmd)
    conversion_source = video_path
run(f"ffmpeg -y -i '{conversion_source}' -ar 16000 -ac 1 -vn -acodec pcm_s16le '{audio_16k}'")

# ----------------- Split audio into segments -----------------
seg_dir = os.path.join(OUT_DIR, "segments")
os.makedirs(seg_dir, exist_ok=True)
# Remove segments left over from an earlier run (longer video or a different
# SEGMENT_TIME); otherwise they would be picked up below and processed as if
# they belonged to this audio.
for leftover in os.listdir(seg_dir):
    if leftover.startswith("segment_"):
        os.remove(os.path.join(seg_dir, leftover))
# Quoted like the other paths so the pattern stays one argument after shlex.split.
segment_pattern = os.path.join(seg_dir, "segment_%03d.wav")
try:
    # Fast path: cut the PCM stream without re-encoding.
    run(f"ffmpeg -y -hide_banner -loglevel error -i '{audio_16k}' -f segment -segment_time {SEGMENT_TIME} -c copy '{segment_pattern}'")
except subprocess.CalledProcessError:
    # Only a failed ffmpeg run triggers the re-encoding fallback; a bare
    # `except:` would also swallow KeyboardInterrupt.
    run(f"ffmpeg -y -i '{audio_16k}' -f segment -segment_time {SEGMENT_TIME} -ar 16000 -ac 1 '{segment_pattern}'")

# Zero-padded names sort in playback order.
segments = sorted(os.path.join(seg_dir, f) for f in os.listdir(seg_dir) if f.startswith("segment_"))
if not segments:
    # Segmentation produced nothing: process the full file as a single chunk.
    segments = [audio_16k]
print(f"üîé {len(segments)} segment(s) to process.")

# ----------------- Load WhisperX -----------------
print(f"üéß Loading WhisperX '{WHISPER_MODEL}' on {DEVICE} ...")
model = whisperx.load_model(WHISPER_MODEL, device=DEVICE)
all_segments = []
detected_language = None

# whisperx.load_audio() decodes to 16 kHz mono, so len(audio) / 16000 is the
# chunk duration in seconds.
WHISPERX_SAMPLE_RATE = 16000
# Alignment models are cached per language instead of being reloaded for every chunk.
align_models = {}
# Start of the current chunk on the full recording's timeline, in seconds.
segment_offset = 0.0


def _shift_times(item, offset):
    """Add `offset` seconds to the "start"/"end" entries of `item`, when present."""
    for key in ("start", "end"):
        if item.get(key) is not None:
            item[key] = float(item[key]) + offset


# ----------------- Transcribe segments -----------------
for idx, segfile in enumerate(segments):
    print(f"\n--- Transcribing segment {idx+1}/{len(segments)} ---")
    # Decode once and reuse the samples for transcription, alignment and the
    # duration bookkeeping.
    audio = whisperx.load_audio(segfile)
    segment_duration = len(audio) / WHISPERX_SAMPLE_RATE
    res = model.transcribe(audio, batch_size=16)
    language = res.get("language")
    if detected_language is None:
        detected_language = language
    if not res["segments"]:
        # Nothing was transcribed for this chunk: skip alignment but keep the timeline moving.
        segment_offset += segment_duration
        continue
    # align to word-level
    if language not in align_models:
        align_models[language] = whisperx.load_align_model(language_code=language, device=DEVICE)
    model_a, metadata = align_models[language]
    aligned = whisperx.align(res["segments"], model_a, metadata, audio, device=DEVICE)
    # WhisperX timestamps are relative to the start of each chunk; shift them
    # to absolute time so chunks do not all collapse onto the same time range.
    for seg in aligned["segments"]:
        _shift_times(seg, segment_offset)
        for w in seg.get("words", []):
            _shift_times(w, segment_offset)
    all_segments.extend(aligned["segments"])
    segment_offset += segment_duration

# Save intermediate transcription
whisper_json = os.path.join(OUT_DIR, "whisperx_transcript.json")
with open(whisper_json, "w", encoding="utf-8") as f:
    json.dump({"language": detected_language, "segments": all_segments}, f, ensure_ascii=False, indent=2)
print("‚úÖ WhisperX transcription done.")

# ----------------- Diarize per segment on CPU -----------------
print("\nüë• Running Pyannote diarization per segment on CPU...")
# Pipeline.from_pretrained() takes no `device` argument; passing one raises a TypeError.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=HUGGINGFACE_TOKEN)
if pipeline is None:
    # Some pyannote.audio releases return None instead of raising when the
    # gated model cannot be downloaded.
    raise RuntimeError("Failed to load pyannote pipeline. Ensure your Hugging Face token is correct and has access.")
if hasattr(pipeline, "to"):
    # Keep diarization on the CPU on releases that support moving the pipeline.
    pipeline.to(torch.device("cpu"))

import wave
from pyannote.core import Annotation, Segment as PSegment


def _wav_duration(path, default):
    """Return the length of a PCM WAV file in seconds, or `default` if it cannot be read."""
    try:
        with wave.open(path, "rb") as wav_file:
            return wav_file.getnframes() / float(wav_file.getframerate())
    except (wave.Error, EOFError, OSError, ZeroDivisionError):
        return float(default)


combined_dia = Annotation()
# Maps (chunk index, chunk-local label) -> global label. The pipeline numbers
# speakers independently in every chunk, so keying on the label alone would
# merge unrelated speakers from different chunks.
speaker_map = {}
speaker_counter = 0
# Start of the current chunk on the full recording's timeline, in seconds.
segment_offset = 0.0

for idx, segfile in enumerate(segments):
    print(f"  Diarizing segment {idx+1}/{len(segments)} ...")
    ann = pipeline(segfile)
    for turn, _, label in ann.itertracks(yield_label=True):
        key = (idx, label)
        if key not in speaker_map:
            speaker_map[key] = f"S{speaker_counter:03d}"
            speaker_counter += 1
        # Turn times are relative to the chunk; shift them to absolute time.
        shifted = PSegment(turn.start + segment_offset, turn.end + segment_offset)
        combined_dia[shifted] = speaker_map[key]
    segment_offset += _wav_duration(segfile, SEGMENT_TIME)

rttm_path = os.path.join(OUT_DIR, "diarization.rttm")
with open(rttm_path, "w", encoding="utf-8") as f:
    combined_dia.write_rttm(f)
print("‚úÖ RTTM saved:", rttm_path)

# ----------------- Merge transcription + diarization -----------------
# Flatten the aligned transcript into one list of {"word", "start", "end"} entries.
words = []
for seg in all_segments:
    if "words" not in seg:
        # No word-level alignment for this segment: keep its text as a single entry.
        words.append({
            "word": seg.get("text", "").strip(),
            "start": float(seg.get("start", 0)),
            "end": float(seg.get("end", 0)),
        })
        continue
    # Words without their own timestamps inherit the enclosing segment's bounds.
    words.extend(
        {
            "word": w.get("word") or w.get("text") or "",
            "start": float(w.get("start", seg.get("start", 0))),
            "end": float(w.get("end", seg.get("end", 0))),
        }
        for w in seg["words"]
    )

# assign speakers: first flatten the diarization into {"start", "end", "speaker"} dicts
dia_segments = []
for turn, _, label in combined_dia.itertracks(yield_label=True):
    dia_segments.append({"start": turn.start, "end": turn.end, "speaker": label})
def assign_speaker(w_start, w_end):
    """Return the speaker from `dia_segments` overlapping [w_start, w_end] the longest.

    Reads the module-level `dia_segments` list. The first turn wins a tie;
    "SPEAKER_UNKNOWN" is returned when no turn has a strictly positive overlap.
    """
    winner, widest = "SPEAKER_UNKNOWN", 0
    for turn in dia_segments:
        shared = min(w_end, turn["end"]) - max(w_start, turn["start"])
        # A non-positive intersection can never beat the running best (>= 0).
        if shared > widest:
            winner, widest = turn["speaker"], shared
    return winner

# Tag every word with the diarization speaker it overlaps most.
for entry in words:
    entry["speaker"] = assign_speaker(entry["start"], entry["end"])

# save merged outputs
merged_json = os.path.join(OUT_DIR, "merged_words_with_speakers.json")
with open(merged_json, "w", encoding="utf-8") as f:
    f.write(json.dumps(words, ensure_ascii=False, indent=2))

# Same data as a flat CSV; each word dict carries exactly the listed columns.
csv_path = os.path.join(OUT_DIR, "merged_words_with_speakers.csv")
with open(csv_path, "w", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["start", "end", "speaker", "word"])
    writer.writeheader()
    writer.writerows(words)

# ----------------- Merge words into speaker segments & SRT -----------------
# Consecutive words join the previous segment when the speaker is unchanged
# and the gap to the previous word is at most one second.
merged_segments = []
for w in words:
    previous = merged_segments[-1] if merged_segments else None
    continues_turn = (
        previous is not None
        and previous["speaker"] == w["speaker"]
        and w["start"] - previous["end"] <= 1.0
    )
    if continues_turn:
        previous["end"] = w["end"]
        previous["text"] += " " + w["word"]
    else:
        merged_segments.append(
            {"speaker": w["speaker"], "start": w["start"], "end": w["end"], "text": w["word"]}
        )

seg_json = os.path.join(OUT_DIR, "speaker_segments.json")
with open(seg_json, "w", encoding="utf-8") as f:
    f.write(json.dumps(merged_segments, ensure_ascii=False, indent=2))

def format_srt_time(sec):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond before being split into
    fields, so float representation error cannot drop a millisecond
    (e.g. 1.001 -> "00:00:01,001"). Negative inputs are clamped to zero.
    """
    total_ms = max(0, int(round(float(sec) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{millis:03d}"

# One numbered SRT cue per merged speaker segment, text prefixed with the speaker label.
srt_path = os.path.join(OUT_DIR, "speaker_transcript.srt")
with open(srt_path, "w", encoding="utf-8") as f:
    f.writelines(
        f"{number}\n"
        f"{format_srt_time(cue['start'])} --> {format_srt_time(cue['end'])}\n"
        f"{cue['speaker']}: {cue['text']}\n\n"
        for number, cue in enumerate(merged_segments, start=1)
    )

# ----------------- Show outputs -----------------
print("\nüìÇ OUTPUT FILES:")
for produced in os.listdir(OUT_DIR):
    print(" -", produced)
print("\nAll done! üéâ")


  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


  torchaudio.list_audio_backends()


Paste your Hugging Face token (it will be hidden):
HUGGINGFACE_TOKEN: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
> yt-dlp -f bestaudio -x --audio-format wav -o 'content/audio_raw.wav' 'https://www.youtube.com/watch?v=LJiUDxj-2ZE'
> ffmpeg -y -i 'content/audio_raw.wav' -ar 16000 -ac 1 -vn -acodec pcm_s16le 'content/audio.wav'
> ffmpeg -hide_banner -loglevel error -i 'content/audio.wav' -f segment -segment_time 300 -c copy content/segments/segment_%03d.wav
üîé 9 segment(s) to process.
üéß Loading WhisperX 'tiny' on cuda ...


  available_backends = torchaudio.list_audio_backends()
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _speechbrain_save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _speechbrain_load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for load
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint save hook for _save
DEBUG:speechbrain.utils.checkpoints:Registered checkpoint load hook for _recover
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


2025-10-15 16:35:07 - whisperx.asr - INFO - No language specified, language will be detected for each audio file (increases inference time)
2025-10-15 16:35:07 - whisperx.vads.pyannote - INFO - Performing voice activity detection using Pyannote...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../usr/local/lib/python3.12/dist-packages/whisperx/assets/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.4.0. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.8.0+cu126. Bad things might happen unless you revert torch to 1.x.


  torchaudio.list_audio_backends()



--- Transcribing segment 1/9 ---


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

