In [None]:
# ====== Install deps ======
!pip install -q yt-dlp faster-whisper nltk
# Pin a compatible stack for pyannote on Colab
!pip install -q "numpy==1.26.4"
!pip install -q "torch==2.8.0" "torchaudio==2.8.0"
!pip install -q "pyannote.audio>=3.2,<3.5"
!pip install -q yt-dlp faster-whisper
# Only if you enable diarization; otherwise it's not installed
USE_DIARIZATION = True  # set True to enable speaker diarization
if USE_DIARIZATION:
    !pip install -q "pyannote.audio>=3.2"

# ====== Imports & config ======
import os, time, math, csv
from pathlib import Path
import yt_dlp
import torch
from faster_whisper import WhisperModel

# Optional diarization
if USE_DIARIZATION:
    from pyannote.audio import Pipeline as PyannotePipeline

# Lightweight sentence split for Mode B
import nltk
# Recent NLTK releases load the Punkt sentence model from the 'punkt_tab'
# resource, while older releases read 'punkt'. Fetch both so sent_tokenize
# works whichever NLTK version pip resolved; a resource the installed version
# does not know about is simply reported as unavailable by the downloader.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource, quiet=True)
from nltk.tokenize import sent_tokenize

# ---------- Your URLs ----------
# Every entry lives in the same playlist, so only the video id and the
# playlist position vary; the full watch URLs are assembled from these pairs.
_PLAYLIST_ID = "PLVzyrmx4CxOD0y9j5EcTM2BakMD0FGPhY"
_PLAYLIST_VIDEOS = (
    ("_WOBtEhxw9E", 4),
    ("Xm27EegJYoY", 6),
    ("ODQARH3WQDs", 8),
    ("qbKnSzJQLJA", 10),
    ("0oCyBIHsbBY", 12),
    ("JjDWIT3CG7I", 13),
    ("4xvbfyiqFl0", 15),
    ("diUd0oAdcPg", 16),
    ("UVg5XSKY-NE", 17),
    ("TftCmJPhVzc", 19),
    ("z0lPrJ8PkL0", 20),
    ("qWKfxlG5xmg", 21),
    ("gWG0y0Xr2Cw", 23),
    ("2EYQVnI9Uc8", 26),
)
URLS = [
    f"https://www.youtube.com/watch?v={_video}&list={_PLAYLIST_ID}&index={_position}"
    for _video, _position in _PLAYLIST_VIDEOS
]

# ---------- Paths ----------
# All working folders hang off the /content root.
_CONTENT_ROOT = Path("/content")
AUDIO_DIR = _CONTENT_ROOT / "audios"                 # downloaded audio
OUT_DIR = _CONTENT_ROOT / "transcripts_by_turn"      # transcript outputs
CSV_DIR, TXT_DIR = OUT_DIR / "csv", OUT_DIR / "txt"  # one subfolder per format
ZIP_PATH = _CONTENT_ROOT / "transcripts_by_turn.zip"

# Create the folders up front; creating CSV_DIR/TXT_DIR also creates OUT_DIR.
for _folder in (AUDIO_DIR, CSV_DIR, TXT_DIR):
    _folder.mkdir(parents=True, exist_ok=True)

# ---------- Whisper (faster-whisper) ----------
MODEL_SIZE = "medium"  # model size chosen for its accuracy

# Use the GPU with half precision when CUDA is present, else CPU with int8.
if torch.cuda.is_available():
    device, compute_type = "cuda", "float16"
else:
    device, compute_type = "cpu", "int8"
print(f"[INFO] Transcribe with faster-whisper {MODEL_SIZE} on {device} ({compute_type})")

# Loaded once here and reused for every URL in the main loop.
model = WhisperModel(MODEL_SIZE, device=device, compute_type=compute_type)

# ---------- Optional: Pyannote diarization ----------
# The Hugging Face token is read from the HF_TOKEN environment variable rather
# than being written into the notebook: a token stored in source is exposed to
# everyone who can read the file. Any token that was previously committed here
# should be revoked and replaced.
HF_TOKEN = os.environ.get("HF_TOKEN", "").strip()
if USE_DIARIZATION:
    if not HF_TOKEN:
        raise ValueError("Set HF_TOKEN to use diarization (pyannote.audio).")
    diar_pipeline = PyannotePipeline.from_pretrained(
        "pyannote/speaker-diarization-3.1",
        use_auth_token=HF_TOKEN,
    )
    # pyannote.audio 3.x reports a failed download (e.g. the model's access
    # conditions were not accepted) by returning None instead of raising;
    # fail here with a clear message rather than on the first attribute access.
    if diar_pipeline is None:
        raise RuntimeError(
            "Could not load 'pyannote/speaker-diarization-3.1'. Check that "
            "HF_TOKEN is valid and that the model's access conditions have "
            "been accepted on Hugging Face."
        )
    # Try to use GPU if available
    if torch.cuda.is_available():
        diar_pipeline.to(torch.device("cuda"))

# ---------- yt-dlp options (fast, robust) ----------
# Downloads land in AUDIO_DIR, named "<title>-<id>.<ext>".
_audio_template = AUDIO_DIR / "%(title)s-%(id)s.%(ext)s"
# Post-processing step that extracts the audio as MP3.
_extract_mp3 = dict(key="FFmpegExtractAudio", preferredcodec="mp3")

YDL_OPTS = dict(
    format="bestaudio[ext=m4a]/bestaudio/best",
    outtmpl=str(_audio_template),
    postprocessors=[_extract_mp3],
    retries=3,
    socket_timeout=30,
    noplaylist=True,  # the URLs carry a list= parameter; fetch the single video only
    quiet=True,
)

def download_audio(url: str):
    """Download the audio for ``url`` with yt-dlp using ``YDL_OPTS``.

    Returns a ``(mp3_path, video_id, title)`` tuple. ``mp3_path`` is the
    file name yt-dlp prepared for the download with its extension swapped
    to ``.mp3``. A missing id falls back to ``"unknown"`` and a missing
    title to ``"audio"``.
    """
    with yt_dlp.YoutubeDL(YDL_OPTS) as downloader:
        metadata = downloader.extract_info(url, download=True)
        prepared_name = downloader.prepare_filename(metadata)

    video_title = metadata.get("title") or "audio"
    video_id = metadata.get("id") or "unknown"
    return Path(prepared_name).with_suffix(".mp3"), video_id, video_title

def safe_stem(s: str) -> str:
    """Turn ``s`` into a conservative file-name stem.

    Alphanumeric characters and the punctuation `` .-_()`` are kept; every
    other character becomes ``_``. Surrounding whitespace is then stripped,
    followed by any trailing ``.`` or ``_`` characters.
    """
    allowed_punctuation = " .-_()"
    cleaned = []
    for ch in s:
        keep = ch.isalnum() or ch in allowed_punctuation
        cleaned.append(ch if keep else "_")
    stem = "".join(cleaned)
    stem = stem.strip()
    return stem.rstrip("._")

def overlap(a_start, a_end, b_start, b_end):
    """Return the length shared by intervals A and B, or 0.0 if they are disjoint."""
    latest_start = max(a_start, b_start)
    earliest_end = min(a_end, b_end)
    shared = earliest_end - latest_start
    # A negative difference means the intervals do not intersect.
    return max(0.0, shared)

# ---------- Main loop ----------
# Column order shared by every CSV this notebook writes.
CSV_FIELDS = ["video_id", "start", "end", "speaker", "text"]


def _transcribe(audio_path):
    """Transcribe ``audio_path`` once and return a list of segment dicts.

    Each dict has ``start``, ``end`` and stripped ``text`` keys. The list
    comprehension drains the segment iterator returned by ``model.transcribe``.
    """
    segments_iter, _info = model.transcribe(
        str(audio_path),
        vad_filter=False,
        beam_size=5,
        temperature=(0.0, 0.2, 0.4),  # fallback helps reduce repetitions
        word_timestamps=False,
    )
    return [
        {"start": seg.start, "end": seg.end, "text": (seg.text or "").strip()}
        for seg in segments_iter
    ]


def _diarized_rows(audio_path, video_id, segments):
    """Mode A: build one row per speaker turn that received transcript text."""
    diar = diar_pipeline(str(audio_path))  # returns pyannote Annotation
    # itertracks(yield_label=True) yields flat (segment, track, label) triples.
    turns = [
        {"start": float(segment.start), "end": float(segment.end), "speaker": speaker}
        for segment, _track, speaker in diar.itertracks(yield_label=True)
    ]

    # Give each transcript segment to the single turn it overlaps the most.
    # Appending it to every overlapping turn would repeat the same text in
    # several rows whenever a segment spans more than one turn. Segments that
    # overlap no turn at all are left out.
    turn_texts = [[] for _ in turns]
    for seg in segments:
        best_idx, best_overlap = None, 0.0
        for idx, turn in enumerate(turns):
            shared = overlap(turn["start"], turn["end"], seg["start"], seg["end"])
            if shared > best_overlap:
                best_idx, best_overlap = idx, shared
        if best_idx is not None:
            turn_texts[best_idx].append(seg["text"])

    rows = []
    for turn, pieces in zip(turns, turn_texts):
        text = " ".join(pieces).strip()
        if text:
            rows.append({
                "video_id": video_id,
                "start": round(turn["start"], 2),
                "end": round(turn["end"], 2),
                "speaker": turn["speaker"],
                "text": text,
            })
    return rows


def _sentence_rows(video_id, segments):
    """Mode B: split each segment's text into sentences, one row per sentence."""
    rows = []
    for seg in segments:
        sentences = [x.strip() for x in sent_tokenize(seg["text"]) if x.strip()]
        if not sentences:
            continue
        # Heuristic: spread the segment time evenly across its sentences.
        seg_dur = max(0.001, seg["end"] - seg["start"])
        per = seg_dur / len(sentences)
        for idx, sent in enumerate(sentences):
            start = seg["start"] + idx * per
            end = min(seg["end"], seg["start"] + (idx + 1) * per)
            rows.append({
                "video_id": video_id,
                "start": round(start, 2),
                "end": round(end, 2),
                "speaker": "",  # unknown (no diarization)
                "text": sent,
            })
    return rows


def _save_rows(rows, base):
    """Write ``rows`` to ``<base>.csv`` and ``<base>.txt``; return both paths."""
    csv_path = CSV_DIR / f"{base}.csv"
    txt_path = TXT_DIR / f"{base}.txt"
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=CSV_FIELDS)
        writer.writeheader()
        writer.writerows(rows)
    with open(txt_path, "w", encoding="utf-8") as f:
        for r in rows:
            # Only diarized rows carry a speaker label to print.
            who = f"{r['speaker']}: " if r["speaker"] else ""
            f.write(f"[{r['start']:.2f}–{r['end']:.2f}] {who}{r['text']}\n")
    return csv_path, txt_path


all_csv_files = []
start_all = time.time()

for i, url in enumerate(URLS, 1):
    print(f"\n[{i}/{len(URLS)}] {url}")
    try:
        audio_path, video_id, title = download_audio(url)
        print(f"  ↳ Downloaded: {audio_path.name}")
    except Exception as e:
        print(f"  ! Download failed: {e}")
        continue

    # One failing video (transcription, diarization or file write) must not
    # abort the remaining URLs, so it is handled like a failed download.
    try:
        # Transcribe once (no alignment; faster)
        print("  ↳ Transcribing…")
        raw_segments = _transcribe(audio_path)

        if USE_DIARIZATION:
            # ===== Mode A: Speaker diarization =====
            print("  ↳ Diarizing (speaker turns)…")
            rows = _diarized_rows(audio_path, video_id, raw_segments)
            saved_as = "diarized"
        else:
            # ===== Mode B: Sentence chunks (no diarization) =====
            print("  ↳ Splitting into sentence-sized chunks…")
            rows = _sentence_rows(video_id, raw_segments)
            saved_as = "sentence chunks"

        csv_path, txt_path = _save_rows(rows, safe_stem(f"{title}-{video_id}"))
    except Exception as e:
        print(f"  ! Processing failed: {e}")
        continue

    print(f"  ✓ Saved {saved_as}: {csv_path.name} | {txt_path.name}")
    all_csv_files.append(csv_path)

# Zip everything for download
import shutil

if ZIP_PATH.exists():
    ZIP_PATH.unlink()
# Build the archive from OUT_DIR / ZIP_PATH instead of repeating their values
# in a shell command, so the zip always matches the configured folders and no
# external `zip` binary is needed. Archiving relative to the filesystem root
# keeps the member names as before: the full path without the leading slash.
_zip_root = OUT_DIR.anchor
shutil.make_archive(
    str(ZIP_PATH.with_suffix("")),  # make_archive appends ".zip" itself
    "zip",
    root_dir=_zip_root or ".",
    base_dir=str(OUT_DIR.relative_to(_zip_root)),
)

elapsed = time.time() - start_all
print(f"\n[DONE] Files in {OUT_DIR}")
print(f"       CSVs: {len(all_csv_files)} | ZIP: {ZIP_PATH}")
print(f"       Elapsed: {elapsed/60:.1f} min")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/18.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/18.0 MB[0m [31m99.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━[0m [32m13.1/18.0 MB[0m [31m270.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m17.9/18.0 MB[0m [31m257.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.0/18.0 MB[0m [31m110.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This be

ModuleNotFoundError: No module named 'numpy.strings'