In [None]:
# ============================================================
# Robust download with cookies + web-only clients
# Transcribe (full .txt) + on-the-fly sentence chunks (CSV)
# ============================================================
!python -m pip -q install --upgrade "yt-dlp @ https://github.com/yt-dlp/yt-dlp/archive/refs/heads/master.zip" faster-whisper nltk

import os, csv, time, re
from pathlib import Path
import yt_dlp
import torch
from faster_whisper import WhisperModel

import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
from nltk.tokenize import sent_tokenize

from google.colab import files

# ------------------ SETTINGS ------------------
# When True, the cell asks for a cookies.txt exported from your browser session.
USE_COOKIES = True

# Videos to process (playlist parameters are stripped before download).
URLS = [
    "https://www.youtube.com/watch?v=ZYYa4e-DxQE&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=247",
    "https://www.youtube.com/watch?v=6p1N0rXA1Og&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=246",
    "https://www.youtube.com/watch?v=YG6sEHnYRpM&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=245",
    "https://www.youtube.com/watch?v=FZjGU7Ww2Yc&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=244",
    "https://www.youtube.com/watch?v=NeutxLBEfIE&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=243",
    "https://www.youtube.com/watch?v=DO8ikVs5ODc&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=242",
    "https://www.youtube.com/watch?v=9WTE-VPUS6c&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=241",
]

# Transcription: model name, forced language code, and decoding parameters
# passed to the transcribe call further down.
MODEL_SIZE = "large-v3-turbo"
FORCE_LANG = "es"
BEAM_SIZE = 5
TEMPERATURE = (0.0, 0.2, 0.4)

# Chunking: limits per CSV chunk and how many sentences adjacent chunks share.
MAX_CHARS_PER_CHUNK = 450
MAX_SENTS_PER_CHUNK = 5
SENT_OVERLAP = 1

# Paths: everything lives under the Colab working area.
_CONTENT_ROOT = Path("/content")
AUDIO_DIR = _CONTENT_ROOT / "audios"
FULL_TXT_DIR = _CONTENT_ROOT / "transcripts_full"
CSV_DIR = _CONTENT_ROOT / "transcripts_chunks" / "csv"
ZIP_TXT = _CONTENT_ROOT / "transcripts_full.zip"
ZIP_CSV = _CONTENT_ROOT / "transcripts_chunks.zip"
COOKIE_PATH = _CONTENT_ROOT / "cookies.txt"

# Make sure every output directory exists before anything is written.
for _out_dir in (AUDIO_DIR, FULL_TXT_DIR, CSV_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# ---- Upload cookies ----
if USE_COOKIES:
    print("⬆️ Upload cookies.txt (exported from a logged-in YouTube session; Netscape format)")
    uploaded = files.upload()
    if uploaded:
        # Only the first uploaded file is used; its bytes are copied to
        # COOKIE_PATH regardless of the name it was uploaded under.
        fname = next(iter(uploaded))
        COOKIE_PATH.write_bytes(uploaded[fname])
    else:
        # A cancelled/empty upload used to raise StopIteration here. The
        # downloader only passes a cookie file when COOKIE_PATH exists, so it
        # is safe to carry on without one.
        print("⚠️ No file uploaded; continuing without a freshly uploaded cookies.txt")

def safe_name(s: str) -> str:
    """Turn an arbitrary string into a filesystem-friendly name.

    Alphanumeric characters and the characters `` .-_()`` are kept; every
    other character becomes ``_``. Surrounding whitespace is stripped first,
    then any trailing ``.`` / ``_`` characters. ``None`` or an empty string
    yields ``""``.
    """
    allowed_punct = set(" .-_()")
    pieces = []
    for ch in s or "":
        keep = ch.isalnum() or ch in allowed_punct
        pieces.append(ch if keep else "_")
    trimmed = "".join(pieces).strip()
    return trimmed.rstrip("._")

def normalize_text(t: str) -> str:
    """Collapse every run of whitespace in *t* to one space and trim the ends.

    A falsy input (``None`` or ``""``) returns the empty string.
    """
    if not t:
        return ""
    collapsed = re.sub(r"\s+", " ", t)
    return collapsed.strip()

def strip_playlist_params(url: str) -> str:
    """Reduce a YouTube watch URL to its bare ``watch?v=<id>`` form.

    Dropping everything after the 11-character video id avoids playlist
    edge-cases. URLs that do not match the expected
    ``http(s)://www.youtube.com/watch?v=...`` shape are returned unchanged.
    """
    watch_pattern = r"(https?://www\.youtube\.com/watch\?v=[A-Za-z0-9_-]{11})"
    found = re.search(watch_pattern, url)
    if found is None:
        return url
    return found.group(1)

def ydl_base_opts():
    """Return a fresh yt-dlp options dict shared by every download attempt.

    The options select the best available audio stream (m4a preferred), save
    it under ``AUDIO_DIR`` as ``<title>-<id>.<ext>`` and run the
    ``FFmpegExtractAudio`` post-processor to produce an mp3. A new dict is
    built on each call so callers can add per-attempt keys without affecting
    later attempts.
    """
    return {
        "format": "bestaudio[ext=m4a]/bestaudio/best",
        "outtmpl": str(AUDIO_DIR / "%(title)s-%(id)s.%(ext)s"),
        "postprocessors": [{"key": "FFmpegExtractAudio", "preferredcodec": "mp3"}],
        "retries": 5,
        # Fragmented (HLS/DASH) downloads skip fragments that keep failing by
        # default, which yields a silently truncated audio file that would
        # then be transcribed as if it were complete. Retry fragments more
        # and abort the attempt instead, so the caller can try another client.
        "fragment_retries": 10,
        "skip_unavailable_fragments": False,
        "socket_timeout": 30,
        "noplaylist": True,
        "quiet": True,
        # Smaller chunk-size avoids YT throttling quirks (>10MB) per yt-dlp FAQ
        "http_chunk_size": 9 * 1024 * 1024,
        "http_headers": {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) "
                                       "AppleWebKit/537.36 (KHTML, like Gecko) "
                                       "Chrome/123.0 Safari/537.36"},
    }

# Player clients are attempted one at a time, in this order:
# 1) web_safari   (often exposes HLS that avoids PO-token; works with cookies)
# 2) web          (works sometimes; SABR may be enforced)
# 3) web_embedded (only embeddable, but sometimes bypasses SABR)
# Each entry is a one-element list because that is the shape handed to the
# extractor's "player_client" argument.
CLIENT_TRIES = [[client] for client in ("web_safari", "web", "web_embedded")]

def download_audio(url: str):
    """Download the audio of *url* as mp3, trying each player client in turn.

    The URL is first reduced to its bare watch form. For every entry of
    ``CLIENT_TRIES`` a fresh options dict is built, the cookie file is
    attached when cookies are enabled and the file exists, and yt-dlp is asked
    to download and convert the audio.

    Returns:
        ``(mp3_path, video_id, title)`` for the first attempt that leaves an
        mp3 file on disk.

    Raises:
        RuntimeError: when every client attempt failed; the last underlying
            error is included in the message and chained as the cause.
    """
    url = strip_playlist_params(url)
    last_err = None
    for clients in CLIENT_TRIES:
        opts = ydl_base_opts()
        opts["extractor_args"] = {"youtube": {"player_client": clients}}
        if USE_COOKIES and COOKIE_PATH.exists():
            opts["cookiefile"] = str(COOKIE_PATH)

        print(f"  ↳ Trying player_client={clients} …")
        try:
            with yt_dlp.YoutubeDL(opts) as ydl:
                info = ydl.extract_info(url, download=True)
                title = info.get("title") or "audio"
                vid = info.get("id") or "unknown"
                # The post-processor converts to mp3, so the final file has
                # the prepared name with an .mp3 extension.
                mp3_path = Path(ydl.prepare_filename(info)).with_suffix(".mp3")
        except Exception as e:
            last_err = e
            print(f"    ! attempt failed: {e}")
            continue

        if mp3_path.exists():
            return mp3_path, vid, title

        # The download reported success but the mp3 is not there. Record why
        # this attempt is being abandoned so the final error is never "None".
        last_err = FileNotFoundError(f"expected audio file not found: {mp3_path}")
        print(f"    ! attempt failed: {last_err}")

    # if all attempts failed
    raise RuntimeError(f"All client attempts failed. Last error: {last_err}") from last_err

class ChunkWriter:
    """Group a stream of timestamped sentences into overlapping CSV chunks.

    Sentences are buffered as they arrive. A chunk is written as one row
    ``[video_id, chunk_id, start, end, text]`` once it is complete, i.e. it
    holds ``max_sents`` sentences or the next buffered sentence would push it
    past ``max_chars``. A single sentence longer than ``max_chars`` is still
    written on its own. After a chunk is written, its last ``overlap``
    sentences stay in the buffer so the following chunk starts with them.

    Args:
        csv_writer: object with a ``writerow(list)`` method.
        video_id: value written in the first column of every row.
        max_chars: maximum length of a chunk's text (joined with spaces).
        max_sents: maximum number of sentences per chunk.
        overlap: number of trailing sentences repeated at the start of the
            next chunk; negative values are treated as 0.
    """

    def __init__(self, csv_writer, video_id, max_chars=450, max_sents=5, overlap=1):
        self.w = csv_writer
        self.video_id = video_id
        self.max_chars = max_chars
        self.max_sents = max_sents
        self.overlap = max(0, overlap)
        self.buffer = []    # pending sentences: {"text", "start", "end"}
        self.chunk_id = 0   # id of the next row to be written
        # Number of leading buffer entries that were already written as the
        # tail of the previous chunk (kept only to provide overlap).
        self._carried = 0

    def add_segment_sentences(self, sents, seg_start, seg_end):
        """Buffer the sentences of one segment and write any complete chunks.

        The segment's time span is divided evenly between its sentences, so
        per-sentence timestamps are an approximation.
        """
        if not sents:
            return
        seg_start = float(seg_start or 0.0)
        seg_end = float(seg_end or seg_start)
        dur = max(0.001, seg_end - seg_start)
        per = dur / len(sents)
        for i, s in enumerate(sents):
            self.buffer.append({
                "text": s,
                "start": seg_start + i * per,
                "end": seg_start + (i + 1) * per,
            })
        self._flush_complete()

    def _scan(self):
        """Greedily size the next chunk; return ``(sentence_count, text)``."""
        text = ""
        count = 0
        # Always allow one sentence so the scan makes progress even when
        # max_sents is zero or negative.
        limit = max(1, self.max_sents)
        for item in self.buffer:
            if count >= limit:
                break
            joined = f"{text} {item['text']}".strip()
            # The first sentence is accepted even if it alone is too long.
            if count and len(joined) > self.max_chars:
                break
            text = joined
            count += 1
        return count, text

    def _flush_complete(self, force=False):
        """Write every complete chunk; with ``force`` also write the remainder.

        Without ``force`` an incomplete chunk is left in the buffer so later
        sentences can still join it. With ``force`` whatever unwritten
        sentences remain are written and the buffer is emptied.
        """
        # Loop only while the buffer holds at least one unwritten sentence.
        while len(self.buffer) > self._carried:
            count, text = self._scan()
            if count <= self._carried:
                # The chunk would repeat already-written sentences only (the
                # next new sentence does not fit next to them). Shrink the
                # overlap instead of emitting a duplicate row.
                del self.buffer[0]
                self._carried -= 1
                continue
            complete = count < len(self.buffer) or count >= self.max_sents
            if not (complete or force):
                return
            self._write(self.buffer[0]["start"], self.buffer[count - 1]["end"], text)
            drop = max(1, count - self.overlap)
            del self.buffer[:drop]
            self._carried = count - drop
        if force:
            # Anything left is overlap that has already been written.
            self.buffer.clear()
            self._carried = 0

    def _write(self, st, en, text):
        """Emit one chunk row and advance the chunk counter."""
        self.w.writerow([self.video_id, self.chunk_id, round(st, 2), round(en, 2), text])
        self.chunk_id += 1

    def close(self):
        """Write the remaining unwritten sentences, if any."""
        self._flush_complete(force=True)

# ---- Transcribe + write full transcript + chunks ----
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"
print(f"[INFO] faster-whisper: {MODEL_SIZE} on {device} ({compute_type})")
model = WhisperModel(MODEL_SIZE, device=device, compute_type=compute_type)
force_language = FORCE_LANG.strip().lower() or None

t0 = time.time(); done = 0
for idx, url in enumerate(URLS, 1):
    print(f"\n[{idx}/{len(URLS)}] {url}")
    try:
        audio_path, video_id, title = download_audio(url)
        base = safe_name(f"{title}-{video_id}")
        full_txt_path = FULL_TXT_DIR / f"{base}.txt"
        csv_path      = CSV_DIR / f"{base}.csv"

        if full_txt_path.exists() and csv_path.exists():
            print(f"  ↳ Already processed: {base}"); done += 1; continue

        print(f"  ↳ Transcribing -> {full_txt_path.name} + {csv_path.name}")
        with open(full_txt_path, "w", encoding="utf-8") as ftxt, \
             open(csv_path, "w", encoding="utf-8", newline="") as fcsv:
            writer = csv.writer(fcsv); writer.writerow(["video_id","chunk_id","start","end","text"])
            cw = ChunkWriter(writer, video_id,
                             max_chars=MAX_CHARS_PER_CHUNK,
                             max_sents=MAX_SENTS_PER_CHUNK,
                             overlap=SENT_OVERLAP)

            seg_iter, info = model.transcribe(
                str(audio_path),
                language=force_language,
                vad_filter=False,
                beam_size=BEAM_SIZE,
                temperature=TEMPERATURE,
                word_timestamps=False
            )
            for seg in seg_iter:
                line = normalize_text(seg.text)
                if not line: continue
                ftxt.write(line + "\n")
                sents = [s.strip() for s in sent_tokenize(line) if s.strip()]
                cw.add_segment_sentences(sents, seg.start, seg.end)
            cw.close()

        print(f"  ✓ Done: {base}"); done += 1
    except Exception as e:
        print(f"  ! Error: {e}")

if ZIP_TXT.exists(): ZIP_TXT.unlink()
if ZIP_CSV.exists(): ZIP_CSV.unlink()
!zip -qr /content/transcripts_full.zip /content/transcripts_full
!zip -qr /content/transcripts_chunks.zip /content/transcripts_chunks
print(f"\n[DONE] {done}/{len(URLS)} processed")
print(f"Full transcripts: {FULL_TXT_DIR} (zip: {ZIP_TXT})")
print(f"Chunk CSVs:       {CSV_DIR}      (zip: {ZIP_CSV})")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
⬆️ Upload cookies.txt (exported from a logged-in YouTube session; Netscape format)


Saving cookies.txt to cookies (1).txt
[INFO] faster-whisper: large-v3-turbo on cuda (float16)


preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocabulary.json: 0.00B [00:00, ?B/s]

model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]


[1/7] https://www.youtube.com/watch?v=ZYYa4e-DxQE&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=247
  ↳ Trying player_client=['web_safari'] …




  ↳ Already processed: Cómo Ganar _10.000_ al Mes con 17 Años (DollarDorado)-ZYYa4e-DxQE

[2/7] https://www.youtube.com/watch?v=6p1N0rXA1Og&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=246
  ↳ Trying player_client=['web_safari'] …




  ↳ Transcribing -> Cómo Influenciar al Mundo con tu Comunicación (Fer Miralles)-6p1N0rXA1Og.txt + Cómo Influenciar al Mundo con tu Comunicación (Fer Miralles)-6p1N0rXA1Og.csv
  ✓ Done: Cómo Influenciar al Mundo con tu Comunicación (Fer Miralles)-6p1N0rXA1Og

[3/7] https://www.youtube.com/watch?v=YG6sEHnYRpM&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=245
  ↳ Trying player_client=['web_safari'] …




  ↳ Transcribing -> Cómo Vender casi Cualquier Cosa Online (Experto Facebook Ads)-YG6sEHnYRpM.txt + Cómo Vender casi Cualquier Cosa Online (Experto Facebook Ads)-YG6sEHnYRpM.csv
  ✓ Done: Cómo Vender casi Cualquier Cosa Online (Experto Facebook Ads)-YG6sEHnYRpM

[4/7] https://www.youtube.com/watch?v=FZjGU7Ww2Yc&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=244
  ↳ Trying player_client=['web_safari'] …




  ↳ Transcribing -> Cómo Ser Libre con la Inversión Inmobiliaria con Poco Dinero (Carlos Galán)-FZjGU7Ww2Yc.txt + Cómo Ser Libre con la Inversión Inmobiliaria con Poco Dinero (Carlos Galán)-FZjGU7Ww2Yc.csv
  ✓ Done: Cómo Ser Libre con la Inversión Inmobiliaria con Poco Dinero (Carlos Galán)-FZjGU7Ww2Yc

[5/7] https://www.youtube.com/watch?v=NeutxLBEfIE&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=243
  ↳ Trying player_client=['web_safari'] …




[download]  99.5% of ~ 695.73MiB at  296.74KiB/s ETA Unknown (frag 854/859)

[download] Got error: 56824 bytes read. Giving up after 5 retries
[download] Got error: HTTP Error 503: Service Unavailable. Giving up after 5 retries
[download] Got error: HTTP Error 503: Service Unavailable. Giving up after 5 retries
[download] Got error: HTTP Error 503: Service Unavailable. Giving up after 5 retries


  ↳ Transcribing -> 15 Hábitos para Vivir con Abundancia y Tener Éxito (Sergio Fernández)-NeutxLBEfIE.txt + 15 Hábitos para Vivir con Abundancia y Tener Éxito (Sergio Fernández)-NeutxLBEfIE.csv
  ✓ Done: 15 Hábitos para Vivir con Abundancia y Tener Éxito (Sergio Fernández)-NeutxLBEfIE

[6/7] https://www.youtube.com/watch?v=DO8ikVs5ODc&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=242
  ↳ Trying player_client=['web_safari'] …




  ↳ Transcribing -> Ha Creado una Empresa de 40 Millones con 25 años (Internxt)-DO8ikVs5ODc.txt + Ha Creado una Empresa de 40 Millones con 25 años (Internxt)-DO8ikVs5ODc.csv
  ✓ Done: Ha Creado una Empresa de 40 Millones con 25 años (Internxt)-DO8ikVs5ODc

[7/7] https://www.youtube.com/watch?v=9WTE-VPUS6c&list=PLpZJ7XCi1UtPAcqhYNokCtHsH4NZ_Dq_T&index=241
  ↳ Trying player_client=['web_safari'] …




  ↳ Transcribing -> Cómo Tener una Mentalidad de Éxito para Ser el Mejor (Joan Pradells)-9WTE-VPUS6c.txt + Cómo Tener una Mentalidad de Éxito para Ser el Mejor (Joan Pradells)-9WTE-VPUS6c.csv
  ✓ Done: Cómo Tener una Mentalidad de Éxito para Ser el Mejor (Joan Pradells)-9WTE-VPUS6c

[DONE] 7/7 processed
Full transcripts: /content/transcripts_full (zip: /content/transcripts_full.zip)
Chunk CSVs:       /content/transcripts_chunks/csv      (zip: /content/transcripts_chunks.zip)
