In [None]:
!pip install youtube-transcript-api requests nltk yt_dlp

import os
import re
import csv
import time
import requests
from pathlib import Path
from urllib.parse import urlparse, parse_qs
from pathlib import Path
from zipfile import ZipFile, ZIP_DEFLATED


import nltk
from nltk.tokenize import sent_tokenize

from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
from youtube_transcript_api.formatters import TextFormatter



# Make sure both Punkt tokenizer resources are installed; download whichever
# one NLTK cannot locate.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)

# yt_dlp is optional: record whether it could be imported so that title lookup
# can fall back to the video ID when it is missing.
try:
    from yt_dlp import YoutubeDL
except Exception:
    YT_DLP_AVAILABLE = False
else:
    YT_DLP_AVAILABLE = True


# Module-level yt-dlp options: silent, metadata-only extraction.
# NOTE(review): nothing in this file reads this dict -- get_video_title()
# builds its own local options -- so it currently has no effect.
ydl_opts = {
    "quiet": True,
    "no_warnings": True,
    "skip_download": True,
    "noplaylist": True,
    "extract_flat": True,
}

# ----------------- Config -----------------
# Videos to process, in order; main() iterates over this list.
URLS = [
    "https://www.youtube.com/watch?v=QM6zUHrSpyo",
    "https://www.youtube.com/watch?v=1dAd22MuaUg",
    "https://www.youtube.com/watch?v=exIKS7Y3xV4",
    "https://www.youtube.com/watch?v=08xx39f3vW4",
    "https://www.youtube.com/watch?v=5gxCCXOGYwc",
    "https://www.youtube.com/watch?v=T9M_wZ4Jqm0",
    "https://www.youtube.com/watch?v=0-xkCqbuB4g",
    "https://www.youtube.com/watch?v=_MW9OxwVSq0",
    "https://www.youtube.com/watch?v=2dQHz2c4RK0"
]

# Language codes handed to the transcript lookup calls in process_url().
LANGUAGES = ("es", "es-ES")
# Output folder for the full-text transcripts (one .txt per video).
OUT_TXT_DIR = Path("output_transcripts")
# Output folder for the chunked transcripts (one .csv per video).
OUT_CSV_DIR = Path("transcripts_chunks/csv")

# Chunking limits passed to ChunkWriter by process_url().
MAX_CHARS_PER_CHUNK = 5000  # upper bound on characters per chunk
MAX_SENTS_PER_CHUNK = 1     # upper bound on sentences per chunk (1 => one sentence per row)
SENT_OVERLAP = 0            # sentences shared between consecutive chunks

from urllib.parse import urlparse, parse_qs

def get_video_id(url: str):
    """Extract the YouTube video ID from *url*.

    Supported forms:

    * ``https://youtu.be/VIDEO_ID`` (optionally with a query such as ``?t=30s``)
    * ``https://www.youtube.com/watch?v=VIDEO_ID`` (also ``youtube.com``,
      ``m.youtube.com`` and ``music.youtube.com``)
    * ``https://www.youtube.com/shorts/VIDEO_ID``, ``/embed/VIDEO_ID`` and
      ``/live/VIDEO_ID``

    Host matching is case-insensitive and ignores an explicit port.

    Returns:
        The video ID string, or ``None`` when the URL is not a recognised
        YouTube video URL.
    """
    p = urlparse(url)
    # ``hostname`` is lower-cased and has any ":port" suffix removed, unlike
    # ``netloc``, so "WWW.YouTube.com:443" still matches.
    host = (p.hostname or "").lower()

    # Short form: the ID is the first path component.
    if host in {"youtu.be", "www.youtu.be"}:
        vid = p.path.lstrip("/").split("/")[0].split("?")[0]
        return vid or None

    if host in {"www.youtube.com", "youtube.com", "m.youtube.com", "music.youtube.com"}:
        # Standard form: ?v=VIDEO_ID
        v = parse_qs(p.query).get("v", [None])[0]
        if v:
            # Defensive: a malformed (percent-encoded) value may still carry
            # "&list=..." glued onto the ID.
            return v.split("&")[0]

        # Path forms: /shorts/ID, /embed/ID, /live/ID
        parts = [part for part in p.path.split("/") if part]
        if len(parts) >= 2 and parts[0] in {"shorts", "embed", "live"}:
            # "/embed/videoseries?list=..." is a playlist embed, not a video.
            if parts[1] != "videoseries":
                return parts[1]
    return None

def sanitize_filename(name: str, max_len: int = 150) -> str:
    """Turn *name* into a string usable as a file name.

    The characters ``\\ / : * ? " < > |`` become underscores, surrounding
    whitespace is removed, inner whitespace runs collapse to one space, the
    result is cut to *max_len* characters, and trailing spaces, dots and
    underscores are dropped. The result may be an empty string.
    """
    replaced = re.sub(r"[\\/:*?\"<>|]", "_", name)
    trimmed = replaced.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)
    shortened = collapsed[:max_len]
    return shortened.rstrip(" ._")


def get_video_title(url: str, fallback_id: str) -> str:
    """Return a filename-safe title for the video at *url*.

    The title is looked up with yt-dlp (metadata only, nothing is downloaded)
    and passed through ``sanitize_filename``. *fallback_id* is returned when
    yt-dlp is not installed, the lookup fails, no title is reported, or the
    sanitised title is empty.
    """
    if not YT_DLP_AVAILABLE:
        return fallback_id

    # Local options on purpose: only the title is needed. "no_warnings" keeps
    # yt-dlp's extractor warnings out of the progress output.
    options = {
        "quiet": True,
        "no_warnings": True,
        "skip_download": True,
        "noplaylist": True,
    }
    try:
        with YoutubeDL(options) as ydl:
            info = ydl.extract_info(url, download=False)
        title = (info or {}).get("title")
        if title:
            # A title made only of stripped characters sanitises to "".
            return sanitize_filename(title) or fallback_id
    except Exception as exc:
        # Best effort: a failed title lookup must not stop transcript
        # processing, but the reason should be visible.
        print(f"‚ö†Ô∏è Could not fetch title for {fallback_id}: {exc}")
    return fallback_id

# ----------------- Your classes (unchanged) -----------------
class ChunkWriter:
    """Group sentences into chunks and write each chunk as one CSV row.

    Sentences are buffered until a chunk is *complete*: it holds ``max_sents``
    sentences, or the next buffered sentence would push it past ``max_chars``.
    A single sentence longer than ``max_chars`` still forms its own chunk.
    Consecutive chunks share their last ``overlap`` sentences. Every row has
    the form ``[video_id, chunk_id, text]``; ``chunk_id`` starts at 0.

    Call ``close()`` after the last sentence to write whatever is left.

    Args:
        csv_writer: Object with a ``writerow(row)`` method (e.g. ``csv.writer``).
        video_id: Value written in the first column of every row.
        max_chars: Maximum chunk length in characters.
        max_sents: Maximum number of sentences per chunk (values below 1 are
            treated as 1).
        overlap: Number of trailing sentences repeated at the start of the
            next chunk (negative values are treated as 0).
    """

    def __init__(self, csv_writer, video_id, max_chars=450, max_sents=5, overlap=1):
        self.w = csv_writer
        self.video_id = video_id
        self.max_chars = max_chars
        self.max_sents = max_sents
        self.overlap = max(0, overlap)
        self.buffer = []   # pending sentences: {"text", "start", "end"}
        self.chunk_id = 0  # id of the next row to be written
        # Number of leading buffer entries that were already written as the
        # tail of the previous chunk (kept only to provide overlap).
        self._carried = 0

    def add_segment_sentences(self, sents, seg_start, seg_end):
        """Buffer *sents* and write every chunk that is now complete.

        The time span ``[seg_start, seg_end]`` is divided evenly between the
        sentences. Missing/zero bounds fall back to 0.0 (start) and to the
        start (end).
        """
        if not sents:
            return
        seg_start = float(seg_start or 0.0)
        seg_end = float(seg_end or seg_start)
        # Guard against zero/negative spans so every sentence gets a duration.
        dur = max(0.001, seg_end - seg_start)
        per = dur / len(sents)
        for i, s in enumerate(sents):
            self.buffer.append({
                "text": s,
                "start": seg_start + i * per,
                "end": seg_start + (i + 1) * per,
            })
        self._flush_complete()

    def _next_chunk(self):
        """Greedily build the next chunk from the head of the buffer.

        Returns ``(text, count)`` where *count* (always >= 1 for a non-empty
        buffer) is the number of buffered sentences the chunk consumes.
        """
        limit = max(1, self.max_sents)
        txt = ""
        count = 0
        for it in self.buffer:
            if count >= limit:
                break
            joined = (f"{txt} {it['text']}" if txt else it["text"]).strip()
            # The first sentence is always accepted, even when too long.
            if count > 0 and len(joined) > self.max_chars:
                break
            txt = joined
            count += 1
        return txt, count

    def _flush_complete(self, force=False):
        """Write complete chunks; with ``force=True`` also write the remainder."""
        limit = max(1, self.max_sents)
        while self.buffer:
            txt, count = self._next_chunk()
            exhausted = count == len(self.buffer)
            # Complete = a limit was hit. If the scan merely ran out of
            # buffered sentences, later input may still extend this chunk.
            full = count >= limit or not exhausted or len(txt) >= self.max_chars
            if not force and not full:
                break

            if count <= self._carried:
                # Only sentences already written as overlap: writing them
                # again would just duplicate the tail of the previous row.
                del self.buffer[:count]
                self._carried -= count
                continue

            self._write(self.buffer[0]["start"], self.buffer[count - 1]["end"], txt)
            # Keep the last `overlap` sentences for the next chunk, but always
            # drop at least one so the loop makes progress.
            remove_n = max(1, count - self.overlap)
            del self.buffer[:remove_n]
            self._carried = count - remove_n

    def _write(self, st, en, text):
        """Write one chunk row. *st*/*en* are accepted but not written."""
        self.w.writerow([self.video_id, self.chunk_id, text])
        self.chunk_id += 1

    def close(self):
        """Write any sentences still buffered."""
        self._flush_complete(force=True)

# ----------------- NLTK Sentence Assembler -----------------
class NLTKSentenceAssembler:
    """Rebuild sentences from a stream of caption segments.

    Segment texts are appended to an internal buffer, which is split with
    NLTK's ``sent_tokenize``. Every sentence except the last one is treated
    as finished and emitted; the last one stays buffered because later
    segments may continue it. ``flush()`` returns that remainder.

    Emitted items are ``(sentence, start, end)`` tuples. The times are the
    approximate window of the buffered text, not exact sentence boundaries.
    """

    def __init__(self, language="spanish"):
        self.lang = language  # language name passed to sent_tokenize
        self.buf_text = ""    # text not yet emitted
        self.buf_st = None    # start of the buffered window (None when empty)
        self.buf_en = None    # end of the buffered window (None when empty)

    def add_segment(self, text, seg_start, seg_end):
        """Add one segment and return the sentences it completes.

        Args:
            text: Segment text; ``None`` or blank text is ignored.
            seg_start: Segment start time; ``None``/0 is treated as 0.0.
            seg_end: Segment end time; ``None``/0 falls back to the start.

        Returns:
            A list of ``(sentence, start, end)`` tuples, possibly empty.
        """
        text = (text or "").strip()
        if not text:
            return []

        # Normalise once so both branches below treat missing times the same
        # way (float(None) would raise).
        start = float(seg_start or 0.0)
        end = float(seg_end or start)

        if self.buf_text:
            self.buf_text += " " + text
            self.buf_st = start if self.buf_st is None else min(self.buf_st, start)
            self.buf_en = end if self.buf_en is None else max(self.buf_en, end)
        else:
            self.buf_text = text
            self.buf_st = start
            self.buf_en = end

        sents = [s.strip() for s in sent_tokenize(self.buf_text, language=self.lang) if s.strip()]
        if len(sents) < 2:
            # Zero or one sentence: nothing is known to be finished yet.
            return []

        *complete, tail = sents
        st, en = self.buf_st, self.buf_en
        emitted = [(s, st, en) for s in complete]

        # Keep only the unfinished tail. It ends with the segment just added,
        # so restart the window there; otherwise the start would stay pinned
        # to the first segment ever buffered.
        self.buf_text = tail
        self.buf_st = start
        return emitted

    def flush(self):
        """Return the buffered remainder as a one-item list and reset the buffer.

        Returns an empty list when nothing is buffered.
        """
        remainder = self.buf_text.strip()
        if not remainder:
            return []
        st = float(self.buf_st or 0.0)
        en = float(self.buf_en or st)
        self.buf_text = ""
        self.buf_st = None
        self.buf_en = None
        return [(remainder, st, en)]

# ----------------- Processing loop (YouTube API ‚Üí chunking) -----------------
def _segment_field(seg, name):
    """Read field *name* from a transcript segment given as an object or a dict."""
    value = getattr(seg, name, None)
    if value is None and isinstance(seg, dict):
        value = seg.get(name)
    return value


def process_url(url: str):
    """Download the transcript of one video and write it as TXT and CSV.

    Steps:
      1. Pick a transcript in one of ``LANGUAGES``, preferring a manually
         created one over any other.
      2. Write the full text to ``OUT_TXT_DIR/<title>.txt``.
      3. Split the text into sentences, group them with ``ChunkWriter`` and
         write the chunks to ``OUT_CSV_DIR/<title>.csv``.

    Every failure is reported with ``print`` and ends processing of this URL
    only, so a batch run continues with the next video. Returns ``None``.
    """
    video_id = get_video_id(url)
    if not video_id:
        print(f"‚õî Could not extract video ID from URL: {url}")
        return

    title = get_video_title(url, video_id)
    base_name = sanitize_filename(title)
    # Avoid hidden/empty file names; fall back to the video ID.
    base_name = base_name.lstrip(". ").strip() or video_id

    OUT_TXT_DIR.mkdir(parents=True, exist_ok=True)
    OUT_CSV_DIR.mkdir(parents=True, exist_ok=True)

    txt_path = OUT_TXT_DIR / f"{base_name}.txt"
    csv_path = OUT_CSV_DIR / f"{base_name}.csv"

    # 1) Select a transcript. Listing can fail (transcripts disabled, video
    #    unavailable, request blocked, ...) and so can the language lookup;
    #    neither may abort the whole batch.
    try:
        transcript_list = YouTubeTranscriptApi().list(video_id)
        try:
            transcript = transcript_list.find_manually_created_transcript(LANGUAGES)
            print("‚úÖ Found manual transcript:", transcript)
        except NoTranscriptFound:
            # No manual transcript: accept any transcript in these languages.
            transcript = transcript_list.find_transcript(LANGUAGES)
    except (TranscriptsDisabled, NoTranscriptFound) as e:
        print(f"‚ö†Ô∏è No transcript available for {base_name}: {e}")
        return
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to list transcripts for {base_name}: {e}")
        return

    # 2) Fetch the segments. Each segment is expected to expose "text",
    #    "start" and "duration", either as attributes or as dict keys.
    try:
        segments = transcript.fetch()
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to fetch transcript for {base_name}: {e}")
        return

    # Save the full transcript (.txt) using TextFormatter (for readability).
    # A failure here is reported but does not prevent the CSV from being written.
    try:
        formatted_text = TextFormatter().format_transcript(segments)
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(formatted_text)
        print(f"üíæ Full transcript saved ‚Üí {txt_path}")
    except Exception as e:
        print(f"‚ö†Ô∏è Failed writing TXT for {base_name}: {e}")

    # 3) Chunk by sentences and write the CSV.
    try:
        with open(csv_path, "w", encoding="utf-8", newline="") as fcsv:
            writer = csv.writer(fcsv)
            writer.writerow(["video_id", "chunk_id", "text"])

            cw = ChunkWriter(
                writer, video_id,
                max_chars=MAX_CHARS_PER_CHUNK,
                max_sents=MAX_SENTS_PER_CHUNK,
                overlap=SENT_OVERLAP,
            )
            assembler = NLTKSentenceAssembler(language="spanish")

            for seg in segments:
                text = _segment_field(seg, "text")
                if not text:
                    continue

                # Use the segment's own timing instead of a constant 0.0.
                st = float(_segment_field(seg, "start") or 0.0)
                en = st + float(_segment_field(seg, "duration") or 0.0)

                for sent_text, sst, sen in assembler.add_segment(text.strip(), st, en):
                    cw.add_segment_sentences([sent_text], sst, sen)

            # The last sentence stays buffered until flushed explicitly.
            for sent_text, sst, sen in assembler.flush():
                cw.add_segment_sentences([sent_text], sst, sen)

            cw.close()

        print(f"üìÑ Chunk CSV saved ‚Üí {csv_path}")
    except Exception as e:
        print(f"‚ö†Ô∏è Failed writing CSV for {base_name}: {e}")


def main():
    """Process every URL in ``URLS``, then zip the generated TXT and CSV files.

    Two archives are (re)created in the current directory:
    ``transcripts_full.zip`` with the .txt files stored under flat names, and
    ``transcripts_chunks_csv.zip`` with the .csv files stored under
    ``transcripts_chunks/csv/``.
    """
    total = len(URLS)
    for i, url in enumerate(URLS, 1):
        print(f"\n[{i}/{total}] Processing: {url}")
        process_url(url)
        if i < total:
            time.sleep(2.0)  # politeness pause between videos; not needed after the last

    zip_txt = Path("transcripts_full.zip")            # zip for all .txt
    zip_csv = Path("transcripts_chunks_csv.zip")      # zip for all .csv

    # Remove stale archives first so that none survives when its source
    # folder is missing.
    if zip_txt.exists():
        zip_txt.unlink()
    if zip_csv.exists():
        zip_csv.unlink()

    # Zip full transcripts
    if OUT_TXT_DIR.exists():
        with ZipFile(zip_txt, "w", compression=ZIP_DEFLATED) as zf:
            for p in OUT_TXT_DIR.glob("*.txt"):
                zf.write(p, arcname=p.name)  # store flat names
        print(f"üóúÔ∏è  Zipped full transcripts ‚Üí {zip_txt}")
    else:
        print(f"‚ö†Ô∏è  Transcripts folder not found: {OUT_TXT_DIR}")

    # Zip chunk CSVs
    if OUT_CSV_DIR.exists():
        with ZipFile(zip_csv, "w", compression=ZIP_DEFLATED) as zf:
            for p in OUT_CSV_DIR.glob("*.csv"):
                # keep a tidy path inside the zip: transcripts_chunks/csv/<file.csv>
                arc = Path("transcripts_chunks") / "csv" / p.name
                zf.write(p, arcname=str(arc))
        print(f"üóúÔ∏è  Zipped chunk CSVs ‚Üí {zip_csv}")
    else:
        print(f"‚ö†Ô∏è  Chunk CSV folder not found: {OUT_CSV_DIR}")

    print("\n‚úÖ All done")

# Run the whole pipeline when this code is executed directly (script or
# notebook cell), but not when it is imported as a module.
if __name__ == "__main__":
    main()

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.2.3-py3-none-any.whl.metadata (24 kB)
Collecting yt_dlp
  Downloading yt_dlp-2025.10.22-py3-none-any.whl.metadata (176 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m176.0/176.0 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Downloading youtube_transcript_api-1.2.3-py3-none-any.whl (485 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m485.1/485.1 kB[0m [31m16.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.10.22-py3-none-any.whl (3.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.2/3.2 MB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt_dlp, youtube-transcript-api
Successful

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



[1/9] Processing: https://www.youtube.com/watch?v=QM6zUHrSpyo


         player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         n = mLlG5xUZ44hSXTU0 ; player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


üíæ Full transcript saved ‚Üí output_transcripts/¬øCu√°nta PRESI√ìN puede AGUANTAR Israel.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/¬øCu√°nta PRESI√ìN puede AGUANTAR Israel.csv

[2/9] Processing: https://www.youtube.com/watch?v=1dAd22MuaUg


         player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         n = 1exk904elV36xEhd ; player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


üíæ Full transcript saved ‚Üí output_transcripts/Los _GROYPERS_, ¬øla IDEOLOG√çA detr√°s del ASESINO de CHARLIE KIRK.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/Los _GROYPERS_, ¬øla IDEOLOG√çA detr√°s del ASESINO de CHARLIE KIRK.csv

[3/9] Processing: https://www.youtube.com/watch?v=exIKS7Y3xV4


         player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         n = QlPpwszpo-IMsSTw ; player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


üíæ Full transcript saved ‚Üí output_transcripts/Nueva POL√çTICA de DEFENSA en EEUU.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/Nueva POL√çTICA de DEFENSA en EEUU.csv

[4/9] Processing: https://www.youtube.com/watch?v=08xx39f3vW4


         player = https://www.youtube.com/s/player/25f1a420/player_ias.vflset/en_US/base.js
         n = QNOucWZc1pLWehxB ; player = https://www.youtube.com/s/player/25f1a420/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


üíæ Full transcript saved ‚Üí output_transcripts/Trump AMENAZA a ESPA√ëA.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/Trump AMENAZA a ESPA√ëA.csv

[5/9] Processing: https://www.youtube.com/watch?v=5gxCCXOGYwc


         player = https://www.youtube.com/s/player/25f1a420/player_ias.vflset/en_US/base.js
         n = Jc9ck8CKq9EWmX5Q ; player = https://www.youtube.com/s/player/25f1a420/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


üíæ Full transcript saved ‚Üí output_transcripts/Mucho RUIDO y P√âSIMA GESTI√ìN.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/Mucho RUIDO y P√âSIMA GESTI√ìN.csv

[6/9] Processing: https://www.youtube.com/watch?v=T9M_wZ4Jqm0


         player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         n = 6485SNHqoCApRt-w ; player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


üíæ Full transcript saved ‚Üí output_transcripts/ENTENDIENDO la OFERTA de RUSIA.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/ENTENDIENDO la OFERTA de RUSIA.csv

[7/9] Processing: https://www.youtube.com/watch?v=0-xkCqbuB4g


         player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         n = nBqoOF1ZeGPUFGaR ; player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


üíæ Full transcript saved ‚Üí output_transcripts/INCENDIOS DE VERANO con el INGENIERO T√âCNICO AGR√çCOLA Felipe Mar√≠n.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/INCENDIOS DE VERANO con el INGENIERO T√âCNICO AGR√çCOLA Felipe Mar√≠n.csv

[8/9] Processing: https://www.youtube.com/watch?v=_MW9OxwVSq0




üíæ Full transcript saved ‚Üí output_transcripts/Analizamos el ACUERDO COMERCIAL entre la UE y EEUU.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/Analizamos el ACUERDO COMERCIAL entre la UE y EEUU.csv

[9/9] Processing: https://www.youtube.com/watch?v=2dQHz2c4RK0


         player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         n = LrSzFSFQJ5WiBFna ; player = https://www.youtube.com/s/player/6e4dbefe/player_ias.vflset/en_US/base.js
         Please report this issue on  https://github.com/yt-dlp/yt-dlp/issues?q= , filling out the appropriate issue template. Confirm you are on the latest version using  yt-dlp -U


üíæ Full transcript saved ‚Üí output_transcripts/Caso MONTORO_ C√ìMO FUNCIONAN las CLOACAS.txt
üìÑ Chunk CSV saved ‚Üí transcripts_chunks/csv/Caso MONTORO_ C√ìMO FUNCIONAN las CLOACAS.csv
üóúÔ∏è  Zipped full transcripts ‚Üí transcripts_full.zip
üóúÔ∏è  Zipped chunk CSVs ‚Üí transcripts_chunks_csv.zip

‚úÖ All done


In [None]:
# Stand-alone cell: rebuild both zip archives from the existing output folders
# without downloading any transcript again.
LANGUAGES = ("es", "es-ES")
OUT_TXT_DIR = Path("output_transcripts")
OUT_CSV_DIR = Path("transcripts_chunks/csv")

ZIP_TXT = Path("transcripts_full.zip")            # zip for all .txt
ZIP_CSV = Path("transcripts_chunks_csv.zip")      # zip for all .csv

# Overwrite if they already exist
if ZIP_TXT.exists():
    ZIP_TXT.unlink()
if ZIP_CSV.exists():
    ZIP_CSV.unlink()

# Zip full transcripts
if OUT_TXT_DIR.exists():
    with ZipFile(ZIP_TXT, "w", compression=ZIP_DEFLATED) as zf:
        for p in OUT_TXT_DIR.glob("*.txt"):
            zf.write(p, arcname=p.name)  # store flat names
    print(f"üóúÔ∏è  Zipped full transcripts ‚Üí {ZIP_TXT}")
else:
    print(f"‚ö†Ô∏è  Transcripts folder not found: {OUT_TXT_DIR}")

# Zip chunk CSVs
if OUT_CSV_DIR.exists():
    with ZipFile(ZIP_CSV, "w", compression=ZIP_DEFLATED) as zf:
        for p in OUT_CSV_DIR.glob("*.csv"):
            # keep a tidy path inside the zip: transcripts_chunks/csv/<file.csv>
            arc = Path("transcripts_chunks") / "csv" / p.name
            zf.write(p, arcname=str(arc))
    print(f"üóúÔ∏è  Zipped chunk CSVs ‚Üí {ZIP_CSV}")
else:
    print(f"‚ö†Ô∏è  Chunk CSV folder not found: {OUT_CSV_DIR}")

üóúÔ∏è  Zipped full transcripts ‚Üí transcripts_full.zip
üóúÔ∏è  Zipped chunk CSVs ‚Üí transcripts_chunks_csv.zip


In [None]:
from google.colab import files

# Hand both generated archives to the browser for download (Colab only).
for _archive in ("transcripts_full.zip", "transcripts_chunks_csv.zip"):
    files.download(_archive)

print("\n‚úÖ All done")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


‚úÖ All done
