# 📼 YouTube → 🗣️ Whisper (local) → 🧠 ChatGPT (Summary + Full)

This Colab-ready notebook:
1. **Downloads** a YouTube video and extracts audio (WAV)
2. **Transcribes locally** in Colab using **Python Whisper** (default) or **faster-whisper**
3. Sends the transcript to **ChatGPT** to generate a **Summary** and a **Cleaned Full Version**

**What you need:**
- A YouTube URL
- An **OpenAI API key** (paste when prompted)

> If a particular video is blocked by YouTube's anti-bot checks, try another URL; browser cookies cannot be passed to the downloader from Colab.

In [ ]:
#@title 🔧 Install dependencies (run once per session)
# Packages used by the cells below: yt-dlp (imported as `yt_dlp`),
# ffmpeg-python (imported as `ffmpeg`) and openai-whisper (imported as `whisper`).
# NOTE(review): ffmpeg-python presumably needs an `ffmpeg` executable on PATH —
# confirm when running outside Colab.
!pip -q install yt-dlp ffmpeg-python openai-whisper
# Optional (comment out if you won't use it):
!pip -q install faster-whisper
# OpenAI Python SDK v1+
!pip -q install --upgrade openai

# Standard-library imports; `Path` is used by the later cells.
import os, sys, json, time, textwrap, pathlib, subprocess
from pathlib import Path

print('Setup complete.')

In [ ]:
#@title ⚙️ Configuration
# Form fields: each `#@param` annotation must stay on the same line as its
# assignment so Colab can render it as an input widget.
YOUTUBE_URL = "" #@param {type:"string"}
ASR_ENGINE = "whisper" #@param ["whisper", "faster-whisper"]
WHISPER_MODEL = "base" #@param ["tiny", "base", "small", "medium", "large"]
FORCE_LANGUAGE = "auto" #@param ["auto", "en", "he", "ru", "fr", "de", "es", "it", "pt", "ar", "zh"]
OUTPUT_SUBTITLES = False #@param {type:"boolean"}

from getpass import getpass

# Validate the URL *before* prompting for the secret, so a forgotten URL is
# reported without first making the user paste an API key.
YOUTUBE_URL = YOUTUBE_URL.strip()
if not YOUTUBE_URL:
    raise ValueError("Please set YOUTUBE_URL above and re-run this cell.")

# Pasted keys often carry stray whitespace or a trailing newline; strip it so
# the key is usable as-is by the later cell that calls the API.
OPENAI_API_KEY = getpass("Enter your OpenAI API key (will not be shown): ").strip()
if not OPENAI_API_KEY:
    raise ValueError("Please paste your OpenAI API key.")

# All intermediate and output files of this notebook live in this directory.
workdir = Path('/content/yt_whisper_run')
workdir.mkdir(parents=True, exist_ok=True)
print('Workdir:', workdir)

In [ ]:
#@title ⤵️ Download YouTube audio (WAV)
import yt_dlp
import ffmpeg

# yt-dlp output template: name the download after the video id so it can be
# found again below. The field is `id`; `_id` is not a yt-dlp field, so the
# file would not be named after the id and the lookup would never match.
audio_out = workdir / "%(id)s.%(ext)s"

ydl_opts = {
    'format': 'bestaudio/best',
    'outtmpl': str(audio_out),
    'noplaylist': True,
    'quiet': True,
}

print('Downloading...')
with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info(YOUTUBE_URL, download=True)

wav_path = workdir / f"{info['id']}.wav"

# Candidate source files for this video id. Skip the WAV produced by an
# earlier run of this cell (ffmpeg cannot read from and overwrite the same
# file) and yt-dlp's temporary/partial download files.
downloaded = sorted(
    p for p in workdir.glob(f"{info['id']}.*")
    if p != wav_path and p.suffix not in {'.part', '.ytdl'}
)
if not downloaded:
    raise RuntimeError("Download failed.")
src_path = downloaded[0]

print('Converting to WAV (48kHz, mono)...')
(
    ffmpeg
    .input(str(src_path))
    .output(str(wav_path), ac=1, ar=48000)  # ac=1 -> mono, ar=48000 -> 48 kHz
    .overwrite_output()
    .run(quiet=True)
)
print('WAV ready:', wav_path)

In [ ]:
#@title 🗣️ Transcribe with Whisper (local in Colab)
import os
import torch


def srt_timestamp(t):
    """Format a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    The offset is converted to whole milliseconds once and then split with
    integer arithmetic, so the hour/minute/second/millisecond fields always
    agree with each other (no independent float truncation per field).
    Negative offsets are clamped to zero.
    """
    total_ms = max(0, int(round(float(t) * 1000)))
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    seconds, millis = divmod(rest, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{millis:03}"


def _srt_cue(index, start, end, cue_text):
    """Return the four lines of one SRT cue: index, time range, text, blank."""
    return [
        str(index),
        f"{srt_timestamp(start)} --> {srt_timestamp(end)}",
        cue_text,
        "",
    ]


device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Torch device:', device)

# `None` lets the engine auto-detect the spoken language.
language = None if FORCE_LANGUAGE == 'auto' else FORCE_LANGUAGE

transcript_txt = workdir / 'transcript.txt'
srt_path = workdir / 'subtitles.srt'

if ASR_ENGINE == 'whisper':
    import whisper
    model = whisper.load_model(WHISPER_MODEL, device=device)
    # Half precision is only requested when running on the GPU.
    result = model.transcribe(str(wav_path), language=language, fp16=(device == 'cuda'))
    text = result.get('text', '').strip()
    transcript_txt.write_text(text, encoding='utf-8')
    if OUTPUT_SUBTITLES:
        # quick-and-simple SRT from segments
        srt_lines = []
        for i, seg in enumerate(result.get('segments', []), 1):
            srt_lines.extend(
                _srt_cue(i, seg['start'], seg['end'], seg.get('text', '').strip())
            )
        srt_path.write_text('\n'.join(srt_lines), encoding='utf-8')
elif ASR_ENGINE == 'faster-whisper':
    from faster_whisper import WhisperModel
    compute_type = 'float16' if device == 'cuda' else 'int8'
    model = WhisperModel(WHISPER_MODEL, device=device, compute_type=compute_type)
    # Bound to `asr_info` (not `info`) so the yt-dlp metadata stored in the
    # notebook-global `info` by the download cell is not overwritten.
    segments, asr_info = model.transcribe(str(wav_path), language=language)

    # Write transcript and (optionally) SRT in a single pass over the segments.
    lines = []
    srt_lines = []
    for i, seg in enumerate(segments, 1):
        seg_text = seg.text.strip()
        lines.append(seg_text)
        if OUTPUT_SUBTITLES:
            srt_lines.extend(_srt_cue(i, seg.start, seg.end, seg_text))
    transcript_txt.write_text('\n'.join(lines), encoding='utf-8')
    if OUTPUT_SUBTITLES:
        srt_path.write_text('\n'.join(srt_lines), encoding='utf-8')
else:
    raise ValueError('Unknown ASR_ENGINE; choose "whisper" or "faster-whisper"')

print('Transcript saved to:', transcript_txt)
if OUTPUT_SUBTITLES and srt_path.exists():
    print('SRT saved to:', srt_path)

print('\nPreview (first 600 chars):\n')
print(transcript_txt.read_text(encoding='utf-8')[:600])

In [ ]:
#@title 🤖 Send transcript to ChatGPT for Summary + Clean Full Version
import os
import re

# The OpenAI client below is created without arguments, so the key is
# supplied through the environment.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

from openai import OpenAI
client = OpenAI()

transcript = Path(transcript_txt).read_text(encoding='utf-8')
if not transcript.strip():
    # Sending an empty transcript would only invite made-up content.
    raise ValueError('Transcript is empty; re-run the transcription cell first.')

system_prompt = (
    "You are a careful editor. Clean the transcript (fix obvious ASR errors; keep speaker intent) and produce TWO sections:\n\n"
    "### Summary\n- 5–10 bullet points of key takeaways\n- a 2–3 sentence abstract\n\n"
    "### Full Version (Cleaned)\nA lightly edited, readable transcript (no hallucinations; note unclear parts with [inaudible]).\n"
)

user_prompt = f"Source URL: {YOUTUBE_URL}\n\nTranscript:\n---\n{transcript}"

resp = client.chat.completions.create(
    model="gpt-4o-mini",
    temperature=0.2,
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
)

# `content` can be None (e.g. a refusal); fail with a clear message instead
# of a TypeError from `write_text`.
out_text = resp.choices[0].message.content
if not out_text:
    raise RuntimeError('ChatGPT returned no text content.')
md_path = workdir / 'chatgpt_output.md'
md_path.write_text(out_text, encoding='utf-8')

summary_path = workdir / 'summary_only.md'
full_path = workdir / 'full_cleaned_only.md'

# Optional: split sections for convenience.
# Accept any markdown heading level ("# Summary" ... "###### Summary"), since
# the model does not always reproduce the exact "###" requested in the prompt.
section_heading = re.compile(r'^#{1,6}\s*(summary|full version)', re.IGNORECASE)
summary_section = []
full_section = []
section_lines = {'summary': summary_section, 'full version': full_section}
target = None
for line in out_text.splitlines():
    heading = section_heading.match(line.strip())
    if heading:
        target = section_lines[heading.group(1).lower()]
        continue
    if target is not None:
        target.append(line)

# Write each section that was found; remove a leftover file from an earlier
# run otherwise, so the report below only lists files belonging to this run.
for section_path, collected in ((summary_path, summary_section), (full_path, full_section)):
    if collected:
        section_path.write_text('\n'.join(collected).strip(), encoding='utf-8')
    else:
        section_path.unlink(missing_ok=True)

print('Saved:')
print(' - Transcript:', transcript_txt)
print(' - ChatGPT (both sections):', md_path)
if summary_path.exists():
    print(' - Summary only:', summary_path)
if full_path.exists():
    print(' - Full cleaned only:', full_path)

In [ ]:
#@title 📁 Show output file paths (clickable in Colab)
from IPython.display import FileLink, display

print('Workdir:', workdir)

# Every artifact the notebook may have produced, in pipeline order; files that
# were never written (e.g. subtitles when disabled) are silently skipped.
output_files = [
    transcript_txt,
    srt_path,
    workdir / "chatgpt_output.md",
    workdir / "summary_only.md",
    workdir / "full_cleaned_only.md",
]
for output_file in output_files:
    if not output_file.exists():
        continue
    display(FileLink(str(output_file)))

## Notes
- For tougher audio, set a larger model (e.g., `small` or `medium`).
- If language detection picks the wrong language, set `FORCE_LANGUAGE` to a specific code (`he`, `ru`, `en`, ...).
- If a download fails due to YouTube anti-bot checks, try a different video.
- You can switch `ASR_ENGINE` to `faster-whisper` for speed on CPU/GPU.