# Global Imports

In [1]:
import time
from pathlib import Path
from urllib.parse import urlparse, parse_qs
import re
import yt_dlp
import whisper
import numpy
import torch
import whisper

# Environment sanity check: report the installed NumPy and Torch versions and
# the file Whisper was imported from, so the cell output records which
# installation this notebook ran against.
print("NumPy version:", numpy.__version__)
print("Torch version:", torch.__version__)
print("Whisper loaded:", whisper.__file__)


NumPy version: 1.26.4
Torch version: 2.2.2
Whisper loaded: /Users/bshanmugam/Documents/ImmersiveHistorical-StorytellerChatbot/.venv/lib/python3.11/site-packages/whisper/__init__.py


# Reading Video URLs from a File

In [2]:
def read_video_urls(filepath):
    """Return the non-blank lines of a UTF-8 text file.

    Args:
        filepath: Path of the text file to read (one entry per line).

    Returns:
        A list with every line that is non-empty after stripping, in file
        order, with leading and trailing whitespace removed.
    """
    entries = []
    with open(filepath, mode='r', encoding='utf-8') as handle:
        for raw_line in handle:
            cleaned = raw_line.strip()
            # Blank and whitespace-only lines are dropped.
            if cleaned:
                entries.append(cleaned)
    return entries

In [3]:
def get_video_id(url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognised forms are a ``v=`` query parameter, ``youtu.be/<id>`` short
    links, and the ``/embed/<id>``, ``/shorts/<id>`` and ``/live/<id>`` paths.

    Args:
        url: The URL text to inspect.

    Returns:
        The video ID string, or ``None`` when ``url`` is not a string or no
        ID in one of the recognised forms is found.
    """
    # Non-string input (e.g. None) has no ID; avoid a TypeError from re.search.
    if not isinstance(url, str):
        return None

    # An ID is exactly 11 characters drawn from letters, digits, "_" and "-".
    pattern = r'(?:v=|youtu\.be/|/embed/|/shorts/|/live/)([a-zA-Z0-9_-]{11})'
    match = re.search(pattern, url)
    return match.group(1) if match else None

# Audio Download and Transcripts

In [7]:



def download_audio(video_url, output_dir, filename, retries=3):
    """Download a video's audio track as an MP3 via yt-dlp, with retries.

    Args:
        video_url: URL of the video to fetch.
        output_dir: Directory for the output file (``Path`` or string).
        filename: Base name of the output file, without extension.
        retries: Maximum number of download attempts.

    Returns:
        ``True`` once an attempt completes without raising, ``False`` when
        every attempt failed or when ``retries`` is less than 1.
    """
    # "%(ext)s" is a yt-dlp output-template field; the FFmpegExtractAudio
    # post-processor below is configured to produce MP3.
    output_path = Path(output_dir) / f"{filename}.%(ext)s"
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': str(output_path),
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True,
        'no_warnings': True,
    }

    for attempt in range(1, retries + 1):
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                print(f"Downloading audio for: {filename} (Attempt {attempt})")
                ydl.download([video_url])
        except Exception as e:
            # Best-effort boundary: any failure is reported and retried
            # rather than aborting the whole batch.
            print(f"Error downloading {filename} on attempt {attempt}: {e}")
            if attempt < retries:
                print("Retrying...")
                time.sleep(2)
            else:
                print("Giving up.")
        else:
            print(f"✓ Download complete: {filename}.mp3")
            return True

    # Reached when all attempts failed, or when retries < 1 and the loop
    # never ran; always return an explicit boolean.
    return False


def transcribe_audio_whisper_local(audio_path, model):
    """Transcribe an audio file with an already-loaded model.

    Args:
        audio_path: Location of the audio file; passed to the model as a string.
        model: Object exposing ``transcribe(path)`` that returns a mapping
            with a ``"text"`` entry.

    Returns:
        The transcript text, or ``None`` if anything raised along the way
        (the error is printed, not propagated).
    """
    try:
        print("Loaded Whisper model")
        outcome = model.transcribe(str(audio_path))
        text = outcome["text"]
        # Show only the first 100 characters as a quick preview.
        print("Transcription preview:", text[:100])
    except Exception as e:
        print(f"Error transcribing {audio_path}: {e}")
        return None
    else:
        return text


def file_exists(path):
    """Report whether ``path`` exists, by calling its ``exists()`` method."""
    is_present = path.exists()
    return is_present


def process_video(video_id, audio_output_dir, transcripts_dir, model):
    """Make sure one video has both an MP3 and a transcript on disk.

    Work that is already done is skipped: the audio is downloaded only when
    ``<video_id>.mp3`` is missing, and transcription runs only when
    ``<video_id>_transcript.txt`` is missing.

    Args:
        video_id: YouTube video ID; also used as the base file name.
        audio_output_dir: Directory holding the MP3 files.
        transcripts_dir: Directory holding the transcript text files.
        model: Loaded transcription model, forwarded to
            ``transcribe_audio_whisper_local``.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    filename = video_id
    url = f"https://www.youtube.com/watch?v={video_id}"
    audio_path = audio_output_dir / f"{filename}.mp3"
    transcript_path = transcripts_dir / f"{filename}_transcript.txt"

    # Step 1: audio.
    if file_exists(audio_path):
        print(f"Audio already exists: {audio_path}")
    else:
        print(f"Audio file not found for {filename}, downloading...")
        downloaded = download_audio(url, audio_output_dir, filename)
        if not downloaded:
            print(f"Skipping {filename} due to download failure")
            return

    # Step 2: transcript.
    if file_exists(transcript_path):
        print(f"Transcript already exists: {transcript_path}")
        return

    print(f"Transcribing {filename}...")
    text = transcribe_audio_whisper_local(audio_path, model)
    if not text:
        # Covers both a transcription error (None) and an empty transcript.
        print(f"Failed to transcribe {filename}")
        return

    with open(transcript_path, "w", encoding="utf-8") as out:
        out.write(text)
    print(f"✓ Transcript saved: {transcript_path}")


def main():
    """Download and transcribe every video listed in ``YoutubeVideos.txt``.

    Reads one URL per line from the input file, extracts the video IDs,
    loads the Whisper model once, and calls ``process_video`` for each
    unique ID. Audio goes to ``audio_files/`` and transcripts to
    ``transcripts/``.

    Returns:
        None. Progress is reported with ``print``.
    """
    # Paths & settings
    video_links_file = "YoutubeVideos.txt"  # Input file with YouTube URLs, one per line
    audio_output_dir = Path("audio_files")
    transcripts_dir = Path("transcripts")
    model_size = "small"  # Whisper model size (tiny, base, small, medium, large)

    # Make sure output folders exist
    audio_output_dir.mkdir(parents=True, exist_ok=True)
    transcripts_dir.mkdir(parents=True, exist_ok=True)

    # Read URLs and extract video IDs: parse each URL once, report lines that
    # yield no ID, and drop repeated IDs while keeping first-seen order.
    video_ids = []
    seen_ids = set()
    for url in read_video_urls(video_links_file):
        video_id = get_video_id(url)
        if not video_id:
            print(f"Skipping unrecognized URL: {url}")
            continue
        if video_id in seen_ids:
            continue
        seen_ids.add(video_id)
        video_ids.append(video_id)

    print(f"Found {len(video_ids)} videos to process")

    if not video_ids:
        # Nothing to process, so skip loading the model.
        print("\nAll done!")
        return

    # Load Whisper model once
    print(f"Loading Whisper model: {model_size} ...")
    model = whisper.load_model(model_size, device="cpu")

    # Process each video
    total = len(video_ids)
    for i, video_id in enumerate(video_ids, 1):
        print(f"\n[{i}/{total}] Processing video ID: {video_id}")
        process_video(video_id, audio_output_dir, transcripts_dir, model)
        # Be polite to YouTube servers
        time.sleep(1)

    print("\nAll done!")


# Run the pipeline only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Found 28 videos to process
Loading Whisper model: small ...

[1/28] Processing video ID: 4jEad6zxaFk
Audio already exists: audio_files/4jEad6zxaFk.mp3
Transcript already exists: transcripts/4jEad6zxaFk_transcript.txt

[2/28] Processing video ID: BR2ZMj3o5EU
Audio already exists: audio_files/BR2ZMj3o5EU.mp3
Transcript already exists: transcripts/BR2ZMj3o5EU_transcript.txt

[3/28] Processing video ID: 9yD9GxzKd_Q
Audio already exists: audio_files/9yD9GxzKd_Q.mp3
Transcript already exists: transcripts/9yD9GxzKd_Q_transcript.txt

[4/28] Processing video ID: vJucA4FOTSI
Audio already exists: audio_files/vJucA4FOTSI.mp3
Transcript already exists: transcripts/vJucA4FOTSI_transcript.txt

[5/28] Processing video ID: 25sBBCPeRvY
Audio already exists: audio_files/25sBBCPeRvY.mp3
Transcript already exists: transcripts/25sBBCPeRvY_transcript.txt

[6/28] Processing video ID: k3QiW0gEpYM
Audio already exists: audio_files/k3QiW0gEpYM.mp3
Transcript already exists: transcripts/k3QiW0gEpYM_transcript.tx