In [16]:
import os
import re
import json
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi

def load_secrets():
    """Load API credentials from ``../secrets.json``.

    The path is resolved relative to the current working directory.

    Returns:
        The parsed JSON content of the secrets file. Callers in this module
        index it by key (``"YOUTUBE_API_KEY"``), so the file is expected to
        hold a JSON object.

    Raises:
        FileNotFoundError: If ``../secrets.json`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read explicitly as UTF-8 rather than relying on the platform default
    # encoding (a legacy code page on Windows), which can mis-decode or
    # reject non-ASCII characters in the file.
    with open("../secrets.json", "r", encoding="utf-8") as f:
        return json.load(f)

# Credentials are read once, when this module/cell is executed. API_KEY is the
# developer key passed to every `build("youtube", "v3", ...)` call below.
secrets = load_secrets()
API_KEY = secrets["YOUTUBE_API_KEY"]  # fails here if the key is absent from secrets.json

def get_video_id(youtube_url):
    """Extract the video ID from a YouTube URL.

    Supported shapes (scheme and a leading ``www.`` are optional):

    * ``https://www.youtube.com/watch?v=<id>`` -- the ``v`` parameter may
      appear anywhere in the query string, and any ``#fragment`` is ignored.
    * ``https://youtu.be/<id>``
    * ``https://www.youtube.com/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``
      and ``/v/<id>`` on ``youtube.com`` / ``youtube-nocookie.com`` hosts.

    Args:
        youtube_url: The URL (string) to parse.

    Returns:
        The video ID as a string.

    Raises:
        ValueError: If no video ID can be found in ``youtube_url``.
    """
    # Local import: the file's top-level import block does not provide
    # urllib.parse, and this keeps the function self-contained.
    from urllib.parse import parse_qs, urlparse

    if not isinstance(youtube_url, str):
        raise ValueError("Invalid YouTube URL")

    url = youtube_url.strip()
    parts = urlparse(url)
    if not parts.netloc:
        # Without a scheme urlparse puts the host into the path; re-parse as a
        # network-path reference so "youtu.be/<id>" still yields a hostname.
        parts = urlparse("//" + url.lstrip("/"))

    host = (parts.hostname or "").lower()
    if host.startswith("www."):
        host = host[4:]
    segments = [segment for segment in parts.path.split("/") if segment]

    is_youtube_host = host in ("youtube.com", "youtube-nocookie.com") or host.endswith(
        (".youtube.com", ".youtube-nocookie.com")
    )

    # The query parameter wins, mirroring the original precedence where the
    # "watch?v=" form was checked before the short-link form. As before, the
    # host is not validated for this form.
    query_ids = parse_qs(parts.query).get("v")
    if query_ids:
        video_id = query_ids[0]
    elif host == "youtu.be":
        video_id = segments[0] if segments else ""
    elif is_youtube_host and len(segments) >= 2 and segments[0] in (
        "embed",
        "shorts",
        "live",
        "v",
    ):
        video_id = segments[1]
    else:
        video_id = ""

    if not video_id:
        raise ValueError("Invalid YouTube URL")
    return video_id

def get_caption_tracks(video_id):
    """List the caption tracks the YouTube Data API reports for a video.

    Args:
        video_id: ID of the video whose caption tracks are requested.

    Returns:
        A list with one dict per track, holding the keys ``"id"``,
        ``"language"`` and ``"name"``. ``"name"`` falls back to
        ``"Unknown"`` when the track's snippet carries no name. The list is
        empty when the response has no ``"items"``.
    """
    client = build("youtube", "v3", developerKey=API_KEY)
    request = client.captions().list(part="snippet", videoId=video_id)
    payload = request.execute()

    tracks = []
    for entry in payload.get("items", []):
        track_id = entry["id"]
        info = entry["snippet"]
        tracks.append(
            {
                "id": track_id,
                "language": info["language"],
                "name": info.get("name", "Unknown"),
            }
        )
    return tracks

def get_captions_json(video_id, lang="en"):
    """Fetch the transcript of a video in the requested language.

    Args:
        video_id: ID of the video to fetch the transcript for.
        lang: Language code passed to the transcript API (default ``"en"``).

    Returns:
        Whatever ``YouTubeTranscriptApi.get_transcript`` returns on success
        (kept as a Python object; the caller serialises it to JSON). On any
        failure, a dict ``{"error": <exception message>}`` instead.
    """
    try:
        result = YouTubeTranscriptApi.get_transcript(video_id, languages=[lang])
    except Exception as exc:
        # Best-effort by design: the failure is reported in-band so the caller
        # can still write the remaining data to disk.
        result = {"error": str(exc)}
    return result

def get_video_details(video_id):
    """Fetch title, description, thumbnails, caption flag and embed HTML.

    Args:
        video_id: ID of the video to look up.

    Returns:
        A dict with the keys ``"video_id"``, ``"title"``, ``"description"``,
        ``"thumbnails"``, ``"caption_status"`` and ``"embed_html"``. Fields
        missing from the API response fall back to ``""`` (``{}`` for
        thumbnails, ``"false"`` for the caption status).

    Raises:
        ValueError: If the API response contains no item for ``video_id``.
    """
    client = build("youtube", "v3", developerKey=API_KEY)
    # "status" is requested as well, although nothing below reads it.
    request = client.videos().list(
        part="snippet,contentDetails,player,status",
        id=video_id,
    )
    results = request.execute().get("items", [])
    if not results:
        raise ValueError(f"No video found with ID: {video_id}")

    video = results[0]
    snippet = video.get("snippet", {})
    content_details = video.get("contentDetails", {})
    player = video.get("player", {})

    return {
        "video_id": video_id,
        "title": snippet.get("title", ""),
        "description": snippet.get("description", ""),
        "thumbnails": snippet.get("thumbnails", {}),
        # Passed through as received; defaults to the string "false".
        "caption_status": content_details.get("caption", "false"),
        "embed_html": player.get("embedHtml", ""),
    }

def sanitize_filename(name):
    """Reduce arbitrary text to characters that are safe in a filename.

    Every run of whitespace collapses into a single underscore; after that,
    anything that is not a word character (letter, digit, underscore) or a
    hyphen is dropped.
    """
    underscored = re.sub(r'\s+', '_', name)
    return re.sub(r'[^\w\-]', '', underscored)

if __name__ == "__main__":
    # Inputs: caption language to request and the video to process
    # (replace the URL with the one you want).
    language = "en"
    youtube_url = "https://www.youtube.com/watch?v=I_Vhk_kNmQU"
    video_id = get_video_id(youtube_url)

    # Three lookups, in this order: metadata, transcript, caption-track list.
    video_details = get_video_details(video_id)
    captions = get_captions_json(video_id, lang=language)
    caption_tracks = get_caption_tracks(video_id)

    # Everything that ends up in the output file.
    combined_result = {
        "video_details": video_details,
        "captions": captions,
        "available_caption_tracks": caption_tracks,
    }

    # Output name is <videoID>_<sanitized title>_<language>.json under ../data
    sanitized_title = sanitize_filename(video_details.get("title", "video"))
    filename = f"{video_id}_{sanitized_title}_{language}.json"
    filepath = os.path.join("..", "data", filename)

    # Create ../data on first use, then write the result as UTF-8 JSON with
    # non-ASCII characters kept as-is.
    os.makedirs(os.path.dirname(filepath), exist_ok=True)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(combined_result, f, indent=2, ensure_ascii=False)

    print(f"Data saved to {filepath}")


Data saved to ..\data\I_Vhk_kNmQU_Henri_Stern_I_Server_Wallets_With_Privy_I_Agentic_Ethereum_2025_en.json
