# CELL 0: Introduction
# YouTube Subtitle Downloader

This notebook downloads existing subtitles from YouTube videos and saves them to a CSV file.


In [None]:
# CELL 1: Install and Import Libraries
# Install required package (for Google Colab)
!pip install --upgrade youtube-transcript-api

# Import required libraries
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
import re
from datetime import datetime

# Report that the imports succeeded and record which release of the transcript
# client is actually installed, so the run log shows what produced the results.
from importlib.metadata import PackageNotFoundError, version

try:
    api_version = version("youtube-transcript-api")
except PackageNotFoundError:
    # Package metadata is unavailable (e.g. a vendored copy); keep the cell running.
    api_version = "unknown"

print("Libraries imported successfully!")
print(f"Current time: {datetime.now()}")
print(f"YouTube Transcript API version {api_version} installed and ready!")


In [None]:
# CELL 2: Define YouTube URLs
# Videos whose subtitles will be downloaded, given as full watch URLs.
# Several entries carry a trailing `&t=<seconds>s` start-time parameter; the ID
# extraction step later in the notebook only reads the value after `v=`.
youtube_urls = [
    "https://www.youtube.com/watch?v=Es0XHjoFO58",
    "https://www.youtube.com/watch?v=DYqSMjD3VC8",
    "https://www.youtube.com/watch?v=72ibffmv87s",
    "https://www.youtube.com/watch?v=7rqcJN0T_Wg&t=1537s",
    "https://www.youtube.com/watch?v=BmGWBoC7STE&t=232s",
    "https://www.youtube.com/watch?v=U8ssRy95nn0&t=1731s",
    "https://www.youtube.com/watch?v=8ODa_PYw_qc&t=1150s",
    "https://www.youtube.com/watch?v=K1jxCfbCX7Q&t=1803s",
    "https://www.youtube.com/watch?v=i6AeL5oGau4",
    "https://www.youtube.com/watch?v=sQGRfX7sve8"
]

# Report how many videos the download loop will attempt.
print(f"Total videos to process: {len(youtube_urls)}")


In [None]:
# CELL 3: Extract Video IDs
# Extract video IDs from URLs
def extract_video_id(url):
    """
    Extract the 11-character video ID from a YouTube URL.

    Recognised shapes (the scheme is optional and extra query parameters such
    as ``&t=1537s`` are ignored):

    * ``https://www.youtube.com/watch?v=<id>`` -- ``v`` may appear anywhere
      in the query string
    * ``https://youtu.be/<id>``
    * ``.../embed/<id>``, ``.../shorts/<id>``, ``.../live/<id>``, ``.../v/<id>``

    Args:
        url: The URL to inspect.

    Returns:
        The video ID as a string, or None when ``url`` is not a non-empty
        string or no well-formed ID can be located in it.
    """
    # Imported locally so the function stays self-contained within this cell.
    from urllib.parse import parse_qs, urlparse

    if not isinstance(url, str) or not url.strip():
        return None

    candidate = url.strip()
    # urlparse only identifies the host when a scheme is present.
    if '://' not in candidate:
        candidate = 'https://' + candidate.lstrip('/')

    try:
        parsed = urlparse(candidate)
        host = (parsed.hostname or '').lower()
    except ValueError:
        # Malformed authority section (e.g. unbalanced IPv6 brackets).
        return None

    video_id_pattern = r'[0-9A-Za-z_-]{11}'

    # Watch-style URLs carry the ID in the `v` query parameter.
    query_ids = parse_qs(parsed.query).get('v', [])

    # Short links and embed/shorts/live URLs carry the ID in a fixed path
    # position. Only those positions are considered, so host names and
    # unrelated path segments (channel or user names) are never mistaken
    # for an ID.
    path_segments = [segment for segment in parsed.path.split('/') if segment]
    if host in ('youtu.be', 'www.youtu.be'):
        path_ids = path_segments[:1]
    elif len(path_segments) >= 2 and path_segments[0] in ('embed', 'shorts', 'live', 'v'):
        path_ids = path_segments[1:2]
    else:
        path_ids = []

    for candidate_id in query_ids + path_ids:
        if re.fullmatch(video_id_pattern, candidate_id):
            return candidate_id
    return None

# Smoke-test the extractor on the first configured URL and show the ID it yields
# (this prints "None" if no ID could be extracted from that URL).
test_id = extract_video_id(youtube_urls[0])
print(f"Example video ID: {test_id}")


In [None]:
# CELL 4: Download Subtitles
# Download subtitles for all videos

# Language codes tried first (any transcript type), and the codes tried for
# auto-generated transcripts when the first lookup or its download fails.
PREFERRED_LANGUAGES = ['en']
GENERATED_FALLBACK_LANGUAGES = ['en', 'en-US', 'en-GB']


def _fetch_transcript(api, video_id):
    """
    Download one video's transcript, preferring an English transcript.

    Args:
        api: A ``YouTubeTranscriptApi`` instance.
        video_id: The 11-character YouTube video ID.

    Returns:
        The fetched transcript: an iterable of caption segments.

    Raises:
        Exception: Whatever the transcript library raises when the video has
            no usable transcript or the request fails.
    """
    # Listing is done once; both lookups below reuse the same listing.
    transcript_list = api.list(video_id)
    try:
        return transcript_list.find_transcript(PREFERRED_LANGUAGES).fetch()
    except Exception:
        # No plain-English transcript could be fetched: fall back to an
        # auto-generated English variant. `except Exception` (not a bare
        # `except`) lets KeyboardInterrupt stop the notebook.
        return transcript_list.find_generated_transcript(GENERATED_FALLBACK_LANGUAGES).fetch()


def _segment_text(segment):
    """Return the caption text of one segment, whether it is a dict or a snippet object."""
    return segment['text'] if isinstance(segment, dict) else segment.text


results = []
success_count = 0

# `list` is an instance method in youtube-transcript-api 1.x, so an instance
# is created once and shared across all videos.
transcript_api = YouTubeTranscriptApi()

# Show which public methods the installed client exposes
print("Checking YouTubeTranscriptApi methods...")
print(f"Available methods: {[m for m in dir(YouTubeTranscriptApi) if not m.startswith('_')][:10]}")
print()

for i, url in enumerate(youtube_urls, 1):
    print(f"\nProcessing video {i}/{len(youtube_urls)}: {url}")

    try:
        # Extract video ID
        video_id = extract_video_id(url)

        if not video_id:
            print("  ❌ Could not extract video ID from URL")
            results.append({
                'youtube_url': url,
                'subtitle_text': 'ERROR: Invalid URL format'
            })
            continue

        print(f"  Video ID: {video_id}")

        # Materialise the segments so they can be counted and joined regardless
        # of the container type the library returns.
        transcript_segments = list(_fetch_transcript(transcript_api, video_id))

        # Concatenate all subtitle text
        subtitle_text = ' '.join(_segment_text(segment) for segment in transcript_segments)

        print(f"  ✓ Successfully downloaded {len(transcript_segments)} subtitle segments")
        print(f"  ✓ Total characters: {len(subtitle_text)}")

        results.append({
            'youtube_url': url,
            'subtitle_text': subtitle_text
        })
        success_count += 1

    except Exception as e:
        # Record the failure in the output row and carry on with the next video.
        print(f"  ❌ Error: {str(e)}")
        print(f"  Error type: {type(e).__name__}")
        results.append({
            'youtube_url': url,
            'subtitle_text': f'ERROR: {str(e)}'
        })

print(f"\n{'='*60}")
# Successes are counted explicitly rather than inferred from the text prefix, so
# a genuine transcript that happens to start with "ERROR" is not miscounted.
print(f"Processing complete! Successfully processed {success_count} out of {len(youtube_urls)} videos.")


In [None]:
# CELL 5: Create DataFrame
# Create DataFrame
# The columns are named explicitly so the frame (and the CSV written from it)
# keeps the expected two-column schema even when `results` is empty.
RESULT_COLUMNS = ['youtube_url', 'subtitle_text']
df = pd.DataFrame(results, columns=RESULT_COLUMNS)

# Display summary
print("DataFrame created!")
print(f"Shape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())


In [None]:
# CELL 6: Preview Subtitle Text
# Show a short excerpt (at most 200 characters) of every row's subtitle text
print("Preview of subtitle texts:\n")
for idx, row in df.iterrows():
    print(f"Video {idx + 1}:")
    print(f"  URL: {row['youtube_url']}")
    # Long texts are cut at 200 characters and marked with an ellipsis;
    # shorter ones are shown unchanged.
    if len(row['subtitle_text']) > 200:
        text_preview = row['subtitle_text'][:200] + '...'
    else:
        text_preview = row['subtitle_text']
    print(f"  Text: {text_preview}")
    print()


In [None]:
# CELL 7: Save to CSV File
# Save to CSV
import os

# Write into ../data when that directory exists; otherwise (e.g. on Google Colab,
# where it is absent) fall back to the current working directory. The directory
# is never created here. isdir (rather than exists) ensures a regular file that
# happens to be named "data" is not mistaken for the target directory.
data_dir = '../data'
output_filename = 'youtube_subtitles.csv'
if os.path.isdir(data_dir):
    output_path = os.path.join(data_dir, output_filename)
else:
    output_path = output_filename
    print("Note: Saving to current directory (Colab compatible)")

# index=False keeps the row index out of the file.
df.to_csv(output_path, index=False)

print("✓ CSV file saved successfully!")
print(f"  Location: {output_path}")
print(f"  Total rows: {len(df)}")
print(f"  Total columns: {len(df.columns)}")
print("\nYou can download this file from the file browser in Colab!")
