<a href="https://colab.research.google.com/github/akshayonly/youtube-to-notebooklm/blob/main/YouTube_Transcript_Retrival.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Packages

In [1]:
# Install into the running kernel's environment (%pip, not a bare shell pip).
# youtube-transcript-api is pinned to the release this notebook was last run
# with: the static get_transcript/list_transcripts helpers used below are
# deprecated in the 1.x line, so an unpinned install may pull an incompatible
# version. yt-dlp only has a lower bound because it must track YouTube changes.
%pip install -q "youtube-transcript-api==1.1.0" "yt-dlp>=2025.6.9"

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.1.0-py3-none-any.whl.metadata (23 kB)
Collecting yt-dlp
  Downloading yt_dlp-2025.6.9-py3-none-any.whl.metadata (174 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m174.3/174.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
Downloading youtube_transcript_api-1.1.0-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.7/485.7 kB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading yt_dlp-2025.6.9-py3-none-any.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m36.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: yt-dlp, youtube-transcript-api
Successfully installed youtube-transcript-api-1.1.0 yt-dlp-2025.6.9


# Main

In [2]:
import subprocess, json, re
import pandas as pd
from youtube_transcript_api import (
    YouTubeTranscriptApi, TranscriptsDisabled,
    NoTranscriptFound, VideoUnavailable
)

# Fetch all video URLs from a YouTube playlist
def get_playlist_urls(playlist_url):
    """Return the watch URLs of every video in a YouTube playlist.

    Runs ``yt-dlp --flat-playlist --dump-json`` and parses its stdout as one
    JSON object per line, building a watch URL from each entry's ``id``.

    Args:
        playlist_url: URL of the playlist to enumerate.

    Returns:
        A list of ``https://www.youtube.com/watch?v=<id>`` strings in the
        order yt-dlp printed them. The list is empty when yt-dlp exits with
        an error, cannot be launched, or prints no usable entries.
    """
    try:
        result = subprocess.run(
            ['yt-dlp', '--flat-playlist', '--dump-json', playlist_url],
            capture_output=True, text=True, check=True
        )
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers a missing yt-dlp executable (FileNotFoundError).
        print("yt-dlp error:", e)
        return []

    urls = []
    for line in result.stdout.splitlines():
        line = line.strip()
        if not line:
            # Empty output or stray blank lines must not reach json.loads.
            continue
        try:
            entry = json.loads(line)
        except json.JSONDecodeError as e:
            print("Skipping unparseable yt-dlp output line:", e)
            continue
        video_id = entry.get('id') if isinstance(entry, dict) else None
        if video_id:
            urls.append(f"https://www.youtube.com/watch?v={video_id}")
    return urls

# Get playlist title
def get_playlist_title(playlist_url):
    """Return the title of a YouTube playlist.

    Runs ``yt-dlp --flat-playlist --dump-single-json`` and reads the ``title``
    field of the JSON document it prints.

    Args:
        playlist_url: URL of the playlist to look up.

    Returns:
        The playlist title, or ``"Unknown Playlist"`` when yt-dlp fails, its
        output cannot be parsed, or the title is missing, null or blank.
    """
    try:
        result = subprocess.run(
            ['yt-dlp', '--flat-playlist', '--dump-single-json', playlist_url],
            capture_output=True, text=True, check=True
        )
        title = json.loads(result.stdout).get('title')
    except Exception as e:
        print(f"Failed to fetch playlist title: {e}")
        return "Unknown Playlist"

    # A JSON null title would otherwise be returned as None and break callers
    # that treat the result as a string.
    if isinstance(title, str) and title.strip():
        return title
    return "Unknown Playlist"

# Extract video ID from URL
def extract_video_id(url):
    """Pull the 11-character video ID out of a YouTube URL.

    The ID is the first run of exactly 11 characters from ``[0-9A-Za-z_-]``
    that directly follows either ``v=`` or a ``/`` in ``url``.

    Args:
        url: The URL string to search.

    Returns:
        The 11-character ID string.

    Raises:
        ValueError: If no such run is found in ``url``.
    """
    id_pattern = re.compile(r'(?:v=|\/)([0-9A-Za-z_-]{11})')
    found = id_pattern.search(url)
    if found is not None:
        return found.group(1)
    raise ValueError(f"Invalid YouTube URL: {url}")

# Get video title
def get_video_title(url):
    """Return the title of a single YouTube video.

    Runs ``yt-dlp --skip-download --print-json`` and reads the ``title`` field
    of the JSON it prints.

    Args:
        url: URL of the video to look up.

    Returns:
        The video title, or ``"Unknown Title"`` when yt-dlp fails, its output
        cannot be parsed, or the title is missing, null or blank.
    """
    try:
        result = subprocess.run(
            ['yt-dlp', '--skip-download', '--print-json', url],
            capture_output=True, text=True, check=True
        )
        title = json.loads(result.stdout).get('title')
    except Exception as e:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still stop a long playlist run.
        print(f"Failed to fetch video title: {e}")
        return "Unknown Title"

    # A JSON null title would otherwise be returned as None.
    if isinstance(title, str) and title.strip():
        return title
    return "Unknown Title"

# Clean unwanted annotations ([Music], [Applause], etc.)
def clean_transcript(text):
    """Strip bracketed annotations such as ``[Music]`` from transcript text.

    Every ``[...]`` span (non-greedy, within a single line) is removed, runs
    of spaces left behind are collapsed to one space, and leading/trailing
    whitespace is trimmed.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned text.
    """
    without_annotations = re.sub(r'\[.*?\]', '', text)
    # Removing adjacent annotations can leave three or more spaces in a row,
    # which a single '  ' -> ' ' replacement pass does not fully collapse.
    return re.sub(r' {2,}', ' ', without_annotations).strip()

# Get video transcript with annotations removed
def get_transcript_text(video_id):
    """Fetch a video's transcript and return it as one cleaned string.

    First asks ``YouTubeTranscriptApi.get_transcript`` for the transcript
    using the library's default language selection. If that raises
    ``NoTranscriptFound``, falls back to whichever listed transcript
    ``find_transcript`` picks when given every available language code.

    Args:
        video_id: The YouTube video ID.

    Returns:
        The non-empty transcript segments joined with single spaces and
        passed through ``clean_transcript``.

    Raises:
        RuntimeError: If transcripts are disabled or the video is
            unavailable, on either the primary or the fallback path.
    """
    try:
        try:
            raw_transcript = YouTubeTranscriptApi.get_transcript(video_id)
        except NoTranscriptFound:
            transcripts = YouTubeTranscriptApi.list_transcripts(video_id)
            raw_transcript = transcripts.find_transcript(
                [t.language_code for t in transcripts]
            ).fetch()
    except (TranscriptsDisabled, VideoUnavailable) as exc:
        # The outer try also covers errors raised by the fallback lookup.
        raise RuntimeError("Transcript disabled or video unavailable.") from exc

    # Depending on the library version, fetch() returns either a list of
    # dicts or a transcript object whose items are not subscriptable;
    # to_raw_data() converts the latter to the list-of-dicts form used below.
    if hasattr(raw_transcript, 'to_raw_data'):
        raw_transcript = raw_transcript.to_raw_data()

    segments = (entry['text'].strip() for entry in raw_transcript)
    transcript_text = ' '.join(segment for segment in segments if segment)

    return clean_transcript(transcript_text)

# Process single video
def process_video(url):
    """Collect the ID, title and transcript for one video URL.

    Args:
        url: URL of the video to process.

    Returns:
        A dict with keys ``video_id``, ``title`` and ``transcript``, or
        ``None`` if any step raised; in that case the URL and the reason are
        printed instead of propagating the exception.
    """
    try:
        vid = extract_video_id(url)
        video_title = get_video_title(url)
        text = get_transcript_text(vid)
        print(f"Fetched: {video_title}")
    except Exception as err:
        print(f"Skipped: {url} | Reason: {err}")
        return None
    return dict(video_id=vid, title=video_title, transcript=text)

# Main pipeline
def main(playlist_url):
    """Download every transcript in a playlist and save them to a text file.

    Looks up the playlist title and video URLs, processes each video with
    ``process_video`` (videos that fail are skipped), then writes one text
    file in the current working directory containing the playlist title
    followed by each video's title and transcript.

    Args:
        playlist_url: URL of the YouTube playlist.

    Returns:
        A pandas DataFrame with one row per successfully processed video
        (the ``process_video`` fields plus ``playlist_title``).
    """
    print(f"Processing playlist: {playlist_url}")

    playlist_title = get_playlist_title(playlist_url)
    video_urls = get_playlist_urls(playlist_url)

    print(f"Found {len(video_urls)} videos in playlist: {playlist_title}\n")

    results = []
    for url in video_urls:
        data = process_video(url)
        if data:
            data['playlist_title'] = playlist_title
            results.append(data)

    df = pd.DataFrame(results)
    print(f"\nCompleted transcripts extraction for {len(df)} videos.")

    # Playlist titles are free text: replace path separators and other
    # characters that are invalid in file names so the title cannot break
    # open() or point outside the working directory.
    safe_title = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(playlist_title))
    stem = safe_title.replace(' ', '_').lower() or 'playlist'
    filename = f"{stem}.txt"

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"Playlist Title: {playlist_title}\n\n")
        # Write straight from the collected records; no need to round-trip
        # through DataFrame.iterrows().
        for item in results:
            title = item.get('title') or 'Unknown Title'
            transcript = item.get('transcript') or ''
            f.write(title.strip() + "\n")
            f.write(transcript.strip() + "\n\n")

    print(f"Transcript file saved as: {filename}")
    return df

# Run pipeline: process every video in the playlist below. main() writes the
# combined transcripts to a text file named after the playlist and returns a
# DataFrame with one row per video, kept here as df_transcripts.
playlist_url = "https://youtube.com/playlist?list=PLctGJos87iQ1-dLNljF-kzTCQgTHWANWc&si=bhlc0Kt_-ApeKwIa"
df_transcripts = main(playlist_url)


Processing playlist: https://youtube.com/playlist?list=PLctGJos87iQ1-dLNljF-kzTCQgTHWANWc&si=bhlc0Kt_-ApeKwIa
Found 7 videos in playlist: Bulk RNAseq analysis

Fetched: Master Bioinformatics RNAseq Analysis from Scratch: A Beginner's Guide
Fetched: How to make a perfect looking heatmap using RNAseq data
Fetched: How to make a perfect looking volcano plot using RNAseq data
Fetched: Gene Set Enrichment Analysis using RNA Sequencing Data 🧬
Fetched: How to preprocess GEO bulk RNAseq fastq file with salmon
Fetched: From Salmon to DESeq2: RNAseq Data Analysis
Fetched: How to Merge Multiple gene counts Files in R Easily

Completed transcripts extraction for 7 videos.
Transcript file saved as: bulk_rnaseq_analysis.txt


In [4]:
# Preview the first lines of the transcript file saved by the pipeline run.
!head bulk_rnaseq_analysis.txt

Playlist Title: Bulk RNAseq analysis

Master Bioinformatics RNAseq Analysis from Scratch: A Beginner's Guide
hello this is Tommy welcome back to Chad omics so today I'm going to talk about a beginnner b informatics guide for on sequencing analysis in the end I'm going to share with you a course link for for AR sequencing data analysis so you can have some handson uh exercise make sure you stick to the end so first of all what is uh an sequencing uh so by the way if you don't uh know what is an sequencing I highly recommend this video by Josh stammer a gentle introduction on an sequencing so according to the so I also bored the slid from his video so according to the name an sequencing is sequencing the an it's a technology to measure gene expression level uh for all the genes in the genome in in the human genome we have 20,000 genes so it can measure or sequence all genes to ex and to measure the Gen expression level so in this example we have two groups of cells uh bunch of uh normal 