In [None]:
import os
from dotenv import load_dotenv

# Load variables from .env into environment
load_dotenv()

# Read 


YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
PROXY_USERNAME = os.getenv("PROXY_USERNAME")
PROXY_PASSWORD = os.getenv("PROXY_PASSWORD")
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")


# # Example use
# print(f"API Key: {YOUTUBE_API_KEY}")
# print(f"Database: {PROXY_USERNAME}")
# print(f"Database: {PROXY_PASSWORD }")
# print(f"Database: {GEMINI_API_KEY }")

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.proxies import WebshareProxyConfig
from youtube_transcript_api.formatters import JSONFormatter

ytt_api = YouTubeTranscriptApi(
    proxy_config=WebshareProxyConfig(
        proxy_username=PROXY_USERNAME,
        proxy_password=PROXY_PASSWORD
    )
)

# all requests done by ytt_api will now be proxied through Webshare
transcript=ytt_api.fetch("JP0sS7f0_oQ") # "JP0sS7f0_oQ" is the video-id here

formatter = JSONFormatter()

# format_transcript(transcript) turns the transcript into a JSON string.
json_formatted = formatter.format_transcript(transcript, indent=2)

# Now we can write it out to a file.
with open('your_filename.json', 'w', encoding='utf-8') as json_file:
    json_file.write(json_formatted)

In [5]:
# transcript
json_formatted


'[\n  {\n    "text": "remember",\n    "start": 1.68,\n    "duration": 2.78\n  },\n  {\n    "text": "hi everyone",\n    "start": 25.76,\n    "duration": 6.48\n  },\n  {\n    "text": "so now I will",\n    "start": 28.859,\n    "duration": 3.381\n  },\n  {\n    "text": "just made an announcement so you have",\n    "start": 33.84,\n    "duration": 4.46\n  },\n  {\n    "text": "the links",\n    "start": 35.94,\n    "duration": 6.18\n  },\n  {\n    "text": "and now I will open slider with",\n    "start": 38.3,\n    "duration": 6.06\n  },\n  {\n    "text": "questions",\n    "start": 42.12,\n    "duration": 2.24\n  },\n  {\n    "text": "so I noticed that last time I",\n    "start": 53.399,\n    "duration": 5.101\n  },\n  {\n    "text": "accidentally didn\'t cover some of the",\n    "start": 56.219,\n    "duration": 4.621\n  },\n  {\n    "text": "questions so maybe they came in last",\n    "start": 58.5,\n    "duration": 4.859\n  },\n  {\n    "text": "minute before I noticed them so that\'s",\n

In [None]:
import os
import json
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
from youtube_transcript_api.proxies import WebshareProxyConfig # New import for proxy config
import re
import sys
import datetime # For formatting timestamps

# --- Configuration ---
# Directory to save the transcript JSON files
OUTPUT_DIR = "zoomcamp_transcripts"

# DataTalks.Club Zoomcamp Playlist URLs
ZOOMCAMP_PLAYLIST_URLS = [
 "https://www.youtube.com/playlist?list=PL3MmuxUbc_hIoBpuc900htYF4uhEAbaT-",  # LLM Zoomcamp 2025
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hKiIVNf7DeEt_tGjypOYtKV", # LLM Zoomcamp 2024
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hIB4fSqLy_0AfTjVLpgjV3R", #LLM zoomcamp
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hLDZ8j0yyeX14N7fGfV4ovC",  # MLOps Zoomcamp 2025
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hJD0AVR2Un_GSVGMpotGM2t", # MLOPS zoomcamp 2024
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hKqamJqQ7Ew8HxptJYnXqQM", # MLOps Zoomcamp 2023
    "http://www.youtube.com/playlist?list=PL3MmuxUbc_hLG1MoGNxJ9DmQSSM2bEdQT",  # MLOps Zoomcamp 2022
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hIUISrluw_A7wDSmfOhErJK", # MLOps Zoomcamp  
    "http://www.youtube.com/playlist?list=PL3MmuxUbc_hJZdpLpRHp7dg6EOx828q6y",  # Data Engineering Zoomcamp 2025
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hKihpnNQ9qtTmWYy26bPrSb", # Data Engineering Zoomcamp 2024
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hJjEePXIdE-LVUx_1ZZjYGW", # Data Engineering Zoomcamp 2023
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hKVX8VnwWCPaWlIHf1qmg8s", # Data Engineering Zoomcamp 2022
    "http://www.youtube.com/playlist?list=PL3MmuxUbc_hJed7dXYoJw8DoCuVHhGEQb" , # Data Engineering Zoomcamp 
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hJoui-E7wf2r5wWgET3MMZt",  # Machine Learning Zoomcamp 2024
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hJo_PzMibLDcEGyazxYAtV0", # Machine Learning Zoomcamp 2023
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hL5QBBEyKUXKuTNx-3cTpKs", # Machine Learning Zoomcamp 2022
    "https://www.youtube.com/playlist?list=PL3MmuxUbc_hL4Gx-wzOJMT4q1K-cArKqu", # Machine Learning Zoomcamp 2021
    "http://www.youtube.com/playlist?list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR"  # Machine Learning Zoomcamp
]

# --- YouTube Data API Configuration ---
YOUTUBE_API_SERVICE_NAME = "youtube"
YOUTUBE_API_VERSION = "v3"

youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION, developerKey=YOUTUBE_API_KEY)

# --- Proxy Configuration for youtube-transcript-api ---

# Initialize YouTubeTranscriptApi with proxy config
# This instance will be reused for all transcript fetches
ytt_api = YouTubeTranscriptApi(
    proxy_config=WebshareProxyConfig(
        proxy_username=PROXY_USERNAME,
        proxy_password=PROXY_PASSWORD
    )
)

# --- Helper Functions ---

def clean_filename(title):
    """Cleans a string to be suitable for a filename."""
    s = str(title).strip().replace(' ', '_')
    s = re.sub(r'(?u)[^-\w.]', '', s)
    return s

def format_timestamp(seconds):
    """Formats seconds into HH:MM:SS string."""
    return str(datetime.timedelta(seconds=int(seconds)))



In [7]:
import os
import json
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
from youtube_transcript_api.proxies import WebshareProxyConfig # New import for proxy config
import re
import sys
import datetime # For formatting timestamps
def get_playlist_id_from_url(url):
    """Extracts playlist ID from a YouTube playlist URL."""
    match = re.search(r'list=([a-zA-Z0-9_-]+)', url)
    if match:
        return match.group(1)
    return None


get_playlist_id_from_url("http://www.youtube.com/playlist?list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR")

'PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR'

In [8]:

def get_video_urls_from_playlist(playlist_url):
    """
    Retrieves all video URLs from a given YouTube playlist URL using YouTube Data API.
    """
    playlist_id = get_playlist_id_from_url(playlist_url)
    if not playlist_id:
        print(f"  Error: Could not extract playlist ID from URL: {playlist_url}. Skipping.")
        return []

    video_urls = []
    next_page_token = None

    print(f"  Fetching video URLs from playlist ID: '{playlist_id}' using YouTube Data API...")
    try:
        while True:
            playlist_items_request = youtube.playlistItems().list(
                part="snippet",
                playlistId=playlist_id,
                maxResults=50,
                pageToken=next_page_token
            )
            playlist_items_response = playlist_items_request.execute()

            for item in playlist_items_response.get("items", []):
                video_id = item["snippet"]["resourceId"]["videoId"]
                video_urls.append(f"https://www.youtube.com/watch?v={video_id}")
            
            next_page_token = playlist_items_response.get("nextPageToken")
            if not next_page_token:
                break

        print(f"  Successfully fetched {len(video_urls)} video URLs from playlist.")
        return video_urls

    except Exception as e:
        print(f"  Error fetching playlist {playlist_url} with YouTube Data API: {e}")
        print("  Please ensure your YOUTUBE_API_KEY is correct and YouTube Data API v3 is enabled for your project.")
        return []
get_video_urls_from_playlist("http://www.youtube.com/playlist?list=PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR")

  Fetching video URLs from playlist ID: 'PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR' using YouTube Data API...
  Successfully fetched 107 video URLs from playlist.


['https://www.youtube.com/watch?v=8wuR_Oz-to0',
 'https://www.youtube.com/watch?v=Crm_5n4mvmg',
 'https://www.youtube.com/watch?v=CeukwyUdaz8',
 'https://www.youtube.com/watch?v=j9kcEuGcC2Y',
 'https://www.youtube.com/watch?v=dCa3JvmJbr0',
 'https://www.youtube.com/watch?v=OH_R0Sl9neM',
 'https://www.youtube.com/watch?v=pqQFlV3f9Bo',
 'https://www.youtube.com/watch?v=Qa0-jYtRdbY',
 'https://www.youtube.com/watch?v=zZyKUeOR4Gg',
 'https://www.youtube.com/watch?v=0j3XK5PsnxA',
 'https://www.youtube.com/watch?v=VRrEEVeJ440',
 'https://www.youtube.com/watch?v=vM3SqPNlStE',
 'https://www.youtube.com/watch?v=Kd74oR4QWGM',
 'https://www.youtube.com/watch?v=k6k8sQ0GhPM',
 'https://www.youtube.com/watch?v=ck0IfiPaQi0',
 'https://www.youtube.com/watch?v=Dn1eTQLsOdA',
 'https://www.youtube.com/watch?v=YkyevnYyAww',
 'https://www.youtube.com/watch?v=hx6nak-Y11g',
 'https://www.youtube.com/watch?v=SvPpMMYtYbU',
 'https://www.youtube.com/watch?v=0LWoFtbzNUM',
 'https://www.youtube.com/watch?v=rawGPX

In [9]:

def get_playlist_title_from_id(playlist_id):
    """Retrieves playlist title using YouTube Data API."""
    try:
        playlist_request = youtube.playlists().list(
            part="snippet",
            id=playlist_id
        )
        playlist_response = playlist_request.execute()
        if playlist_response.get("items"):
            return playlist_response["items"][0]["snippet"]["title"]
    except Exception as e:
        print(f"Error getting playlist title for ID {playlist_id}: {e}. Using default title.")
    return "Unknown Playlist"
get_playlist_title_from_id("PL3MmuxUbc_hIhxl5Ji8t4O6lPAOpHaCLR")

'Machine Learning Zoomcamp'

In [2]:
import json

with open("zoomcamp_transcripts/all_zoomcamp_metadata_with_segmented_transcripts.json", "r") as f:
    transcript_data = json.load(f)


In [3]:
len(transcript_data)

258849

In [4]:
len(transcript_data[0])

9

In [5]:
transcript_data[0]

{'video_id': 'GH3lrOsU3AU',
 'video_title': 'How to Build Agentic\u202fRAG Pipelines with OpenAI Function\u202fCalling? - LLM\u202fZoomcamp Bonus Module',
 'video_url': 'https://www.youtube.com/watch?v=GH3lrOsU3AU',
 'playlist_id': 'PL3MmuxUbc_hIoBpuc900htYF4uhEAbaT-',
 'playlist_title': 'LLM Zoomcamp 2025',
 'text': 'Hi everyone, welcome to our bonus module',
 'start': 0.0,
 'duration': 5.839,
 'end': 5.839}

In [6]:
from collections import defaultdict

# Group segments by video
grouped_by_video = defaultdict(list)
for entry in transcript_data:
    grouped_by_video[entry["video_id"]].append(entry)

In [9]:
len(grouped_by_video[0])

0

Step 1: Chunk Transcript by Time (~30 seconds)
Here’s a custom implementation using Python with LangChain-style preprocessing, but it respects your start and end times:

In [14]:
import json

# Load your transcript JSON
with open("zoomcamp_transcripts/all_zoomcamp_metadata_with_segmented_transcripts.json", "r") as f:
    transcript = json.load(f)

# Step 1: Group transcript into ~30s chunks
def group_transcript_by_time(transcript, window=30):
    chunks = []
    current_chunk = None
    current_end = None

    for entry in transcript:
        start = entry.get("start")
        end = entry.get("end")

        # Skip bad entries
        if start is None or end is None:
            continue

        if current_chunk is None:
            current_chunk = {
                "video_id": entry["video_id"],
                "video_url": entry["video_url"],
                "start_time": start,
                "text": [entry["text"]],
            }
            current_end = end
            continue

        if start - current_chunk["start_time"] < window:
            current_chunk["text"].append(entry["text"])
            current_end = end
        else:
            # Finalize current chunk
            current_chunk["end_time"] = current_end
            current_chunk["text"] = " ".join(current_chunk["text"])
            chunks.append(current_chunk)

            # Start new chunk
            current_chunk = {
                "video_id": entry["video_id"],
                "video_url": entry["video_url"],
                "start_time": start,
                "text": [entry["text"]],
            }
            current_end = end

    # Final chunk
    if current_chunk:
        current_chunk["end_time"] = current_end
        current_chunk["text"] = " ".join(current_chunk["text"])
        chunks.append(current_chunk)

    return chunks

chunks = group_transcript_by_time(transcript, window=30)
print(f"✅ Chunks created: {len(chunks)}")
print("🧩 Example chunk:", chunks[0])


✅ Chunks created: 197
🧩 Example chunk: {'video_id': 'GH3lrOsU3AU', 'video_url': 'https://www.youtube.com/watch?v=GH3lrOsU3AU', 'start_time': 0.0, 'text': 'Hi everyone, welcome to our bonus module of LLM Zoom camp. In this bonus module, we are going to talk about agents. When we launched our course this year a couple of uh weeks ago, I think it was one month one month one month ago, everyone everyone was asking like every second person in the Q&A section section was asking, "Hey, but what about agents? What about agents?" And uh the answer I gave back then was', 'end_time': 34.719}


In [33]:
import json
import pandas as pd
from collections import defaultdict

# Load your transcript JSON
with open("zoomcamp_transcripts/all_zoomcamp_metadata_with_segmented_transcripts.json", "r") as f:
    transcript = json.load(f)

# Step 1: Group transcript into ~30s chunks
def group_transcript_by_time(transcript_entries, window=30):
    chunks = []
    current_chunk = None
    current_end = None

    for entry in transcript_entries:
        start = entry.get("start")
        end = entry.get("end")

        if start is None or end is None:
            continue

        if current_chunk is None:
            current_chunk = {
                "video_id": entry["video_id"],
                "video_url": entry["video_url"],
                "start_time": start,
                "text": [entry["text"]],
            }
            current_end = end
            continue

        if start - current_chunk["start_time"] < window:
            current_chunk["text"].append(entry["text"])
            current_end = end
        else:
            current_chunk["end_time"] = current_end
            current_chunk["text"] = " ".join(current_chunk["text"])
            chunks.append(current_chunk)

            current_chunk = {
                "video_id": entry["video_id"],
                "video_url": entry["video_url"],
                "start_time": start,
                "text": [entry["text"]],
            }
            current_end = end

    if current_chunk:
        current_chunk["end_time"] = current_end
        current_chunk["text"] = " ".join(current_chunk["text"])
        chunks.append(current_chunk)

    return chunks

# Group transcript entries by video_id
transcript_by_video = defaultdict(list)
for entry in transcript:
    transcript_by_video[entry["video_id"]].append(entry)

summary_rows = []
all_chunks = []

for video_id, entries in transcript_by_video.items():
    entries_sorted = sorted(entries, key=lambda x: x["start"])
    num_transcript = len(entries_sorted)

    chunks = group_transcript_by_time(entries_sorted, window=30)
    num_chunks = len(chunks)

    print(f"[INFO] Processed video_id: {video_id}")
    print(f"       Transcript entries: {num_transcript}")
    print(f"       Chunks created:     {num_chunks}")

    if num_chunks == 0:
        print(f"  ⚠️  Warning: No chunks generated for video_id {video_id}")
    elif num_chunks < num_transcript / 10:
        print(f"  ⚠️  Warning: Chunk count much lower than transcript entries. Investigate {video_id}")

    all_chunks.extend(chunks)
    summary_rows.append({
        "video_id": video_id,
        "transcript_entries": num_transcript,
        "chunk_entries": num_chunks
    })

# Save summary table
df_summary = pd.DataFrame(summary_rows)
df_summary.to_csv("transcript_chunk_summary.csv", index=False)
print("\n✅ Summary table saved to 'transcript_chunk_summary.csv'")


[INFO] Processed video_id: GH3lrOsU3AU
       Transcript entries: 2012
       Chunks created:     179
[INFO] Processed video_id: FgnelhEJFj0
       Transcript entries: 1520
       Chunks created:     132
[INFO] Processed video_id: 8lgiOLMMKcY
       Transcript entries: 1245
       Chunks created:     108
[INFO] Processed video_id: YuxVHZ88hfg
       Transcript entries: 1245
       Chunks created:     113
[INFO] Processed video_id: q-p36Ak6YI8
       Transcript entries: 1832
       Chunks created:     170
[INFO] Processed video_id: nMrGK5QgPVE
       Transcript entries: 2028
       Chunks created:     195
[INFO] Processed video_id: ifpqpB1ksGc
       Transcript entries: 1683
       Chunks created:     151
[INFO] Processed video_id: q4Mb4SN-doo
       Transcript entries: 1101
       Chunks created:     100
[INFO] Processed video_id: lre6h7vqz7A
       Transcript entries: 992
       Chunks created:     92
[INFO] Processed video_id: 8UIcSgyekvs
       Transcript entries: 283
       Chunks 

In [17]:
#comparison
video_id = "GH3lrOsU3AU"
original_transcript = [entry for entry in transcript if entry["video_id"] == video_id]

chunked_transcript = [chunk for chunk in chunks if chunk["video_id"] == video_id]


In [18]:
print("🟡 ORIGINAL TRANSCRIPT:")
print(" ".join([entry["text"] for entry in original_transcript])[:500])

print("\n🟢 CHUNKED TRANSCRIPT:")
for i, chunk in enumerate(chunked_transcript[:3]):  # limit to first 3 chunks
    print(f"\n--- Chunk {i+1} ---")
    print(f"Start: {chunk['start_time']}s | End: {chunk['end_time']}s")
    print(chunk["text"][:500])


🟡 ORIGINAL TRANSCRIPT:
Hi everyone, welcome to our bonus module of LLM Zoom camp. In this bonus module, we are going to talk about agents. When we launched our course this year a couple of uh weeks ago, I think it was one month one month one month ago, everyone everyone was asking like every second person in the Q&A section section was asking, "Hey, but what about agents? What about agents?" And uh the answer I gave back then was that agents are still actively evolving. Uh so it doesn't make sense to invest a lot of t

🟢 CHUNKED TRANSCRIPT:

--- Chunk 1 ---
Start: 0.0s | End: 34.719s
Hi everyone, welcome to our bonus module of LLM Zoom camp. In this bonus module, we are going to talk about agents. When we launched our course this year a couple of uh weeks ago, I think it was one month one month one month ago, everyone everyone was asking like every second person in the Q&A section section was asking, "Hey, but what about agents? What about agents?" And uh the answer I gave back then wa

In [None]:
#Check missing values
original_words = set(" ".join([entry["text"] for entry in original_transcript]).split())
chunked_words = set(" ".join([chunk["text"] for chunk in chunked_transcript]).split())

missing_words = original_words - chunked_words
print(f"❌ Missing words in chunks: {missing_words}")

❌ Missing words in chunks: set()


In [20]:
s = 97.2  # set your desired timestamp cutoff
video_id = "GH3lrOsU3AU"

# Filter transcript entries up to timestamp `s` for the specific video
for entry in transcript:
    if entry["video_id"] == video_id and entry["start"] <= s:
        print(f"{entry['start']:.2f}s - {entry['text']}")

transcript_until_s = [
    entry for entry in transcript
    if entry["video_id"] == video_id and entry["start"] <= s
]


0.00s - Hi everyone, welcome to our bonus module
2.88s - of LLM Zoom camp. In this bonus module,
5.84s - we are going to talk about agents. When
8.32s - we launched our course this year a
11.20s - couple of uh weeks ago, I think it was
13.60s - one month one month one month ago,
16.96s - everyone everyone was asking like every
20.08s - second person in the Q&A section section
23.20s - was asking, "Hey, but what about agents?
24.96s - What about agents?"
26.96s - And uh the answer I gave back then was
30.48s - that agents are still actively evolving.
34.72s - Uh so it doesn't make sense to invest a
36.96s - lot of time into creating content which
39.36s - will become obsolete in one year.
42.40s - uh but I already had some materials and
44.64s - I thought how about making this module
46.72s - lightweight so I don't put too much
49.36s - effort but at the same time share with
51.60s - you what I already know what the content
53.68s - I already have and we do it not by
57.12s - creating a

In [None]:
#compare videos seaconds orig versus new
video_id = "GH3lrOsU3AU"
cutoff_time = 97.2

transcript_entries_until_s = [
    entry for entry in transcript
    if entry["video_id"] == video_id and entry["start"] <= cutoff_time
]
print(f"Transcript entries up to {cutoff_time}s: {len(transcript_entries_until_s)}")


Transcript entries up to 97.2s: 35


In [None]:
#compare videos seaconds orig versus new
chunks_until_s = [
    chunk for chunk in chunks
    if chunk["video_id"] == video_id and chunk["start_time"] <= cutoff_time
]
print(f"Chunks up to {cutoff_time}s: {len(chunks_until_s)}")


Chunks up to 97.2s: 4


In [25]:
#compare videos seaconds orig versus new
video_id = "GH3lrOsU3AU"
cutoff_time = 97.2

transcript_entries_until_s = [
    entry for entry in transcript
    if entry["video_id"] == video_id 
]
print(f"Transcript entries for {video_id} up to {cutoff_time}s: {len(transcript_entries_until_s)}")


Transcript entries for GH3lrOsU3AU up to 97.2s: 2012


In [26]:
chunks_until_s = [
    chunk for chunk in chunks
    if chunk["video_id"] == video_id 
]
print(f"Chunks  for {video_id} up to {cutoff_time}s: {len(chunks_until_s)}")

Chunks  for GH3lrOsU3AU up to 97.2s: 179


In [28]:
# Code: Compare Transcript vs Chunks per Video ID

import pandas as pd

# Step 1: Get all video_ids present in transcripts and chunks
video_ids = set([entry["video_id"] for entry in transcript]) | set([chunk["video_id"] for chunk in chunks])

# Step 2: Count entries for each video_id
rows = []
for video_id in video_ids:
    transcript_count = sum(1 for entry in transcript if entry["video_id"] == video_id)
    chunk_count = sum(1 for chunk in chunks if chunk["video_id"] == video_id)
    rows.append({
        "video_id": video_id,
        "transcript_entries": transcript_count,
        "chunk_entries": chunk_count
    })

# Step 3: Create DataFrame
df = pd.DataFrame(rows)

# Step 4: Display the result
print(df)



        video_id  transcript_entries  chunk_entries
0    82TYlOvKwfk                 522              0
1    Zz6oRGsJkW4                 163              0
2    ZdbIk8AltDU                 590              0
3    hCAIVe9N0ow                 265              0
4    HIm2BOj8C0Q                 187              0
..           ...                 ...            ...
510  XODz6LwKY7g                 701              0
511  OzZA4mSBE0Q                 586              0
512  W3Zm6rjOq70                 148              0
513  gsKuETFJr54                 155              0
514  _lwz34sOnSE                 168              0

[515 rows x 3 columns]


In [30]:
video_id='82TYlOvKwfk'
chunked_transcript = [chunk for chunk in chunks if chunk["video_id"] == video_id]
len(chunked_transcript)

0

In [32]:
video_id='82TYlOvKwfk'
chunked_transcript = [entry for entry in transcript
    if entry["video_id"] == video_id == video_id]
len(chunked_transcript)

522

In [19]:
all_zoomcamp_metadata

NameError: name 'all_zoomcamp_metadata' is not defined

In [5]:
from youtube_transcript_api import YouTubeTranscriptApi


# Extract the video ID from the URL
video_url = "https://www.youtube.com/watch?v=2jM7t-NTZxs"
video_id = video_url.split("v=")[-1]

# Fetch transcript
trans=YouTubeTranscriptApi()
transcript = trans.fetch(video_id)

# Print lines
for entry in transcript:
    print(f"[{entry['start']:.2f}s] {entry['text']}")


RequestBlocked: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=2jM7t-NTZxs! This is most likely caused by:

YouTube is blocking requests from your IP. This usually is due to one of the following reasons:
- You have done too many requests and your IP has been blocked by YouTube
- You are doing requests from an IP belonging to a cloud provider (like AWS, Google Cloud Platform, Azure, etc.). Unfortunately, most IPs from cloud providers are blocked by YouTube.

There are two things you can do to work around this:
1. Use proxies to hide your IP address, as explained in the "Working around IP bans" section of the README (https://github.com/jdepoix/youtube-transcript-api?tab=readme-ov-file#working-around-ip-bans-requestblocked-or-ipblocked-exception).
2. (NOT RECOMMENDED) If you authenticate your requests using cookies, you will be able to continue doing requests for a while. However, YouTube will eventually permanently ban the account that you have used to authenticate with! So only do this if you don't mind your account being banned!

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!

In [None]:
# import google.genai as genai # For using Youtube tool
from google import genai
client = genai.Client(api_key=GEMINI_API_KEY)
# client = genai.Client(api_key=GOOGLE_API_KEY)
# model = genai.GenerativeModel("gemini-pro")
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents="Explain AI to me like I'm a kid.")

print(response.text)

Imagine you have a really smart puppy, but it doesn't know anything yet. You have to teach it tricks.

**AI is like that smart puppy.** It's a computer program that we try to teach to do things that usually need a human brain, like:

*   **Recognizing pictures:** Like teaching your puppy to know which picture is a dog and which is a cat.
*   **Understanding words:** Like teaching your puppy to understand "sit" or "fetch."
*   **Playing games:** Like teaching your puppy to play hide-and-seek, but with computers playing instead of puppies!
*   **Solving problems:** Like giving your puppy a puzzle to figure out how to get a treat.

**How do we teach the AI puppy?**

We give it lots and lots of examples! Like if we want it to recognize dogs, we show it thousands of pictures of dogs. The AI looks for patterns in those pictures to learn what makes a dog a dog.

**So, AI is like a super-smart program that learns from examples, just like you learn at school, but way faster!**

**Think of it li