In [1]:
!uv add youtube-transcript-api

[2mResolved [1m103 packages[0m [2min 1.25s[0m[0m
[2mPrepared [1m1 package[0m [2min 281ms[0m[0m
[2mInstalled [1m1 package[0m [2min 136ms[0m[0m
 [32m+[39m [1myoutube-transcript-api[0m[2m==1.2.3[0m


In [2]:
import datetime
from youtube_transcript_api import YouTubeTranscriptApi

In [4]:
def fetch_transcript(video_id):
    ytt_api = YouTubeTranscriptApi()
    transcript = ytt_api.fetch(video_id)
    return transcript

In [5]:
def format_timestamp(seconds: float) -> str:
    """Convert seconds to H:MM:SS if > 1 hour, else M:SS"""
    total_seconds = int(seconds)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)

    if hours == 0:
        return f"{minutes}:{secs:02}"
    return f"{hours}:{minutes:02}:{secs:02}"

def make_subtitles(transcript) -> str:
    lines = []

    for entry in transcript:
        ts = format_timestamp(entry.start)
        text = entry.text.replace('\n', ' ')
        lines.append(ts + ' ' + text)

    return '\n'.join(lines)

In [6]:
video_id = 'D2rw52SOFfM'
video_name = 'Reinventing a Career in Tech'

transcript = fetch_transcript(video_id)
subtitles = make_subtitles(transcript)

In [51]:
!mkdir data

In [7]:
from pathlib import Path

In [8]:
data_root = Path('data') 

In [9]:
subtitles_file = data_root / f"{video_id}.txt"

In [55]:
with subtitles_file.open('wt', encoding='utf-8') as f_out:
    f_out.write(video_name)
    f_out.write('\n\n')
    f_out.write(subtitles)

In [56]:
!head data/D2rw52SOFfM.txt

Reinventing a Career in Tech

0:00 Hi everyone, welcome to our event. This
0:03 event is brought to you by Data Dogs
0:04 Club which is a community of people who
0:06 love data. We have weekly events. Today
0:09 is one of such events. If you want to
0:11 found find out more about the events we
0:13 have, there's a link in the description.
0:14 Click on this link, you see all the


In [10]:
def write_file(video_id, video_name, subtitles):
    subtitles_file = data_root / f"{video_id}.txt"
    
    with subtitles_file.open('wt', encoding='utf-8') as f_out:
        f_out.write(video_name)
        f_out.write('\n\n')
        f_out.write(subtitles)

In [11]:
from pathlib import Path
from dataclasses import dataclass

In [12]:
@dataclass
class Subtitles:
    video_id: str
    video_title: str
    subtitles: str

    def write_file(self, subtitles_file: Path):    
        with subtitles_file.open('wt', encoding='utf-8') as f_out:
            f_out.write(self.video_title)
            f_out.write('\n\n')
            f_out.write(self.subtitles)

In [23]:
s = Subtitles(
    video_id=video_id,
    video_title=video_name,
    subtitles=subtitles
)

subtitles_file = data_root / f"{s.video_id}.txt"
s.write_file(subtitles_file)

NameError: name 'video_id' is not defined

In [13]:
!head data/2ZOnA19sDpM.txt

Reinventing a Career in Tech

0:00 Hi everyone, welcome to our event. This
0:03 event is brought to you by Data Dogs
0:04 Club which is a community of people who
0:06 love data. We have weekly events. Today
0:09 is one of such events. If you want to
0:11 found find out more about the events we
0:13 have, there's a link in the description.
0:14 Click on this link, you see all the


In [18]:
!uv add requests pyyaml

[2mResolved [1m103 packages[0m [2min 706ms[0m[0m
[2mAudited [1m99 packages[0m [2min 0.08ms[0m[0m


In [16]:
import requests
import yaml

events_url = 'https://raw.githubusercontent.com/DataTalksClub/datatalksclub.github.io/187b7d056a36d5af6ac33e4c8096c52d13a078a7/_data/events.yaml'

raw_yaml = requests.get(events_url).content
events_data = yaml.load(raw_yaml, yaml.CSafeLoader)

podcasts = [d for d in events_data if (d.get('type') == 'podcast') and (d.get('youtube'))]

In [17]:
podcasts[12]

{'time': datetime.datetime(2025, 4, 14, 17, 0),
 'title': 'Taking your Freelance Career to the Next Level',
 'speakers': ['dimitrivisnadi'],
 'type': 'podcast',
 'link': 'https://lu.ma/p5a6orim',
 'youtube': 'https://www.youtube.com/watch?v=S93V8RgwBig'}

In [18]:
videos = []

for podcast in podcasts:
    _, video_id = podcast['youtube'].split('watch?v=')

    if video_id == 'FRi0SUtxdMw':
        continue

    videos.append({
        'title': podcast['title'],
        'video_id': video_id
    })

In [19]:
videos[10]

{'title': 'From Medicine to Machine Learning: How Public Learning Turned into a Career',
 'video_id': '5km62e4nDaw'}

In [66]:
!uv add tqdm

[2mResolved [1m106 packages[0m [2min 895ms[0m[0m
[2mInstalled [1m1 package[0m [2min 176ms[0m[0m
 [32m+[39m [1mtqdm[0m[2m==4.67.1[0m


In [20]:
from tqdm.auto import tqdm

In [21]:
def workflow(video_id, video_name):
    subtitles_file = data_root / f"{video_id}.txt"
    if subtitles_file.exists():
        return subtitles_file
    
    transcript = fetch_transcript(video_id)
    subtitles = make_subtitles(transcript)

    s = Subtitles(
        video_id=video_id,
        video_title=video_name,
        subtitles=subtitles
    )

    s.write_file(subtitles_file)
    return subtitles_file
    

for video in tqdm(videos):
    video_id = video['video_id']
    video_name = video['title']

    workflow(video_id, video_name)

  0%|          | 0/193 [00:00<?, ?it/s]

KeyboardInterrupt: 

Let's add proxies (you don't need to do it - you can download the results directly, and I'll show you how)

In [25]:
from youtube_transcript_api.proxies import GenericProxyConfig

In [22]:
import os

proxy_user = os.getenv('PROXY_USER')
proxy_password = os.getenv('PROXY_PASSWORD')
proxy_base_url = os.getenv('PROXY_BASE_URL')

proxy_url = f'http://{proxy_user}:{proxy_password}@{proxy_base_url}'

proxy = GenericProxyConfig(
    http_url=proxy_url,
    https_url=proxy_url,
)

In [23]:
def fetch_transcript(video_id):
    ytt_api = YouTubeTranscriptApi(proxy_config=proxy)
    transcript = ytt_api.fetch(video_id)
    return transcript

Even now:

    SSLError                                  Traceback (most recent call last)
    SSLError: [SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1028)
    
    The above exception was the direct cause of the following exception:
    
    ...
    
    SSLError: HTTPSConnectionPool(host='www.youtube.com', port=443): Max retries exceeded with url: /watch?v=NThHAEIazFk (Caused by SSLError(SSLEOFError(8, '[SSL: UNEXPECTED_EOF_WHILE_READING] EOF occurred in violation of protocol (_ssl.c:1028)')))

In [26]:
for video in tqdm(videos):
    video_id = video['video_id']
    video_name = video['title']

    workflow(video_id, video_name)

  0%|          | 0/193 [00:00<?, ?it/s]

SSLError: [SSL: WRONG_VERSION_NUMBER] wrong version number (_ssl.c:2648)

Add to elastic 

    docker run -it \
      --rm \
      --name elasticsearch \
      -m 4GB \
      -p 9200:9200 \
      -p 9300:9300 \
      -e "discovery.type=single-node" \
      -e "xpack.security.enabled=false" \
      -v es-data:/usr/share/elasticsearch/data \
      docker.elastic.co/elasticsearch/elasticsearch:8.4.3

In [25]:
!uv add elasticsearch

[2mResolved [1m105 packages[0m [2min 1.17s[0m[0m
[2mPrepared [1m2 packages[0m [2min 891ms[0m[0m
[2mInstalled [1m2 packages[0m [2min 308ms[0m[0m
 [32m+[39m [1melastic-transport[0m[2m==9.2.0[0m
 [32m+[39m [1melasticsearch[0m[2m==9.2.0[0m


In [26]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200') 