In [None]:
from pydantic import BaseModel
import json
import datetime
from typing import Optional

class Video(BaseModel):
    """Metadata record for a single video, validated by pydantic on construction.

    Instances are created from the entries of the JSON file loaded in this
    notebook. ``description`` is optional so that records which do not carry
    one yet can still be parsed; it is assigned later in the notebook.
    """
    # Address of the video page; this is the value handed to
    # get_youtube_video_description() when descriptions are fetched.
    url: str
    # Title of the video.
    title: str
    # Length of the video; the notebook sums this field into
    # `total_videos_duration_seconds`, i.e. it is treated as whole seconds.
    duration: int
    # Date associated with the video, kept as a plain string (no date parsing
    # or format validation happens in this model).
    date: str
    # Free-text description; None until it has been filled in.
    description: Optional[str] = None

# Transcription throughput assumed for the Whisper model, as a multiple of real
# time. It is only used for the rough time estimate printed below.
whisper_model_speedup = 43 # 43x faster than real time

# JSON is read explicitly as UTF-8 so that non-ASCII titles are decoded the same
# way on every platform (the default for open() depends on the locale). Only
# the parsing happens while the file handle is open.
with open('./videos_without_description.json', encoding='utf-8') as f:
    data = [Video(**item) for item in json.load(f)]

# Total footage length, and the time Whisper would need at the assumed speed-up.
total_videos_duration_seconds = sum(video.duration for video in data)
whisper_time = total_videos_duration_seconds / whisper_model_speedup

# timedelta's str() gives a readable "H:MM:SS" (with a "N days," prefix when
# needed); fractions of a second are dropped on purpose via int().
total_videos_duration_formatted = str(datetime.timedelta(seconds=int(total_videos_duration_seconds)))
whisper_time_formatted = str(datetime.timedelta(seconds=int(whisper_time)))
print(f"total video time : {total_videos_duration_formatted}\nwhisper estimated time : {whisper_time_formatted}" )


In [None]:
import requests
from bs4 import BeautifulSoup
import re
import html
import codecs

def _extract_short_description(script_text):
    """Return the decoded ``shortDescription`` found in *script_text*, or ``None``.

    The value is located with a regular expression that expects the
    ``"shortDescription"`` entry to be immediately followed by an
    ``"isCrawlable"`` entry. The captured text is the body of a JSON string
    literal, so it is decoded with the JSON decoder: this resolves ``\\n``,
    ``\\"``, ``\\/`` and ``\\uXXXX`` escapes (including surrogate pairs used for
    emoji) while leaving characters that are already unescaped untouched.
    """
    match = re.search(r'"shortDescription":"(.*?)","isCrawlable"', script_text, re.DOTALL)
    if match is None:
        return None

    raw = match.group(1)
    try:
        # strict=False tolerates raw control characters (e.g. literal newlines)
        # inside the captured text, which the DOTALL pattern can match.
        return json.loads(f'"{raw}"', strict=False)
    except ValueError:
        # Not a well-formed JSON string body: keep the text as captured rather
        # than losing the description altogether.
        return raw


def get_youtube_video_description(video_url, timeout=10):
    """Fetch a video page and return the description embedded in its player data.

    The page is downloaded, its ``<script>`` elements are scanned for the one
    containing ``var ytInitialPlayerResponse``, and the ``shortDescription``
    value is extracted from it.

    Args:
        video_url: URL of the video page to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so that
            a stalled connection cannot block the caller forever.

    Returns:
        The description text, or the placeholder ``"-no description-"`` when
        the request fails, the response status is not 200, or no description
        can be found in the page.
    """
    fallback = "-no description-"

    try:
        response = requests.get(video_url, timeout=timeout)
    except requests.RequestException:
        # Network failures are reported the same way as non-200 responses.
        return fallback
    if response.status_code != 200:
        return fallback

    soup = BeautifulSoup(response.text, 'html.parser')
    for script in soup.find_all('script'):
        # .string is None when the tag does not hold exactly one string child;
        # fall back to the concatenated text so the regex never receives None.
        script_text = script.string if script.string is not None else script.text
        if 'var ytInitialPlayerResponse' not in script_text:
            continue
        description = _extract_short_description(script_text)
        if description is not None:
            return description
    return fallback

# Fetch the description of every video and store the enriched records as a JSON
# array in le_blob_videos.json.
#
# The file is opened in 'w' mode so that re-running the cell replaces the
# previous result instead of appending a second array to it. Records are still
# written one at a time, so a partial run leaves the records processed so far.
with open("le_blob_videos.json", 'w', encoding='utf-8') as file:
    file.write('[\n')
    count = len(data)
    # 1-based index so the progress display ends at count/count.
    for index, video in enumerate(data, start=1):
        description = get_youtube_video_description(video.url)
        video.description = description
        updated_video = video.model_dump()
        # The separator goes *before* every record except the first: a comma
        # after the last record would make the array invalid JSON.
        if index > 1:
            file.write(',\n')
        file.write(json.dumps(updated_video))
        print(f"{index}/{count} | {video.url} : {description[:35]}...{description[-35:]}")
    file.write('\n]')
