In [None]:
import yt_dlp
from youtube_transcript_api import YouTubeTranscriptApi
from googleapiclient.discovery import build
import nbimporter
# from pymongo import MongoClient
import os
from utils import load_secrets

# Module-level accumulator: process_video appends one dict per video that was
# processed successfully.
youtube_info = []

# load_secrets is imported from the project's utils module; its result is
# indexed as secrets["api_keys"]["youtube"] to obtain the API key.
secrets = load_secrets()
YOUTUBE_API_KEY = secrets["api_keys"]["youtube"]
# Shared YouTube Data API v3 client; get_metadata issues its requests through it.
youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)

# MongoDB setup
# client = MongoClient('mongodb://localhost:27017/')
# db = client['youtube_data']
# collection = db['video_info']

def extract_video_id(url):
    """Return the ``'id'`` field that yt-dlp reports for ``url``.

    The lookup is made with ``download=False``. Nothing is caught here, so
    any exception raised while extracting propagates to the caller.
    """
    with yt_dlp.YoutubeDL() as downloader:
        video_info = downloader.extract_info(url, download=False)
    return video_info['id']

def get_transcript(video_id):
    """Return the transcript of ``video_id`` as a single string.

    The ``'text'`` field of every transcript entry is joined with single
    spaces. If anything goes wrong the error is printed and ``None`` is
    returned instead of raising.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        full_text = ' '.join(segment['text'] for segment in segments)
        return full_text
    except Exception as e:
        print(f"Error getting transcript for video {video_id}: {str(e)}")
        return None

def get_metadata(video_id):
    """Fetch title, description, publish date and counters for a video.

    Issues ``videos().list`` with ``part='snippet,statistics'`` on the
    module-level ``youtube`` client.

    Args:
        video_id: Id of the video to look up.

    Returns:
        A dict with the keys ``title``, ``description``, ``publishedAt``,
        ``viewCount``, ``likeCount`` and ``commentCount``. Counters that are
        missing from the response are reported as ``0``. ``None`` is
        returned when the response holds no item for ``video_id`` or when
        any exception occurs; in both cases a message is printed and
        nothing is raised.
    """
    try:
        response = youtube.videos().list(
            part='snippet,statistics',
            id=video_id
        ).execute()

        # The API reports an unknown id as an empty ``items`` list, so test
        # for emptiness; checking only for the key would fall through to an
        # IndexError with a misleading message.
        items = response.get('items')
        if not items:
            print(f"Error getting metadata for video {video_id}: no video found")
            return None

        video_data = items[0]
        snippet = video_data['snippet']
        statistics = video_data['statistics']

        return {
            'title': snippet['title'],
            'description': snippet['description'],
            'publishedAt': snippet['publishedAt'],
            # All three counters are read the same way so that a single
            # absent counter does not discard the whole video.
            'viewCount': statistics.get('viewCount', 0),
            'likeCount': statistics.get('likeCount', 0),
            'commentCount': statistics.get('commentCount', 0)
        }
    except Exception as e:
        # Best-effort boundary: report and let the caller skip this video.
        print(f"Error getting metadata for video {video_id}: {str(e)}")
        return None

def process_video(url):
    """Collect transcript and metadata for one video and record them.

    Resolves ``url`` to a video id, fetches the transcript and the metadata,
    and on success appends a combined dict (``url``, ``source``,
    ``video_id``, ``transcript`` plus every metadata key) to the
    module-level ``youtube_info`` list.

    A failure in any step is printed and the video is skipped; nothing is
    raised, so a single bad URL does not abort a batch run.

    Args:
        url: URL of the video to process.
    """
    try:
        video_id = extract_video_id(url)
    except Exception as e:
        # extract_video_id lets extraction errors propagate; handle them here
        # in the same print-and-skip style used for transcript and metadata.
        print(f"Failed to process video: {url} ({str(e)})")
        return

    transcript = get_transcript(video_id)
    metadata = get_metadata(video_id)

    # Both helpers return None on failure; an empty transcript also counts
    # as a failure because it is falsy.
    if not (transcript and metadata):
        print(f"Failed to process video: {video_id}")
        return

    video_data = {
        'url': url,
        'source': 'YouTube',
        'video_id': video_id,
        'transcript': transcript,
        **metadata
    }
    youtube_info.append(video_data)
    # collection.insert_one(video_data)
    print(f"Processed and stored data for video: {video_id}")

def main(urls):
    """Run ``process_video`` on every entry of ``urls``, in order."""
    for video_url in urls:
        process_video(video_url)

if __name__ == "__main__":
    # Sample run: push one video URL through the pipeline.
    youtube_urls = ['https://www.youtube.com/watch?v=T-D1OfcDW1M&ab_channel=IBMTechnology']
    main(youtube_urls)
    # Show everything that was collected in the module-level list.
    print(youtube_info)

[youtube] Extracting URL: https://www.youtube.com/watch?v=T-D1OfcDW1M&ab_channel=IBMTechnology
[youtube] T-D1OfcDW1M: Downloading webpage
[youtube] T-D1OfcDW1M: Downloading ios player API JSON
[youtube] T-D1OfcDW1M: Downloading mweb player API JSON
[youtube] T-D1OfcDW1M: Downloading m3u8 information




Processed and stored data for video: T-D1OfcDW1M
[{'url': 'https://www.youtube.com/watch?v=T-D1OfcDW1M&ab_channel=IBMTechnology', 'source': 'YouTube', 'video_id': 'T-D1OfcDW1M', 'transcript': 'Large language models. They are everywhere. They get some things amazingly right and other things very interestingly wrong. My name\xa0is Marina Danilevsky. I am a Senior Research Scientist here at IBM Research. And I want\xa0to tell you about a framework to help large language models be more accurate and more up to\xa0date: Retrieval-Augmented Generation, or RAG. Let\'s just talk about the "Generation" part for a\xa0minute. So forget the "Retrieval-Augmented". So the\xa0generation, this refers to large language models,\xa0or LLMs, that generate text in response to a user query, referred to as a prompt. These\xa0models can have some undesirable behavior. I want to tell you an anecdote to illustrate this. So my kids, they recently asked me this question: "In our solar system, what planet has the m