# General Setup

In [1]:
import os
import os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from googleapiclient.discovery import build
from langchain_community.document_loaders.youtube import YoutubeLoader

load_dotenv()  # take environment variables from .env

os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["USER_AGENT"] = os.getenv("USER_AGENT")
YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")

llm = ChatOpenAI(model="gpt-4o-mini")


# YouTube API Setup

In [2]:
def get_channel_id(api_key, username):
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Search for the channel by username
    request = youtube.search().list(
        part='snippet',
        q=username,
        type='channel',
        maxResults=1
    )
    response = request.execute()

    # Extract the channel ID from the response
    if response['items']:
        channel_id = response['items'][0]['snippet']['channelId']
        return channel_id
    else:
        return None

channel_id = get_channel_id(YOUTUBE_API_KEY, "@peplink")
print(channel_id)


UCVoGNhZzFTtuUd5I8GvVH-Q


In [3]:

def fetch_video_ids(api_key, channel_id):
    youtube = build("youtube", "v3", developerKey=api_key)
    video_ids = []

    # Get uploads playlist ID for the channel
    request = youtube.channels().list(part="contentDetails", id=channel_id)
    response = request.execute()
    uploads_playlist_id = response["items"][0]["contentDetails"]["relatedPlaylists"]["uploads"]

    # Fetch all video IDs from the uploads playlist
    next_page_token = None
    while True:
        playlist_request = youtube.playlistItems().list(
            part="contentDetails",
            playlistId=uploads_playlist_id,
            maxResults=50,
            pageToken=next_page_token,
        )
        playlist_response = playlist_request.execute()
        for item in playlist_response["items"]:
            video_id = item["contentDetails"]["videoId"]

            # Check video privacy status
            video_request = youtube.videos().list(
                part="status",
                id=video_id
            )
            video_response = video_request.execute()
            if video_response["items"]:
                if video_response["items"][0]["status"]["privacyStatus"] == "public":
                    video_ids.append(video_id)

        next_page_token = playlist_response.get("nextPageToken")
        if not next_page_token:
            break
    return video_ids


video_ids = fetch_video_ids(YOUTUBE_API_KEY, channel_id)
print(len(video_ids))
print(video_ids)


226
['X5__4jeHx8Y', 'SR4Fqqvm26w', 'bYRWlpO0Fps', 'EL-jbu455mY', 'RGoum6IwHnQ', 'PEGrP0gGlfE', 'p1RmCXoqwCA', 'hLti35eXu-Y', 'amLK85LdDlQ', '6h_xPnd_s0c', 'jXkTcTFHZM4', 'j_tgbrWc5r0', 'Y5CGF61Kg58', '9TKAtcQA4gQ', 'P2YdZdMBiNs', 'U9kLNBvdpXo', 'fszikxLfzDo', 'VbAzr-_gaYg', 'cwUZMDzAPp0', 'igReb-oENS4', '85O_C2GPYxk', 'oeQVDtWUGfo', 'oKxch6X9FE0', 'tIXVtfto8ic', '_LAHIILM0q8', 'rAABltQ_mFg', 'iNwVqhp2QtY', 'GLtjyS4ELAA', 'peixgiy1Iko', 'y6CortEyHV4', 'CAEnhCk8-IY', 'A2IApxXmwug', 'NOcdJOVNa9Q', 'cyWL6AmwwOM', 'wyuumBLn2Wk', 'T4L039BDZcU', 'u_y8jUX1n4o', 'BabplwtnHnc', '_vpiNJJJb44', 'P7EfnkAx6oU', 'HHibWo42yBM', 'jab6P1m2BJY', 'n1jLdrk17ac', 'nH8x5jbAI8g', 'tyS58gMPedY', 'yV3hmff9bsY', 'UX7fCpU69eA', 'ZP39-9oDiJM', '-S3w1wOjirY', 'jVY4qNhYiTM', 'EH_5WMTgXJc', 'rcqh84KP8lk', 'DUV4_NDn4p4', 'egRsS-JBko8', 'EafF_TT2bdc', 'Wakv_Iqat7M', '4ft_QOUw8w8', 'tUIjifdL82s', 'kttp_HKJqUA', 'PzYNVIP8ei8', 'FgFQErHm3Vc', 'GMCyxaSeU34', '5uHH9Io2aPk', 'IcMjOhEUGjU', 'ZfUm7fybz_g', 'IF4nFpiwaZU', 'O3mQ

In [5]:

def load_transcripts(video_ids):
    documents = []
    for video_id in video_ids:
        try:
            loader = YoutubeLoader(video_id=video_id)
            docs = loader.load()
            documents.extend(docs)
        except Exception as e:
            print(f"Could not load transcript for video {video_id}: {str(e)}")
            continue
    return documents

documents = load_transcripts(video_ids)
print(len(documents))
print(documents)


1[{"variableName": "ID_TO_MEANING", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": true}]

1[{"variableName": "ID_TO_MEANING", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": true}]

1[{"variableName": "ID_TO_MEANING", "type": "dictionary", "supportedEngines": ["pandas"], "isLocalVariable": true}]

