In [1]:
import json
import pandas as pd
import os
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
from googleapiclient.discovery import build
from langchain_community.document_loaders.youtube import YoutubeLoader
from extract.youtube.VideoItem import VideoItem

load_dotenv()  # take environment variables from .env

# Every variable this notebook reads from the environment. Assigning
# os.environ[name] = os.getenv(name) would raise an opaque TypeError when a
# variable is unset, so check for missing names explicitly and fail with a
# message that says which ones to add.
REQUIRED_ENV_VARS = (
    "LANGCHAIN_API_KEY",
    "OPENAI_API_KEY",
    "USER_AGENT",
    "YOUTUBE_API_KEY",
)
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.getenv(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables (set them in .env or the shell): "
        + ", ".join(missing_env_vars)
    )

# Not read from .env: always switched on for this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
YOUTUBE_API_KEY = os.environ["YOUTUBE_API_KEY"]

llm = ChatOpenAI(model="gpt-4o-mini")


In [2]:

def get_channel_id(api_key, username):
    """Find a YouTube channel ID by running a channel search for ``username``.

    Args:
        api_key: Developer key passed to the YouTube Data API v3 client.
        username: Search query identifying the channel (e.g. "@peplink").

    Returns:
        The ``channelId`` of the first search result, or ``None`` when the
        search returns no channels.
    """
    youtube = build('youtube', 'v3', developerKey=api_key)

    # Search for the channel by username; only the top hit is requested.
    # NOTE: the hit is returned as-is and is not verified to match
    # `username` exactly.
    request = youtube.search().list(
        part='snippet',
        q=username,
        type='channel',
        maxResults=1
    )
    response = request.execute()

    # Treat a missing "items" key the same as an empty result list instead of
    # raising KeyError.
    items = response.get('items') or []
    if not items:
        return None

    # Extract the channel ID from the first result
    return items[0]['snippet']['channelId']


# Handle of the channel whose uploads are harvested below.
CHANNEL_HANDLE = "@peplink"

channel_id = get_channel_id(YOUTUBE_API_KEY, CHANNEL_HANDLE)
# get_channel_id returns None when the search finds nothing; stop here with a
# clear message rather than passing None on to later API calls.
if channel_id is None:
    raise ValueError(f"No YouTube channel found for {CHANNEL_HANDLE!r}")
print(channel_id)


UCVoGNhZzFTtuUd5I8GvVH-Q


In [3]:

youtube_client = build("youtube", "v3", developerKey=YOUTUBE_API_KEY)

# Get uploads playlist ID for the channel
request = youtube_client.channels().list(part="contentDetails", id=channel_id)
response = request.execute()

# Guard against a response with no matching channel so the failure names the
# channel ID instead of surfacing as a bare KeyError/IndexError.
channel_items = response.get("items") or []
if not channel_items:
    raise ValueError(
        f"No channel returned for id {channel_id!r}; cannot resolve its uploads playlist"
    )
uploads_playlist_id = channel_items[0]["contentDetails"]["relatedPlaylists"]["uploads"]
print(uploads_playlist_id)


UUVoGNhZzFTtuUd5I8GvVH-Q


### Fetch videos with transcripts

In [4]:

# Resource parts requested for every video; "status" is required for the
# privacy check below.
VIDEO_PARTS = "status,contentDetails,snippet,statistics,topicDetails,localizations,player,recordingDetails"

videos_with_transcripts = []
next_page_token = None
while True:
    # Walk the uploads playlist one page (up to 50 entries) at a time.
    playlist_request = youtube_client.playlistItems().list(
        part="contentDetails",
        playlistId=uploads_playlist_id,
        maxResults=50,
        pageToken=next_page_token,
    )
    playlist_response = playlist_request.execute()
    page_video_ids = [
        item["contentDetails"]["videoId"]
        for item in playlist_response.get("items", [])
    ]

    # Fetch metadata for the whole page in one videos.list call (the `id`
    # filter accepts a comma-separated list) instead of one call per video.
    video_items_by_id = {}
    if page_video_ids:
        video_request = youtube_client.videos().list(
            part=VIDEO_PARTS,
            id=",".join(page_video_ids),
        )
        video_response = video_request.execute()
        video_items_by_id = {
            entry["id"]: entry for entry in video_response.get("items", [])
        }

    # Iterate in playlist order so the output order matches the playlist.
    for video_id in page_video_ids:
        video_item = video_items_by_id.get(video_id)
        if video_item is None:
            # No metadata came back for this ID; nothing to record.
            continue

        # Check video privacy status; only public videos are kept.
        if video_item["status"]["privacyStatus"] != "public":
            continue

        # Best-effort transcript load: a failure skips this video only.
        try:
            loader = YoutubeLoader(video_id=video_id)
            docs = loader.load()
        except Exception as e:
            print(f"Could not load transcript for video {video_id} ({type(e).__name__})")
            continue

        if not docs:
            print(f"Could not load transcript for video {video_id} (no transcript returned)")
            continue

        video_item["transcript"] = docs[0].page_content

        # Report validation problems separately so they are not mistaken for
        # transcript failures.
        try:
            video = VideoItem.model_validate(video_item)
            videos_with_transcripts.append(video.model_dump())
        except Exception as e:
            print(f"Could not validate metadata for video {video_id} ({type(e).__name__}: {e})")

    next_page_token = playlist_response.get("nextPageToken")
    if not next_page_token:
        break

# Write videos to JSON file
with open("youtube_account_pepwave.json", "w", encoding="utf-8") as f:
    json.dump(videos_with_transcripts, f, ensure_ascii=False, indent=2)

# Write videos to parquet file
df = pd.DataFrame(videos_with_transcripts)
df.to_parquet("youtube_account_pepwave.parquet")


Could not load transcript for video RGoum6IwHnQ
Could not load transcript for video p1RmCXoqwCA
Could not load transcript for video 6h_xPnd_s0c
Could not load transcript for video Y5CGF61Kg58
Could not load transcript for video P2YdZdMBiNs
Could not load transcript for video cwUZMDzAPp0
Could not load transcript for video igReb-oENS4
Could not load transcript for video 85O_C2GPYxk
Could not load transcript for video cyWL6AmwwOM
Could not load transcript for video -S3w1wOjirY
Could not load transcript for video iLnKarz0yYM
Could not load transcript for video -CTrjiYFzpY
Could not load transcript for video lLR6LF_igRM
Could not load transcript for video -EAESBwHuQs
Could not load transcript for video yL9RXaufPv8
Could not load transcript for video U21p7ivXia4
Could not load transcript for video -ccl5Ih5EpA
Could not load transcript for video xeg0hfYc0Qk
Could not load transcript for video Fk6ySZHxYyk
Could not load transcript for video 07TNZmGj1VU
Could not load transcript for video FAUP