In [None]:
from pytube import Playlist

def get_playlist_urls(playlist_url: str) -> list[str]:
    """
    Collect the watch URL of every video in a YouTube playlist.

    Args:
        playlist_url (str): Full URL of the playlist to read.

    Returns:
        list[str]: The playlist's video URLs, in the order pytube yields them.

    Example:
        >>> get_playlist_urls("https://www.youtube.com/playlist?list=PLabc123xyz")
        ['https://www.youtube.com/watch?v=video1', 'https://www.youtube.com/watch?v=video2', ...]
    """
    collected: list[str] = []
    # Walk `video_urls` item by item and copy each entry into a plain list,
    # so callers get an ordinary list rather than pytube's own container.
    for url in Playlist(playlist_url).video_urls:
        collected.append(url)
    return collected

# Example usage: resolve the playlist into its list of video URLs and print it.
# `urls` is read again by a later cell (`YouTube(urls[0])`), so this cell has
# to run before that one on a fresh kernel.
playlist_url = "https://www.youtube.com/playlist?list=PLm9_3psBwxqMzDRSElnDTnK0hNrWTG0Sq"  # Replace with your playlist URL
urls = get_playlist_urls(playlist_url)
print(urls)

In [None]:
from pytube import YouTube

In [None]:
# Wrap the first playlist URL in a pytube `YouTube` object for inspection.
# `urls[0]` raises IndexError when the playlist produced no URLs.
yt = YouTube(urls[0])

In [None]:
# Show the `vid_info` attribute of the pytube object as this cell's output.
yt.vid_info

In [None]:
import os
import warnings

# SECURITY: the YouTube Data API key must never be stored in the notebook.
# It is read from the YOUTUBE_API_KEY environment variable instead. The key
# that used to be hard-coded on this line has been exposed in the notebook's
# history and should be revoked and regenerated in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")
if not API_KEY:
    # Warn here so the later API failure is easy to trace back to its cause.
    warnings.warn(
        "YOUTUBE_API_KEY is not set; YouTube Data API requests will be rejected.",
        stacklevel=2,
    )

from googleapiclient.discovery import build

def get_youtube_playlist_metadata(api_key: str, playlist_id: str) -> list[dict]:
    """
    Fetches detailed metadata for all videos in a YouTube playlist using the YouTube Data API.

    The playlist is read page by page. For each page, the details of all of
    its videos are requested in a single batched `videos().list` call (rather
    than one call per video), which keeps the number of API requests low.

    Args:
        api_key (str): Your YouTube Data API key.
        playlist_id (str): The ID of the YouTube playlist.

    Returns:
        list[dict]: One dictionary per video, in playlist order, with the keys
            "title", "url", "description", "publish_date", "duration" and
            "tags" ("tags" is an empty list when the video has none).
            Playlist entries for which the API returns no video details are
            skipped. A video that appears more than once in the playlist
            appears more than once in the result.

    Example:
        >>> get_youtube_playlist_metadata("YOUR_API_KEY", "PLabc123xyz")
        [{'title': 'Video 1', 'url': 'https://www.youtube.com/watch?v=abc123', ...}, ...]
    """
    # 50 is the largest page size playlistItems.list accepts; one page of IDs
    # is then looked up with a single videos.list request.
    page_size = 50

    youtube = build('youtube', 'v3', developerKey=api_key)
    video_metadata = []
    next_page_token = None

    while True:
        # Fetch one page of playlist items.
        playlist_response = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=page_size,
            pageToken=next_page_token,
        ).execute()

        video_ids = [
            item["snippet"]["resourceId"]["videoId"]
            for item in playlist_response.get("items", [])
        ]

        if video_ids:
            # Fetch details for the whole page at once. dict.fromkeys drops
            # repeated IDs from the request while keeping their order.
            video_response = youtube.videos().list(
                part="snippet,statistics,contentDetails",
                id=",".join(dict.fromkeys(video_ids)),
            ).execute()
            videos_by_id = {
                video["id"]: video for video in video_response.get("items", [])
            }

            # Iterate over the playlist's own ID list so the output follows
            # playlist order regardless of the order of the API response.
            for video_id in video_ids:
                video = videos_by_id.get(video_id)
                if video is None:
                    # No details returned for this entry; skip it.
                    continue
                snippet = video["snippet"]
                video_metadata.append({
                    "title": snippet["title"],
                    "url": f"https://www.youtube.com/watch?v={video_id}",
                    "description": snippet["description"],
                    "publish_date": snippet["publishedAt"],
                    "duration": video["contentDetails"]["duration"],
                    "tags": snippet.get("tags", []),
                })

        next_page_token = playlist_response.get("nextPageToken")
        if not next_page_token:
            break

    return video_metadata


# Same playlist as `playlist_url` in the pytube example: this is the value of
# that URL's `list=` query parameter.
PLAYLIST_ID = "PLm9_3psBwxqMzDRSElnDTnK0hNrWTG0Sq"  # Replace with your playlist ID
# Calls the YouTube Data API; `metadata` (a list of per-video dicts) is the
# input for the export cells further down.
metadata = get_youtube_playlist_metadata(API_KEY, PLAYLIST_ID)

# for video in metadata:
#     print(video)

In [None]:
# Display the whole list of per-video metadata dicts as this cell's output.
metadata

In [None]:
from data_processing.text_processing import get_text_from_file, write_text_to_file

In [None]:
import json
from pathlib import Path
# Export the metadata as JSON Lines: one JSON object per video, one per line.
metadata_path = Path("dp_video_metadata.jsonl")

with metadata_path.open("w") as f:
    for video in metadata:
        f.write(json.dumps(video) + "\n")

In [None]:
import csv

url_list_path = Path("master_url_list.csv")

# Append one `url,title` row per video to the master list.
# Rows go through csv.writer so that titles containing commas, quotes or
# newlines are quoted correctly; hand-formatted "url, title" lines produced
# malformed CSV for such titles and put a stray leading space in every title.
# NOTE(review): this cell appends, so re-running it adds the same rows again --
# presumably intended for accumulating several playlists; confirm.
with url_list_path.open("a", encoding="utf-8", newline="") as file:
    csv.writer(file).writerows(
        (video["url"], video["title"]) for video in metadata
    )

In [None]:
from pathlib import Path
import csv

# Keep only the two columns that go into the CSV.
reduced_metadata = [
    {key: video[key] for key in ("url", "title")}
    for video in metadata
]

# Overwrite the URL list with a header row followed by one row per video.
with url_list_path.open("w", encoding="utf-8", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["url", "title"])
    writer.writeheader()
    writer.writerows(reduced_metadata)