In [2]:
from pytube import Playlist

def get_playlist_urls(playlist_url: str) -> list[str]:
    """
    Collect the URL of every video listed in a YouTube playlist.

    The playlist is wrapped in a pytube ``Playlist`` and its ``video_urls``
    are materialized into a plain list.

    Args:
        playlist_url (str): The URL of the YouTube playlist.

    Returns:
        list[str]: The video URLs, in the order ``Playlist.video_urls`` yields them.

    Example:
        >>> get_playlist_urls("https://www.youtube.com/playlist?list=PLabc123xyz")
        ['https://www.youtube.com/watch?v=video1', 'https://www.youtube.com/watch?v=video2', ...]
    """
    return list(Playlist(playlist_url).video_urls)

# Example usage: collect the video URLs of one playlist and print them.
playlist_url = "https://www.youtube.com/playlist?list=PLm9_3psBwxqMzDRSElnDTnK0hNrWTG0Sq"  # Replace with your playlist URL
# `urls` is kept at notebook scope; a later cell indexes it (urls[0]).
urls = get_playlist_urls(playlist_url)
print(urls)

['https://www.youtube.com/watch?v=5AU6Q_bFzYU', 'https://www.youtube.com/watch?v=hLhUh8PLXzQ', 'https://www.youtube.com/watch?v=zYFcFUNUj-A', 'https://www.youtube.com/watch?v=Drc6txG5fMg', 'https://www.youtube.com/watch?v=dySNBXmknz8', 'https://www.youtube.com/watch?v=ghMGL0RgdMo', 'https://www.youtube.com/watch?v=vsB7M4mp1Tw', 'https://www.youtube.com/watch?v=bXj_RaMNlPI', 'https://www.youtube.com/watch?v=1qH5PAc7a_w', 'https://www.youtube.com/watch?v=It4-dPx7mfk', 'https://www.youtube.com/watch?v=6MQ0-JyJjIM', 'https://www.youtube.com/watch?v=YpTYMBb0I-M', 'https://www.youtube.com/watch?v=H0RAhYES3M8', 'https://www.youtube.com/watch?v=LChHUSDfVYg', 'https://www.youtube.com/watch?v=LHCfxo5iSEQ', 'https://www.youtube.com/watch?v=DJyn6OVUheQ', 'https://www.youtube.com/watch?v=CjDo2ic4Dic', 'https://www.youtube.com/watch?v=_7RiB3NJ-GE', 'https://www.youtube.com/watch?v=5FffRgULu3E', 'https://www.youtube.com/watch?v=5eioQwgAmVA', 'https://www.youtube.com/watch?v=Yir4l3bwUXc', 'https://www

In [7]:
from pytube import YouTube

In [8]:
# Wrap the first playlist URL in a pytube YouTube object so its attributes can be inspected.
yt = YouTube(urls[0])

In [10]:
# NOTE(review): in the recorded run this attribute access raised PytubeError (see the cell output).
yt.title

PytubeError: Exception while accessing title of https://youtube.com/watch?v=5AU6Q_bFzYU. Please file a bug report at https://github.com/pytube/pytube

In [15]:
# Display pytube's `vid_info` for the video; the recorded output is a large nested dict.
yt.vid_info

{'responseContext': {'visitorData': 'CgtCUFVtTFg3YmRTNCiv6ue6BjIKCgJVUxIEGgAgRg%3D%3D',
  'serviceTrackingParams': [{'service': 'GFEEDBACK',
    'params': [{'key': 'ipcc', 'value': '0'},
     {'key': 'is_viewed_live', 'value': 'False'},
     {'key': 'is_alc_surface', 'value': 'false'},
     {'key': 'logged_in', 'value': '0'},
     {'key': 'e',
      'value': '24004644,24077241,24078649,24181174,24241378,24299873,24466622,24548629,24556101,24690006,51009781,51017346,51020570,51021189,51025415,51028056,51030101,51037342,51037353,51050361,51053689,51057844,51057851,51065188,51089007,51105630,51111738,51115184,51117319,51124104,51151423,51152050,51153490,51156054,51157411,51160545,51169118,51176511,51178316,51178329,51178346,51178355,51178982,51183910,51187145,51217504,51221150,51222382,51225391,51226709,51227037,51227774,51228850,51230478,51231218,51237842,51239093,51241028,51242448,51243940,51248255,51248734,51255676,51255680,51255743,51256074,51256084,51258066,51274583,51275785,51276557

In [22]:
import os

# Read the YouTube Data API key from the YOUTUBE_API_KEY environment variable so the
# secret is never stored in the notebook. An empty string is used when it is unset.
# NOTE: any key that was previously hardcoded in this cell should be treated as
# exposed and revoked/rotated.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "")

from googleapiclient.discovery import build

def get_youtube_playlist_metadata(api_key: str, playlist_id: str, page_size: int = 50) -> list[dict]:
    """
    Fetches detailed metadata for all videos in a YouTube playlist using the YouTube Data API.

    The playlist is read page by page. For each page, the details of all its
    videos are requested with a single ``videos().list`` call (comma-separated
    IDs) rather than one call per video.

    Args:
        api_key (str): Your YouTube Data API key.
        playlist_id (str): The ID of the YouTube playlist.
        page_size (int): Number of playlist items requested per page (1-50).
            Defaults to 50.

    Returns:
        list[dict]: One dictionary per playlist entry, in playlist order, with the
            keys "title", "url", "description", "publish_date", "duration" and
            "tags". Entries whose video is missing from the ``videos().list``
            response are skipped.

    Raises:
        ValueError: If ``page_size`` is not between 1 and 50.

    Example:
        >>> get_youtube_playlist_metadata("YOUR_API_KEY", "PLabc123xyz")
        [{'title': 'Video 1', 'url': 'https://www.youtube.com/watch?v=abc123', ...}, ...]
    """
    # Each page's IDs are sent in one videos().list request, so the page size is
    # capped at 50 IDs per request.
    if not 1 <= page_size <= 50:
        raise ValueError(f"page_size must be between 1 and 50, got {page_size}")

    youtube = build('youtube', 'v3', developerKey=api_key)
    video_metadata = []
    next_page_token = None

    while True:
        # Fetch one page of playlist items.
        playlist_response = youtube.playlistItems().list(
            part="snippet",
            playlistId=playlist_id,
            maxResults=page_size,
            pageToken=next_page_token,
        ).execute()

        video_ids = [
            item["snippet"]["resourceId"]["videoId"]
            for item in playlist_response.get("items", [])
        ]

        if video_ids:
            # Request each distinct ID once; dict.fromkeys de-duplicates while keeping order.
            unique_ids = list(dict.fromkeys(video_ids))
            video_response = youtube.videos().list(
                part="snippet,contentDetails",
                id=",".join(unique_ids),
            ).execute()
            videos_by_id = {
                video["id"]: video for video in video_response.get("items", [])
            }

            # Walk the original ID list so playlist order and repeated entries are preserved
            # regardless of the order in which the API returns the videos.
            for video_id in video_ids:
                video = videos_by_id.get(video_id)
                if video is None:
                    # No details were returned for this ID; skip it.
                    continue
                snippet = video["snippet"]
                video_metadata.append({
                    "title": snippet["title"],
                    "url": f"https://www.youtube.com/watch?v={video_id}",
                    "description": snippet["description"],
                    "publish_date": snippet["publishedAt"],
                    "duration": video["contentDetails"]["duration"],
                    "tags": snippet.get("tags", []),
                })

        next_page_token = playlist_response.get("nextPageToken")
        if not next_page_token:
            break

    return video_metadata


PLAYLIST_ID = "PLm9_3psBwxqMzDRSElnDTnK0hNrWTG0Sq"  # Replace with your playlist ID
# Fetch the metadata records for the playlist; this issues live YouTube Data API requests.
metadata = get_youtube_playlist_metadata(API_KEY, PLAYLIST_ID)

# Uncomment to print each record on its own line:
# for video in metadata:
#     print(video)

In [23]:
# Display the collected records; note that this renders the entire list.
metadata

[{'title': 'Understanding and Transforming Anger | Sister Ao Nghiem | 2024-11-24',
  'url': 'https://www.youtube.com/watch?v=5AU6Q_bFzYU',
  'description': 'From the Ocean of Peace Mediation Hall at Deer Park Monastery, Escondido, CA. To learn more about Deer Park Monastery or to offer a donation, please visit us at deerparkmonastery.org',
  'publish_date': '2024-12-04T23:12:56Z',
  'duration': 'PT1H4M20S',
  'tags': ['Deer Park Monastery', 'Thich Nhat Hanh']},
 {'title': 'Living a Well-Lived Life | Venerable Dharma Talk | 12-01-24',
  'url': 'https://www.youtube.com/watch?v=hLhUh8PLXzQ',
  'description': 'From the Ocean of Peace Mediation Hall at Deer Park Monastery, Escondido, CA. To learn more about Deer Park Monastery or to offer a donation, please visit us at deerparkmonastery.org',
  'publish_date': '2024-12-03T19:58:10Z',
  'duration': 'PT1H1M4S',
  'tags': ['Deer Park Monastery', 'Thich Nhat Hanh']},
 {'title': 'Thanksgiving Day Dharma Talk | Sister Spirit (Than Nghiem) | 2024-

In [24]:
from data_processing.text_processing import get_text_from_file, write_text_to_file

In [30]:
import json
from pathlib import Path
# Persist the fetched metadata as JSON Lines: one JSON object per video, one per line.
metadata_path = Path("dp_video_metadata.jsonl")

with metadata_path.open("w") as jsonl_file:
    for video in metadata:
        jsonl_file.write(json.dumps(video))
        jsonl_file.write("\n")

In [34]:
import csv

url_list_path = Path("master_url_list.csv")

# Write all (url, title) rows in one pass with the csv module:
# - csv.writer quotes titles that contain commas, quotes or newlines, which a
#   hand-built "url, title" line would silently corrupt;
# - opening in "w" mode makes the cell idempotent, whereas appending duplicated
#   every row each time the cell was re-run.
with url_list_path.open("w", encoding="utf-8", newline="") as url_list_file:
    csv.writer(url_list_file).writerows(
        (video["url"], video["title"]) for video in metadata
    )

In [None]:
from pathlib import Path
import csv

# Project each metadata record down to the two columns kept in the URL list.
reduced_metadata = [
    {field: video[field] for field in ("url", "title")}
    for video in metadata
]

# Overwrite the CSV with a header row followed by one row per video.
with open(url_list_path, "w", encoding="utf-8", newline="") as csv_file:
    url_writer = csv.DictWriter(csv_file, fieldnames=["url", "title"])
    url_writer.writeheader()
    url_writer.writerows(reduced_metadata)