# üìì 07_transform_playlist_items_manual_static.ipynb

## Objetivo

Construir la relaci√≥n actual entre
playlist manual ‚Üî video,
usando la API de YouTube, y guardarla como tabla est√°tica.

-1 fila = 1 video dentro de 1 playlist manual  
-estado actual  
-se sobrescribe  
-NO hist√≥rico todav√≠a 

In [1]:
from pathlib import Path
import pandas as pd
import requests
import os
from datetime import datetime
from dotenv import load_dotenv


In [2]:
load_dotenv()

API_KEY = os.getenv("YOUTUBE_API_KEY")
assert API_KEY, "Falta YOUTUBE_API_KEY"


In [3]:
PROJECT_ROOT = Path.cwd().parents[0]

PROCESSED_PATH = PROJECT_ROOT / "data" / "processed" / "youtube"
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)


In [6]:
df_playlists_manual_static = pd.read_parquet(
    PROCESSED_PATH / "playlists_manual_static.parquet"
)

df_playlists_manual_static[["playlist_id", "title"]].head()


Unnamed: 0,playlist_id,title
0,PLV4oS06_KpqbsY_I8iR4HRvb6w3vXUBIM,SQL - Repaso
1,PLV4oS06_KpqZGwOHo-tsdIiaZts7qaqql,Python - Repaso
2,PLV4oS06_KpqaqyS9x6h5ys3REiUfUDOgy,Curso gratuito de SQL en BigQuery | Funciones ...
3,PLV4oS06_KpqbhnVieDd19KJczH_BlBArN,Git - Repaso
4,PLV4oS06_KpqYRtYRoQHo_F_KsEjmqcDK7,Power Bi - Repaso


In [7]:
from datetime import datetime, timezone

# Timestamp expl√≠citamente en UTC
EXTRACTED_AT = datetime.now(timezone.utc)

EXTRACTED_AT

datetime.datetime(2026, 2, 16, 7, 59, 49, 648344, tzinfo=datetime.timezone.utc)

In [8]:
rows = []

for _, playlist in df_playlists_manual_static.iterrows():
    playlist_id = playlist["playlist_id"]

    page_token = None

    while True:
        params = {
            "part": "snippet,contentDetails",
            "playlistId": playlist_id,
            "maxResults": 50,
            "key": API_KEY
        }

        if page_token:
            params["pageToken"] = page_token

        response = requests.get(
            "https://www.googleapis.com/youtube/v3/playlistItems",
            params=params
        )
        response.raise_for_status()

        data = response.json()

        for item in data.get("items", []):
            added_at_peru = (
                pd.to_datetime(item["snippet"]["publishedAt"], utc=True)
                .tz_convert("America/Lima")
                .tz_localize(None)
            )

            rows.append({
                "playlist_id": playlist_id,
                "video_id": item["contentDetails"]["videoId"],
                "position": item["snippet"]["position"],
                "added_at": added_at_peru,
                "extracted_at": EXTRACTED_AT
            })

        page_token = data.get("nextPageToken")
        if not page_token:
            break


In [9]:
df_playlist_items_manual_static = pd.DataFrame(rows)

df_playlist_items_manual_static.head()

Unnamed: 0,playlist_id,video_id,position,added_at,extracted_at
0,PLV4oS06_KpqbsY_I8iR4HRvb6w3vXUBIM,7bwkNrRpgw0,0,2026-01-23 01:51:06,2026-02-16 07:59:49.648344+00:00
1,PLV4oS06_KpqbsY_I8iR4HRvb6w3vXUBIM,HDyKUodeuNw,1,2026-01-23 01:37:27,2026-02-16 07:59:49.648344+00:00
2,PLV4oS06_KpqZGwOHo-tsdIiaZts7qaqql,Zj6uiqMvFOU,0,2026-01-17 15:02:37,2026-02-16 07:59:49.648344+00:00
3,PLV4oS06_KpqZGwOHo-tsdIiaZts7qaqql,RiYjYfMTGvw,1,2026-01-11 18:05:55,2026-02-16 07:59:49.648344+00:00
4,PLV4oS06_KpqZGwOHo-tsdIiaZts7qaqql,0VmI47XeOuE,2,2026-01-11 18:05:33,2026-02-16 07:59:49.648344+00:00


In [10]:
df_playlist_items_manual_static.shape

(165, 5)

In [8]:
df_playlist_items_manual_static.dtypes

playlist_id                     str
video_id                        str
position                      int64
added_at             datetime64[us]
extracted_at    datetime64[us, UTC]
dtype: object

In [9]:
# Convertir added_at a datetime UTC
df_playlist_items_manual_static["added_at"] = (
    pd.to_datetime(df_playlist_items_manual_static["added_at"], utc=True)
)


In [10]:
df_playlist_items_manual_static.dtypes

playlist_id                     str
video_id                        str
position                      int64
added_at        datetime64[us, UTC]
extracted_at    datetime64[us, UTC]
dtype: object

In [11]:
df_playlist_items_manual_static = df_playlist_items_manual_static[
    [
        "playlist_id",
        "video_id",
        "position",
        "added_at",
        "extracted_at",
    ]
]

> ‚ö†Ô∏è Nota (fase de desarrollo)
>
> El guardado en formato Parquet se utiliza temporalmente para pruebas y separaci√≥n entre notebooks.
> En la versi√≥n final del pipeline (.py), este paso ser√° omitido y el DataFrame se enviar√° directamente a BigQuery, donde residir√° el hist√≥rico definitivo.

In [12]:
output_file = PROCESSED_PATH / "playlist_items_manual_static.parquet"

df_playlist_items_manual_static.to_parquet(output_file, index=False)

output_file


PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/processed/youtube/playlist_items_manual_static.parquet')

In [13]:
df = pd.read_parquet(
    PROCESSED_PATH / "playlist_items_manual_static.parquet"
)

df.head()


Unnamed: 0,playlist_id,video_id,position,added_at,extracted_at
0,PLV4oS06_KpqbsY_I8iR4HRvb6w3vXUBIM,7bwkNrRpgw0,0,2026-01-23 01:51:06+00:00,2026-02-14 22:47:41.027713+00:00
1,PLV4oS06_KpqbsY_I8iR4HRvb6w3vXUBIM,HDyKUodeuNw,1,2026-01-23 01:37:27+00:00,2026-02-14 22:47:41.027713+00:00
2,PLV4oS06_KpqZGwOHo-tsdIiaZts7qaqql,Zj6uiqMvFOU,0,2026-01-17 15:02:37+00:00,2026-02-14 22:47:41.027713+00:00
3,PLV4oS06_KpqZGwOHo-tsdIiaZts7qaqql,RiYjYfMTGvw,1,2026-01-11 18:05:55+00:00,2026-02-14 22:47:41.027713+00:00
4,PLV4oS06_KpqZGwOHo-tsdIiaZts7qaqql,0VmI47XeOuE,2,2026-01-11 18:05:33+00:00,2026-02-14 22:47:41.027713+00:00
