# 游늽 06_transform_playlists_static.ipynb
Transformar playlists.json en una tabla limpia de playlists manuales, excluyendo uploads, lista para an치lisis y joins.

In [1]:
from pathlib import Path
import json
import pandas as pd
from datetime import datetime

In [2]:
PROJECT_ROOT = Path.cwd().parents[0]

RAW_PATH = PROJECT_ROOT / "data" / "raw" / "youtube"
PROCESSED_PATH = PROJECT_ROOT / "data" / "processed" / "youtube"

PROCESSED_PATH.mkdir(parents=True, exist_ok=True)


In [3]:
with open(RAW_PATH / "playlists.json", "r", encoding="utf-8") as f:
    playlists_raw = json.load(f)

len(playlists_raw["items"])


16

In [4]:
UPLOADS_PLAYLIST_ID = "UUUEOHBht8pnQhQvCfIcl-gg"

In [5]:
from datetime import datetime, timezone

# Timestamp expl칤citamente en UTC
EXTRACTED_AT = datetime.now(timezone.utc)

EXTRACTED_AT

datetime.datetime(2026, 2, 16, 6, 55, 49, 536537, tzinfo=datetime.timezone.utc)

In [6]:
rows = []

for item in playlists_raw["items"]:
    playlist_id = item["id"]

    # Excluir uploads
    if playlist_id == UPLOADS_PLAYLIST_ID:
        continue

    snippet = item["snippet"]
    content = item.get("contentDetails", {})
    status = item.get("status", {})

    rows.append({
        "playlist_id": playlist_id,
        "channel_id": snippet.get("channelId"),
        "title": snippet.get("title"),
        "description": snippet.get("description", ""),
        "item_count": content.get("itemCount", 0),
        "privacy_status": status.get("privacyStatus"),
        "published_at": snippet.get("publishedAt"),
        "thumbnail_url": (
            snippet.get("thumbnails", {})
                   .get("high", {})
                   .get("url")
        ),
        "playlist_url": f"https://www.youtube.com/playlist?list={playlist_id}",
        "extracted_at": EXTRACTED_AT
    })



In [7]:
df_playlists_manual_static = pd.DataFrame(rows)

df_playlists_manual_static.shape


(16, 10)

In [8]:
df_playlists_manual_static.dtypes

playlist_id                    object
channel_id                     object
title                          object
description                    object
item_count                      int64
privacy_status                 object
published_at                   object
thumbnail_url                  object
playlist_url                   object
extracted_at      datetime64[ns, UTC]
dtype: object

In [9]:
# Convertir el published_at a fecha
df_playlists_manual_static["published_at"] = (
    pd.to_datetime(df_playlists_manual_static["published_at"], utc=True)
)

In [23]:
df_playlists_manual_static.dtypes

playlist_id                    object
channel_id                     object
title                          object
description                    object
item_count                      int64
privacy_status                 object
published_at      datetime64[ns, UTC]
extracted_at      datetime64[ns, UTC]
dtype: object

In [10]:
df_playlists_manual_static = df_playlists_manual_static[
    [
        "playlist_id",
        "channel_id",
        "title",
        "description",
        "item_count",
        "privacy_status",
        "published_at",
        "thumbnail_url",
        "playlist_url",
        "extracted_at",
    ]
]



> 丘멆잺 Nota (fase de desarrollo)
>
> El guardado en formato Parquet se utiliza temporalmente para pruebas y separaci칩n entre notebooks.
> En la versi칩n final del pipeline (.py), este paso ser치 omitido y el DataFrame se enviar치 directamente a BigQuery, donde residir치 el hist칩rico definitivo.

In [11]:
output_file = PROCESSED_PATH / "playlists_manual_static.parquet"

df_playlists_manual_static.to_parquet(output_file, index=False)

output_file

PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/processed/youtube/playlists_manual_static.parquet')

In [12]:
from pathlib import Path
import pandas as pd

PROJECT_ROOT = Path.cwd().parents[0]
PROCESSED_PATH = PROJECT_ROOT / "data" / "processed" / "youtube"

df_playlists_manual_static = pd.read_parquet(
    PROCESSED_PATH / "playlists_manual_static.parquet"
)

df_playlists_manual_static.head()


Unnamed: 0,playlist_id,channel_id,title,description,item_count,privacy_status,published_at,thumbnail_url,playlist_url,extracted_at
0,PLV4oS06_KpqbsY_I8iR4HRvb6w3vXUBIM,UCUEOHBht8pnQhQvCfIcl-gg,SQL - Repaso,,2,public,2026-01-23 05:43:30.512825+00:00,https://i.ytimg.com/vi/7bwkNrRpgw0/hqdefault.jpg,https://www.youtube.com/playlist?list=PLV4oS06...,2026-02-16 06:55:49.536537+00:00
1,PLV4oS06_KpqZGwOHo-tsdIiaZts7qaqql,UCUEOHBht8pnQhQvCfIcl-gg,Python - Repaso,,3,public,2026-01-11 23:05:18.079131+00:00,https://i.ytimg.com/vi/Zj6uiqMvFOU/hqdefault.jpg,https://www.youtube.com/playlist?list=PLV4oS06...,2026-02-16 06:55:49.536537+00:00
2,PLV4oS06_KpqaqyS9x6h5ys3REiUfUDOgy,UCUEOHBht8pnQhQvCfIcl-gg,Curso gratuito de SQL en BigQuery | Funciones ...,Aprende a realizar c치lculos avanzados en SQL s...,5,public,2026-01-11 22:39:38.064293+00:00,https://i.ytimg.com/vi/XHnA-a5rivU/hqdefault.jpg,https://www.youtube.com/playlist?list=PLV4oS06...,2026-02-16 06:55:49.536537+00:00
3,PLV4oS06_KpqbhnVieDd19KJczH_BlBArN,UCUEOHBht8pnQhQvCfIcl-gg,Git - Repaso,,4,public,2026-01-11 10:10:50.096597+00:00,https://i.ytimg.com/vi/GLJffh1QDM4/hqdefault.jpg,https://www.youtube.com/playlist?list=PLV4oS06...,2026-02-16 06:55:49.536537+00:00
4,PLV4oS06_KpqYRtYRoQHo_F_KsEjmqcDK7,UCUEOHBht8pnQhQvCfIcl-gg,Power Bi - Repaso,Aprende funciones DAX avanzadas aplicadas a ca...,14,public,2025-11-14 07:55:25.117707+00:00,https://i.ytimg.com/vi/J0GOADsEp4E/hqdefault.jpg,https://www.youtube.com/playlist?list=PLV4oS06...,2026-02-16 06:55:49.536537+00:00


In [13]:
df_playlists_manual_static.shape

(16, 10)