# 03 – Transformación: channels_static

Este notebook transforma el JSON crudo del recurso `channels`
en una tabla estática que se actualiza y reescribe cada vez que se consulta.

In [None]:
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
_working_dir = Path.cwd()
PROJECT_ROOT = _working_dir.parent
PROJECT_ROOT

In [None]:
import json
import pandas as pd
from datetime import datetime, date

In [None]:
# Input (raw JSON) and output (processed tables) folders for the YouTube source.
RAW_PATH = PROJECT_ROOT.joinpath("data", "raw", "youtube")
PROCESSED_PATH = PROJECT_ROOT.joinpath("data", "processed", "youtube")

(RAW_PATH, PROCESSED_PATH)


In [None]:
# Create the output folder (and any missing parents); a no-op if it already exists.
PROCESSED_PATH.mkdir(exist_ok=True, parents=True)


In [None]:
# Load the raw `channels` payload from disk as a Python dict.
channels_json_path = RAW_PATH / "channels.json"
channels_raw = json.loads(channels_json_path.read_text(encoding="utf-8"))

channels_raw.keys()


In [None]:
# Peek at the first entry of the payload to see which top-level fields it has.
channel_items = channels_raw["items"]
item = channel_items[0]
item.keys()


In [None]:
from datetime import datetime, timezone

# Timestamp stored in the `extracted_at` column of the static table.
# Built as a timezone-aware UTC datetime: `datetime.utcnow()` is deprecated
# since Python 3.12 and returns a naive value that carries no UTC marker.
EXTRACTED_AT = datetime.now(timezone.utc)

EXTRACTED_AT

In [None]:
def _build_channel_static_row(item, extracted_at):
    """Flatten one entry of the raw `items` list into a row of the static table.

    Parameters
    ----------
    item : dict
        One element of ``channels_raw["items"]``. Must contain ``id`` and a
        ``snippet`` dict with ``title`` and ``publishedAt``; ``description``,
        ``country`` and ``thumbnails`` are optional.
    extracted_at : datetime-like
        Extraction timestamp; stored as a UTC timestamp.

    Returns
    -------
    dict
        Column name -> value for a single channel.

    Raises
    ------
    KeyError
        If one of the required keys listed above is missing.
    """
    snippet = item["snippet"]
    thumbnails = snippet.get("thumbnails", {})

    # Take the first size, in this order, that carries a non-empty URL.
    thumbnail_url = (
        thumbnails.get("maxres", {}).get("url") or
        thumbnails.get("high", {}).get("url") or
        thumbnails.get("medium", {}).get("url") or
        thumbnails.get("default", {}).get("url")
    )

    return {
        "channel_id": item["id"],
        "channel_title": snippet["title"],
        "description": snippet.get("description"),
        "country": snippet.get("country"),
        "published_at": pd.to_datetime(snippet["publishedAt"], utc=True),
        "thumbnail_url": thumbnail_url,
        "channel_url": f"https://www.youtube.com/channel/{item['id']}",
        "extracted_at": pd.to_datetime(extracted_at, utc=True),
    }


# Transform every channel in the payload, not only the first one, so that
# no channel is silently dropped when the JSON holds several items.
channel_items = channels_raw.get("items") or []
if not channel_items:
    # Fail with an explicit message instead of an opaque KeyError/IndexError.
    raise ValueError("channels.json contains no channel items; nothing to transform")

df_channels_static = pd.DataFrame(
    [_build_channel_static_row(channel, EXTRACTED_AT) for channel in channel_items]
)

df_channels_static

In [None]:
# Sanity check: (rows, columns) of the static table.
df_channels_static.shape

In [None]:
# Sanity check: dtype of each column before the table is written to Parquet.
df_channels_static.dtypes

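Nota: `snapshot_date` → los objetos `datetime.date` se almacenan con dtype `object`. Esto es normal: no es un string ni un error, sino simplemente la forma en que pandas maneja `date`. (Esta tabla no incluye `snapshot_date`; sus columnas de fecha, `published_at` y `extracted_at`, se crean con `pd.to_datetime(..., utc=True)`.)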

In [None]:
# Put the columns in their final order; a missing column raises KeyError.
STATIC_COLUMN_ORDER = [
    "channel_id",
    "channel_title",
    "description",
    "country",
    "published_at",
    "thumbnail_url",
    "channel_url",
    "extracted_at",
]
df_channels_static = df_channels_static.loc[:, STATIC_COLUMN_ORDER]



> ⚠️ Nota (fase de desarrollo)
>
> El guardado en formato Parquet se utiliza temporalmente para pruebas y separación entre notebooks.
> En la versión final del pipeline (.py), este paso será omitido y el DataFrame se enviará directamente a BigQuery, donde residirá el histórico definitivo.

In [None]:
# Write the static table to Parquet, without the DataFrame index.
output_file = PROCESSED_PATH.joinpath("channels_static.parquet")
df_channels_static.to_parquet(path=output_file, index=False)

output_file


In [None]:
import pandas as pd
from pathlib import Path

# Read the Parquet file back from disk to confirm what was written.
_working_dir = Path.cwd()
PROJECT_ROOT = _working_dir.parent
file_path = PROJECT_ROOT.joinpath("data", "processed", "youtube", "channels_static.parquet")

df = pd.read_parquet(path=file_path)
df
