# 04 – Transformación: videos_static

Este notebook transforma el JSON crudo del recurso `videos`
en una tabla estática que se actualiza y reescribe cada vez que se consulta.

In [20]:
from pathlib import Path

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT = Path.cwd().parent

# Locations of the raw input and the processed output for the YouTube source.
RAW_PATH = PROJECT_ROOT.joinpath("data", "raw", "youtube")
PROCESSED_PATH = PROJECT_ROOT.joinpath("data", "processed", "youtube")

# Create the output directory and any missing parents; no error if it exists.
PROCESSED_PATH.mkdir(exist_ok=True, parents=True)

RAW_PATH, PROCESSED_PATH

(PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/raw/youtube'),
 PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/processed/youtube'))

In [21]:
import json
import pandas as pd
from datetime import datetime, date
import isodate

In [22]:
# Read the raw `videos` payload from disk as UTF-8 text and parse it as JSON.
videos_json_path = RAW_PATH / "videos.json"
videos_raw = json.loads(videos_json_path.read_text(encoding="utf-8"))

# Number of entries under the "items" key of the payload.
len(videos_raw["items"])


199

In [13]:
from datetime import datetime, timezone

# Timezone-aware UTC timestamp captured when this cell runs; it is written
# into the "extracted_at" column of every row built further down.
EXTRACTED_AT = datetime.now(tz=timezone.utc)

EXTRACTED_AT

datetime.datetime(2026, 2, 14, 22, 2, 51, 77435, tzinfo=datetime.timezone.utc)

In [23]:
import isodate

def iso_duration_to_seconds(duration):
    """Convert an ISO 8601 duration string into a whole number of seconds.

    Args:
        duration: Duration text such as ``"PT1M9S"``. May be ``None`` or
            empty when the field is absent from the payload.

    Returns:
        The duration truncated to an ``int`` number of seconds, or ``None``
        when no value was supplied or the value cannot be parsed.
    """
    # Missing or empty values are expected input; skip the parser for them.
    if not duration:
        return None

    try:
        return int(isodate.parse_duration(duration).total_seconds())
    except (ValueError, TypeError, AttributeError):
        # Only data problems are swallowed: a malformed string (isodate's
        # ISO8601Error subclasses ValueError), a non-string input, or a parsed
        # result without total_seconds(). Anything else (e.g. a NameError from
        # a missing import) surfaces instead of silently becoming None.
        return None


In [24]:
# Thumbnail renditions, ordered from most to least preferred.
THUMBNAIL_SIZES = ("maxres", "standard", "high", "medium", "default")


def build_static_row(video):
    """Flatten one item of the raw payload into a row of the static table.

    Reads ``id``, ``snippet`` and ``contentDetails`` from ``video`` and stamps
    the row with the module-level ``EXTRACTED_AT`` timestamp.
    """
    snippet = video["snippet"]
    available = snippet.get("thumbnails", {})

    # Walk the renditions in preference order and stop at the first non-empty
    # URL; when none is usable the value is whatever the last lookup returned.
    best_thumbnail = None
    for size in THUMBNAIL_SIZES:
        best_thumbnail = available.get(size, {}).get("url")
        if best_thumbnail:
            break

    video_id = video["id"]
    watch_url = f"https://www.youtube.com/watch?v={video_id}"

    return {
        "video_id": video_id,
        "channel_id": snippet["channelId"],
        "title": snippet["title"],
        "description": snippet.get("description"),
        "category_id": snippet.get("categoryId"),
        # Parsed as a timezone-aware UTC timestamp.
        "published_at": pd.to_datetime(snippet["publishedAt"], utc=True),
        "duration_seconds": iso_duration_to_seconds(
            video["contentDetails"].get("duration")
        ),
        "thumbnail_url": best_thumbnail,
        "video_url": watch_url,
        "extracted_at": EXTRACTED_AT,
    }


# One dict per item; the DataFrame is built once from the full list.
rows = [build_static_row(video) for video in videos_raw["items"]]

df_videos_static = pd.DataFrame(rows)
df_videos_static.head()



Unnamed: 0,video_id,channel_id,title,description,category_id,published_at,duration_seconds,thumbnail_url,video_url,extracted_at
0,xB4ecIksJSY,UCUEOHBht8pnQhQvCfIcl-gg,4. Git Push - Actualizar cambios de un reposit...,,27,2026-01-24 12:04:21+00:00,960,https://i.ytimg.com/vi/xB4ecIksJSY/maxresdefau...,https://www.youtube.com/watch?v=xB4ecIksJSY,2026-02-14 22:02:51.077435+00:00
1,7bwkNrRpgw0,UCUEOHBht8pnQhQvCfIcl-gg,Modo oscuro - Big query - Google Cloud,,27,2026-01-23 06:52:23+00:00,69,https://i.ytimg.com/vi/7bwkNrRpgw0/maxresdefau...,https://www.youtube.com/watch?v=7bwkNrRpgw0,2026-02-14 22:02:51.077435+00:00
2,HDyKUodeuNw,UCUEOHBht8pnQhQvCfIcl-gg,Como seleccionar valores iguales en SQL,,27,2026-01-23 06:43:39+00:00,294,https://i.ytimg.com/vi/HDyKUodeuNw/maxresdefau...,https://www.youtube.com/watch?v=HDyKUodeuNw,2026-02-14 22:02:51.077435+00:00
3,Zj6uiqMvFOU,UCUEOHBht8pnQhQvCfIcl-gg,3. Cómo funciona el bucle For en python - Repa...,,27,2026-01-17 20:07:55+00:00,1186,https://i.ytimg.com/vi/Zj6uiqMvFOU/maxresdefau...,https://www.youtube.com/watch?v=Zj6uiqMvFOU,2026-02-14 22:02:51.077435+00:00
4,RiYjYfMTGvw,UCUEOHBht8pnQhQvCfIcl-gg,2. Qué es el PATH en la instalación de python,,27,2026-01-11 23:25:57+00:00,1138,https://i.ytimg.com/vi/RiYjYfMTGvw/maxresdefau...,https://www.youtube.com/watch?v=RiYjYfMTGvw,2026-02-14 22:02:51.077435+00:00


In [25]:
# Sanity check: (row count, column count) of the static table.
df_videos_static.shape

(199, 10)

In [26]:
# Inspect the dtype pandas assigned to each column.
df_videos_static.dtypes

video_id                            str
channel_id                          str
title                               str
description                         str
category_id                         str
published_at        datetime64[us, UTC]
duration_seconds                  int64
thumbnail_url                       str
video_url                           str
extracted_at        datetime64[us, UTC]
dtype: object

In [27]:
# Explicit column order for the static table.
STATIC_COLUMNS = [
    "video_id", "channel_id", "title", "description", "category_id",
    "published_at", "duration_seconds", "thumbnail_url", "video_url",
    "extracted_at",
]

# Selecting with a list returns the columns in exactly this order.
df_videos_static = df_videos_static[STATIC_COLUMNS]


> ⚠️ Nota (fase de desarrollo)
>
> El guardado en formato Parquet se utiliza temporalmente para pruebas y separación entre notebooks.
> En la versión final del pipeline (.py), este paso será omitido y el DataFrame se enviará directamente a BigQuery, donde residirá el histórico definitivo.

In [28]:
# Destination of the Parquet snapshot inside the processed-data directory.
output_file = PROCESSED_PATH.joinpath("videos_static.parquet")

# Persist the table without writing the DataFrame index.
df_videos_static.to_parquet(path=output_file, index=False)

output_file


PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/processed/youtube/videos_static.parquet')

In [31]:
import pandas as pd
from pathlib import Path

# Rebuild the path from the project root so this cell does not rely on
# variables from earlier cells, then load the Parquet file for inspection.
PROJECT_ROOT = Path.cwd().parent
file_path = PROJECT_ROOT.joinpath(
    "data", "processed", "youtube", "videos_static.parquet"
)

df = pd.read_parquet(file_path)
df

Unnamed: 0,video_id,channel_id,title,description,category_id,published_at,duration_seconds,thumbnail_url,video_url,extracted_at
0,xB4ecIksJSY,UCUEOHBht8pnQhQvCfIcl-gg,4. Git Push - Actualizar cambios de un reposit...,,27,2026-01-24 12:04:21+00:00,960,https://i.ytimg.com/vi/xB4ecIksJSY/maxresdefau...,https://www.youtube.com/watch?v=xB4ecIksJSY,2026-02-14 22:02:51.077435+00:00
1,7bwkNrRpgw0,UCUEOHBht8pnQhQvCfIcl-gg,Modo oscuro - Big query - Google Cloud,,27,2026-01-23 06:52:23+00:00,69,https://i.ytimg.com/vi/7bwkNrRpgw0/maxresdefau...,https://www.youtube.com/watch?v=7bwkNrRpgw0,2026-02-14 22:02:51.077435+00:00
2,HDyKUodeuNw,UCUEOHBht8pnQhQvCfIcl-gg,Como seleccionar valores iguales en SQL,,27,2026-01-23 06:43:39+00:00,294,https://i.ytimg.com/vi/HDyKUodeuNw/maxresdefau...,https://www.youtube.com/watch?v=HDyKUodeuNw,2026-02-14 22:02:51.077435+00:00
3,Zj6uiqMvFOU,UCUEOHBht8pnQhQvCfIcl-gg,3. Cómo funciona el bucle For en python - Repa...,,27,2026-01-17 20:07:55+00:00,1186,https://i.ytimg.com/vi/Zj6uiqMvFOU/maxresdefau...,https://www.youtube.com/watch?v=Zj6uiqMvFOU,2026-02-14 22:02:51.077435+00:00
4,RiYjYfMTGvw,UCUEOHBht8pnQhQvCfIcl-gg,2. Qué es el PATH en la instalación de python,,27,2026-01-11 23:25:57+00:00,1138,https://i.ytimg.com/vi/RiYjYfMTGvw/maxresdefau...,https://www.youtube.com/watch?v=RiYjYfMTGvw,2026-02-14 22:02:51.077435+00:00
...,...,...,...,...,...,...,...,...,...,...
194,_wQMZfzqCAo,UCUEOHBht8pnQhQvCfIcl-gg,¿Cómo hacer un ANALISIS EXPLORATORIO DE DATOS ...,Activar macro para estadísticos descriptivo 1:...,27,2022-12-14 07:40:34+00:00,1643,https://i.ytimg.com/vi/_wQMZfzqCAo/maxresdefau...,https://www.youtube.com/watch?v=_wQMZfzqCAo,2026-02-14 22:02:51.077435+00:00
195,QC05ysMl8oc,UCUEOHBht8pnQhQvCfIcl-gg,Tipo de relaciones entre tablas,Descarga de archivos:\n\nRelación de uno a muc...,27,2022-12-11 09:16:21+00:00,1759,https://i.ytimg.com/vi/QC05ysMl8oc/maxresdefau...,https://www.youtube.com/watch?v=QC05ysMl8oc,2026-02-14 22:02:51.077435+00:00
196,mqLksU5um3Q,UCUEOHBht8pnQhQvCfIcl-gg,Calidad de datos en POWER QUERY,1) ¿Cómo FILTRAR ERRORES en power query? 3:00\...,27,2022-12-08 09:53:54+00:00,1287,https://i.ytimg.com/vi/mqLksU5um3Q/maxresdefau...,https://www.youtube.com/watch?v=mqLksU5um3Q,2026-02-14 22:02:51.077435+00:00
197,ZgZVpp61cYY,UCUEOHBht8pnQhQvCfIcl-gg,¿Qué es la desviación estandar?,,22,2022-11-08 10:21:48+00:00,1242,https://i.ytimg.com/vi/ZgZVpp61cYY/maxresdefau...,https://www.youtube.com/watch?v=ZgZVpp61cYY,2026-02-14 22:02:51.077435+00:00
