# 04 – Transformación: videos_snapshot

Este notebook transforma el JSON crudo del recurso `videos`
en una tabla de snapshot mensual por video, que permite
analizar métricas de rendimiento (vistas, likes, comentarios)
y su evolución mes a mes.

In [2]:
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent

RAW_PATH = PROJECT_ROOT / "data" / "raw" / "youtube"
PROCESSED_PATH = PROJECT_ROOT / "data" / "processed" / "youtube"

PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

RAW_PATH, PROCESSED_PATH

(PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/raw/youtube'),
 PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/processed/youtube'))

In [3]:
import json
import pandas as pd
from datetime import datetime, date
import isodate

In [4]:
with open(RAW_PATH / "videos.json", "r", encoding="utf-8") as f:
    videos_raw = json.load(f)

len(videos_raw["items"])


199

In [5]:
from datetime import datetime, date

EXTRACTED_AT = datetime.utcnow()

EXTRACTED_AT

datetime.datetime(2026, 2, 16, 2, 24, 1, 189973)

In [6]:
import isodate

def iso_duration_to_seconds(duration):
    try:
        return int(isodate.parse_duration(duration).total_seconds())
    except Exception:
        return None


In [22]:
from datetime import date, datetime, timezone

# Fecha de snapshot (solo fecha, sin hora)
SNAPSHOT_DATE = date.today()

# Timestamp explícitamente en UTC (timezone-aware)
EXTRACTED_AT = datetime.now(timezone.utc)

SNAPSHOT_DATE, EXTRACTED_AT

(datetime.date(2026, 2, 14),
 datetime.datetime(2026, 2, 14, 22, 16, 51, 958746, tzinfo=datetime.timezone.utc))

In [23]:
import pandas as pd

rows = []

for item in videos_raw["items"]:
    rows.append({
        "snapshot_date": SNAPSHOT_DATE,
        "video_id": item["id"],
        "channel_id": item["snippet"]["channelId"],
        "published_at": pd.to_datetime(
            item["snippet"]["publishedAt"],
            utc=True
        ),
        "duration_seconds": iso_duration_to_seconds(
            item["contentDetails"].get("duration")
        ),
        "view_count": int(item["statistics"].get("viewCount", 0)),
        "like_count": int(item["statistics"].get("likeCount", 0)),
        "comment_count": int(item["statistics"].get("commentCount", 0)),
        "extracted_at": EXTRACTED_AT
    })

df_videos_snapshot = pd.DataFrame(rows)
df_videos_snapshot.head()



Unnamed: 0,snapshot_date,video_id,channel_id,published_at,duration_seconds,view_count,like_count,comment_count,extracted_at
0,2026-02-14,xB4ecIksJSY,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-24 12:04:21+00:00,960,30,1,0,2026-02-14 22:16:51.958746+00:00
1,2026-02-14,7bwkNrRpgw0,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-23 06:52:23+00:00,69,16,2,0,2026-02-14 22:16:51.958746+00:00
2,2026-02-14,HDyKUodeuNw,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-23 06:43:39+00:00,294,9,1,0,2026-02-14 22:16:51.958746+00:00
3,2026-02-14,Zj6uiqMvFOU,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-17 20:07:55+00:00,1186,18,1,0,2026-02-14 22:16:51.958746+00:00
4,2026-02-14,RiYjYfMTGvw,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-11 23:25:57+00:00,1138,13,0,0,2026-02-14 22:16:51.958746+00:00


In [24]:
df_videos_snapshot.shape

(199, 9)

In [25]:
df_videos_snapshot.dtypes

snapshot_date                    object
video_id                            str
channel_id                          str
published_at        datetime64[us, UTC]
duration_seconds                  int64
view_count                        int64
like_count                        int64
comment_count                     int64
extracted_at        datetime64[us, UTC]
dtype: object

In [11]:
type(df_videos_snapshot["snapshot_date"].iloc[0])

datetime.date

In [26]:
#Ordenar columnas
df_videos_snapshot = df_videos_snapshot[
    [
        "snapshot_date",
        "video_id",
        "channel_id",
        "published_at",
        "duration_seconds",
        "view_count",
        "like_count",
        "comment_count",
        "extracted_at"
    ]
]

> ⚠️ Nota (fase de desarrollo)
>
> El guardado en formato Parquet se utiliza temporalmente para pruebas y separación entre notebooks.
> En la versión final del pipeline (.py), este paso será omitido y el DataFrame se enviará directamente a BigQuery, donde residirá el histórico definitivo.

In [None]:
output_file = PROCESSED_PATH / "videos_snapshot.parquet"

df_videos_snapshot.to_parquet(output_file, index=False)

output_file



PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/processed/youtube/videos_snapshot.parquet')

In [15]:
import pandas as pd
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
file_path = PROJECT_ROOT / "data" / "processed" / "youtube" / "videos_snapshot.parquet"

df = pd.read_parquet(file_path)
df

Unnamed: 0,snapshot_date,video_id,channel_id,published_at,duration_seconds,view_count,like_count,comment_count,extracted_at
0,2026-02-14,xB4ecIksJSY,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-24 12:04:21+00:00,960,30,1,0,2026-02-14 21:35:02.223047+00:00
1,2026-02-14,7bwkNrRpgw0,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-23 06:52:23+00:00,69,16,2,0,2026-02-14 21:35:02.223047+00:00
2,2026-02-14,HDyKUodeuNw,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-23 06:43:39+00:00,294,9,1,0,2026-02-14 21:35:02.223047+00:00
3,2026-02-14,Zj6uiqMvFOU,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-17 20:07:55+00:00,1186,18,1,0,2026-02-14 21:35:02.223047+00:00
4,2026-02-14,RiYjYfMTGvw,UCUEOHBht8pnQhQvCfIcl-gg,2026-01-11 23:25:57+00:00,1138,13,0,0,2026-02-14 21:35:02.223047+00:00
...,...,...,...,...,...,...,...,...,...
194,2026-02-14,_wQMZfzqCAo,UCUEOHBht8pnQhQvCfIcl-gg,2022-12-14 07:40:34+00:00,1643,1228,38,0,2026-02-14 21:35:02.223047+00:00
195,2026-02-14,QC05ysMl8oc,UCUEOHBht8pnQhQvCfIcl-gg,2022-12-11 09:16:21+00:00,1759,622,7,1,2026-02-14 21:35:02.223047+00:00
196,2026-02-14,mqLksU5um3Q,UCUEOHBht8pnQhQvCfIcl-gg,2022-12-08 09:53:54+00:00,1287,462,15,2,2026-02-14 21:35:02.223047+00:00
197,2026-02-14,ZgZVpp61cYY,UCUEOHBht8pnQhQvCfIcl-gg,2022-11-08 10:21:48+00:00,1242,203,8,2,2026-02-14 21:35:02.223047+00:00
