# 02 – Extracción RAW YouTube Data API

Este notebook se encarga de extraer datos crudos desde la YouTube Data API v3
y almacenarlos en la capa `data/raw/youtube` sin ningún tipo de transformación.

In [2]:
# Notebooks run with their own directory as the working directory, so the
# pipeline pins the project root explicitly. This keeps relative paths
# consistent and the data directory layout stable.
from pathlib import Path

# The notebook lives in /notebooks, so the project root is the parent directory.
# NOTE(review): this relies on the kernel's working directory being the
# notebooks folder; started from anywhere else it resolves to the wrong root.
PROJECT_ROOT = Path.cwd().parent
PROJECT_ROOT

PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline')

In [3]:
import os
import json
import requests
from pathlib import Path
from dotenv import load_dotenv

# Load variables from a .env file into the process environment; the API
# credentials are read back later with os.getenv.
load_dotenv()


True

In [4]:
# Landing directory for the untouched API payloads; created on demand,
# including any missing parent directories.
RAW_PATH = PROJECT_ROOT.joinpath("data", "raw", "youtube")
RAW_PATH.mkdir(exist_ok=True, parents=True)

RAW_PATH

PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/raw/youtube')

In [5]:
# Credentials and the target channel are taken from the environment.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
CHANNEL_ID = os.environ.get("YOUTUBE_CHANNEL_ID")

# Stop right here when either value is missing or empty, before any request is made.
assert API_KEY, "Falta YOUTUBE_API_KEY"
assert CHANNEL_ID, "Falta YOUTUBE_CHANNEL_ID"

# ✅ channels.json

In [10]:
# Fetch the channel resource (snippet, statistics, contentDetails) for CHANNEL_ID.
url = "https://www.googleapis.com/youtube/v3/channels"

params = {
    "part": "snippet,statistics,contentDetails",
    "id": CHANNEL_ID,
    "key": API_KEY,
}

# requests applies no timeout by default; without one a stalled connection
# would block the notebook indefinitely.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()

# Keep the decoded payload as returned: this is the raw layer.
channels_raw = response.json()


In [11]:
# Persist the channels payload as pretty-printed UTF-8 JSON, keeping
# non-ASCII characters readable instead of escaping them.
output_file = RAW_PATH / "channels.json"
output_file.write_text(
    json.dumps(channels_raw, indent=2, ensure_ascii=False),
    encoding="utf-8",
)

output_file

PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/raw/youtube/channels.json')

In [None]:
# Extract the ID of the channel's "uploads" playlist from the channels response.
# A response without any channel entry (e.g. a wrong channel ID) would otherwise
# surface as a bare KeyError/IndexError, so fail with an explicit message.
channel_items = channels_raw.get("items") or []
if not channel_items:
    raise ValueError(
        "channels response contains no items; check YOUTUBE_CHANNEL_ID"
    )

uploads_playlist_id = (
    channel_items[0]
    ["contentDetails"]
    ["relatedPlaylists"]
    ["uploads"]
)

uploads_playlist_id

'UUUEOHBht8pnQhQvCfIcl-gg'

# ✅ playlists.json

In [15]:
# Fetch every playlist owned by CHANNEL_ID.
# The endpoint returns at most 50 playlists per request, so follow
# nextPageToken until it is exhausted; a single request would silently drop
# everything past the first page.
url = "https://www.googleapis.com/youtube/v3/playlists"

playlists_raw = None
page_token = None

while True:
    params = {
        "part": "snippet,contentDetails,status",
        "channelId": CHANNEL_ID,
        "maxResults": 50,
        "key": API_KEY,
    }

    # pageToken is only sent from the second request onwards.
    if page_token:
        params["pageToken"] = page_token

    # requests applies no timeout by default; set one so a stalled
    # connection cannot hang the notebook.
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()

    page = response.json()

    if playlists_raw is None:
        # The first page is kept as returned (kind, etag, pageInfo, items), so
        # a single-page result is stored exactly as before.
        playlists_raw = page
    else:
        # Later pages only contribute their items to that first envelope.
        playlists_raw.setdefault("items", []).extend(page.get("items", []))

    page_token = page.get("nextPageToken")
    if not page_token:
        break

# Once every page has been merged the first page's token is stale; drop it so
# the stored file does not suggest there is more data to fetch.
if len(playlists_raw.get("items", [])) > 50:
    playlists_raw.pop("nextPageToken", None)

In [16]:
# Store the playlists payload next to the other raw files as indented
# UTF-8 JSON with non-ASCII characters left unescaped.
output_file = RAW_PATH.joinpath("playlists.json")

with output_file.open("w", encoding="utf-8") as handle:
    handle.write(json.dumps(playlists_raw, ensure_ascii=False, indent=2))

output_file

PosixPath('/Users/angelgarciachanga/repositorios/publico/youtube-v3-data-pipeline/data/raw/youtube/playlists.json')

# ✅ playlist_items_uploads.json (uploads)

In [None]:
# Walk every page of the uploads playlist and accumulate the raw entries.
all_items = []
page_token = None

while True:
    params = {
        "part": "snippet,contentDetails",
        "playlistId": uploads_playlist_id,
        "maxResults": 50,
        "key": API_KEY,
    }

    # pageToken is only sent from the second request onwards.
    if page_token:
        params["pageToken"] = page_token

    # requests applies no timeout by default; set one so a stalled
    # connection cannot hang the loop.
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/playlistItems",
        params=params,
        timeout=30,
    )
    response.raise_for_status()

    data = response.json()
    # Tolerate a page without an "items" key instead of raising KeyError.
    all_items.extend(data.get("items", []))

    # A missing nextPageToken marks the last page.
    page_token = data.get("nextPageToken")
    if not page_token:
        break

len(all_items)


199

In [None]:
# Wrap the accumulated playlist entries under a single "items" key and
# write them out as indented UTF-8 JSON.
playlist_items_uploads = dict(items=all_items)

with (RAW_PATH / "playlist_items_uploads.json").open("w", encoding="utf-8") as out:
    json.dump(playlist_items_uploads, out, ensure_ascii=False, indent=2)


# ✅ videos.json

### Se utiliza `playlist_items_uploads` para obtener los IDs de todos los videos del canal

In [None]:
# Collect the video ID of every entry in the uploads playlist, in playlist order.
video_ids = [
    entry["contentDetails"]["videoId"] for entry in playlist_items_uploads["items"]
]

# Quick sanity check: first five IDs and the total count.
(video_ids[:5], len(video_ids))

(['xB4ecIksJSY', '7bwkNrRpgw0', 'HDyKUodeuNw', 'Zj6uiqMvFOU', 'RiYjYfMTGvw'],
 199)

In [21]:
def chunk_list(lst, chunk_size=50):
    """Yield consecutive slices of ``lst`` with at most ``chunk_size`` items each.

    Args:
        lst: Sequence supporting ``len()`` and slicing.
        chunk_size: Maximum number of items per slice; must be positive.

    Yields:
        Slices of ``lst`` in their original order; the last one may be shorter.

    Raises:
        ValueError: If ``chunk_size`` is zero or negative. Because this is a
            generator, the error is raised when iteration starts.
    """
    # A negative size would make range() empty and silently yield nothing,
    # dropping every item; reject it explicitly.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    for start in range(0, len(lst), chunk_size):
        yield lst[start:start + chunk_size]

In [23]:
# Fetch the full video resources in batches of 50 comma-separated IDs per request.
all_videos_items = []

for video_id_chunk in chunk_list(video_ids, 50):

    params = {
        "part": "snippet,statistics,contentDetails",
        "id": ",".join(video_id_chunk),
        "key": API_KEY,
    }

    # requests applies no timeout by default; set one so a stalled
    # connection cannot hang the loop.
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        params=params,
        timeout=30,
    )
    response.raise_for_status()

    data = response.json()
    # Tolerate a batch without an "items" key instead of raising KeyError.
    all_videos_items.extend(data.get("items", []))


In [24]:
# Wrap the collected video resources under a single "items" key.
videos_raw = dict(items=all_videos_items)

# Number of video resources actually returned by the API.
len(videos_raw["items"])

199

In [25]:
# Write the videos payload to the raw layer as indented UTF-8 JSON,
# leaving non-ASCII characters unescaped.
with RAW_PATH.joinpath("videos.json").open(mode="w", encoding="utf-8") as out:
    json.dump(videos_raw, out, ensure_ascii=False, indent=2)
