Variables de entorno

In [3]:

import os

# Environment variables that must be set (and non-empty) for the rest of
# the notebook to work.
REQ = [
    "SNOWFLAKE_HOST",
    "SNOWFLAKE_USER",
    "SNOWFLAKE_PASSWORD",
    "SNOWFLAKE_DATABASE",
    "SNOWFLAKE_SCHEMA_RAW",
    "SNOWFLAKE_SCHEMA_ANALYTICS",
    "SNOWFLAKE_WAREHOUSE",
    "SNOWFLAKE_ROLE",
]

# os.getenv returns None when unset and "" when set-but-empty; both count
# as missing.
missing = [k for k in REQ if os.getenv(k) in (None, "")]
assert not missing, f"Faltan en .env: {missing} — complétalas y reintenta."

# Prefer an explicit SNOWFLAKE_ACCOUNT; otherwise derive it from the host by
# dropping the ".snowflakecomputing.com" suffix.
sf_account = os.getenv("SNOWFLAKE_ACCOUNT")
if not sf_account:
    host = os.getenv("SNOWFLAKE_HOST")  # e.g. <account>.<region>.snowflakecomputing.com
    sf_account = host.replace(".snowflakecomputing.com", "")

# Directory holding the local parquet files; overridable through DATA_ROOT.
DATA_ROOT = os.getenv("DATA_ROOT", "/home/jovyan/work/datasets")

# Echo the non-secret settings so the reader can confirm what was picked up.
print("OK .env cargado.")
print("  DATA_ROOT =", DATA_ROOT)
print("  ACCOUNT   =", sf_account)
print(
    "  DB/RAW/AN =",
    os.getenv("SNOWFLAKE_DATABASE"),
    os.getenv("SNOWFLAKE_SCHEMA_RAW"),
    os.getenv("SNOWFLAKE_SCHEMA_ANALYTICS"),
)
print(
    "  WH/ROLE   =",
    os.getenv("SNOWFLAKE_WAREHOUSE"),
    os.getenv("SNOWFLAKE_ROLE"),
)


OK .env cargado.
  DATA_ROOT = /home/jovyan/work/datasets
  ACCOUNT   = xpc24435.us-east-1
  DB/RAW/AN = DM_PSET3 RAW ANALYTICS
  WH/ROLE   = WH_DM SYSADMIN


In [4]:


from pyspark.sql import SparkSession

# Stop any session left over from a previous run so the builder starts clean.
try:
    spark.stop()
except NameError:
    # First run in this kernel: no `spark` variable exists yet.
    pass
except Exception as exc:
    # Best effort: a session that fails to stop must not block creating a new
    # one, but the failure is reported instead of being silently swallowed.
    # (A bare `except:` would also have trapped KeyboardInterrupt/SystemExit.)
    print("⚠️ No se pudo detener la sesión Spark previa:", exc)

spark = SparkSession.builder.getOrCreate()

# Options handed to every Snowflake read/write in this notebook.
# The default schema is the RAW schema; sfAccount honours an explicit
# SNOWFLAKE_ACCOUNT and only falls back to deriving it from the host name.
sfOptions = {
    "sfURL": os.getenv("SNOWFLAKE_HOST"),
    "sfUser": os.getenv("SNOWFLAKE_USER"),
    "sfPassword": os.getenv("SNOWFLAKE_PASSWORD"),
    "sfDatabase": os.getenv("SNOWFLAKE_DATABASE"),
    "sfSchema": os.getenv("SNOWFLAKE_SCHEMA_RAW"),
    "sfWarehouse": os.getenv("SNOWFLAKE_WAREHOUSE"),
    "sfRole": os.getenv("SNOWFLAKE_ROLE"),
    "sfAccount": (
        os.getenv("SNOWFLAKE_ACCOUNT")
        or os.getenv("SNOWFLAKE_HOST").replace(".snowflakecomputing.com", "")
    ),
}

# Only non-secret values are echoed.
print("Spark listo ✔️")
print("  RAW schema :", os.getenv("SNOWFLAKE_SCHEMA_RAW"))
print("  URL/Acct   :", os.getenv("SNOWFLAKE_HOST"), "|", sfOptions["sfAccount"])


Spark listo ✔️
  RAW schema : RAW
  URL/Acct   : xpc24435.us-east-1.snowflakecomputing.com | xpc24435.us-east-1


In [5]:


from pathlib import Path
from pyspark.sql.functions import col

def _normalize_timestamps(df):
    """
    Cast the usual TLC pickup/dropoff datetime columns to Spark's plain
    'timestamp' type so the Spark↔Snowflake connector accepts them.

    Only columns that are actually present in ``df`` are touched; a frame
    with none of them is returned unchanged.
    """
    tlc_datetime_columns = (
        "tpep_pickup_datetime", "tpep_dropoff_datetime",   # yellow
        "lpep_pickup_datetime", "lpep_dropoff_datetime",   # green
    )
    # Casting replaces a column under the same name, so the set of column
    # names can be resolved once up front.
    present = [name for name in tlc_datetime_columns if name in df.columns]
    for name in present:
        df = df.withColumn(name, col(name).cast("timestamp"))
    return df

def load_month(service: str, year: int, month: str, sample: int | None = None) -> bool:
    """
    Lee un parquet local (DATA_ROOT/<service>_tripdata_<YYYY>-<MM>.parquet)
    y lo escribe en Snowflake como RAW.TRIPS_<SERVICE>_<YYYY>_<MM>.
    - 'service' : 'yellow' o 'green'
    - 'month'   : '01'..'12' (dos dígitos)
    - sample    : si es int, limita filas para smoke test; si None, carga completa
    """
    fname = f"{service}_tripdata_{year}-{month}.parquet"
    src = Path(DATA_ROOT) / fname
    if not src.exists():
        print(f"⚠️ No existe: {src}")
        return False

    df = spark.read.parquet(str(src))
    if sample:
        df = df.limit(sample)

    df = _normalize_timestamps(df)

    target = f'{os.getenv("SNOWFLAKE_SCHEMA_RAW")}.TRIPS_{service.upper()}_{year}_{month}'
    (df.write.format("snowflake")
       .options(**sfOptions)
       .option("dbtable", target)
       .mode("overwrite")
       .save())
    print("✅ Cargado:", target)
    return True

def sf_count(table: str) -> int:
    """
    Return COUNT(*) for a Snowflake table given as 'SCHEMA.TABLE'.

    The count is run as a query through the Spark Snowflake connector using
    the notebook-wide ``sfOptions``.
    """
    reader = spark.read.format("snowflake").options(**sfOptions)
    result = reader.option("query", f"SELECT COUNT(*) AS c FROM {table}").load()
    first_row = result.collect()[0]
    # The alias is written lowercase in the query but read back as "C".
    return int(first_row["C"])

print("helpers listos ✔️")


helpers listos ✔️


In [9]:

from pathlib import Path

SERVICE, YEAR, MONTH = "yellow", 2015, "01"
RAW = os.getenv("SNOWFLAKE_SCHEMA_RAW")
REAL_TABLE = f"{RAW}.TRIPS_{SERVICE.upper()}_{YEAR}_{MONTH}"

# 1) Snowflake connectivity: run a trivial query that reports the session context.
ping = (
    spark.read.format("snowflake")
    .options(**sfOptions)
    .option(
        "query",
        "SELECT CURRENT_ACCOUNT() AS acct, CURRENT_WAREHOUSE() AS wh, CURRENT_ROLE() AS role, CURRENT_VERSION() AS ver",
    )
    .load()
)
print("✅ Conexión Snowflake OK:")
ping.show(truncate=False)

# 2) The local parquet must exist; read a 1000-row sample to prove it is readable.
src = Path(DATA_ROOT) / f"{SERVICE}_tripdata_{YEAR}-{MONTH}.parquet"
assert src.exists(), f"No existe el parquet local: {src}"
df_local = spark.read.parquet(str(src)).limit(1000)
print("✅ Lectura local OK (sample 1000):", df_local.count(), "filas")

# 3) Row count of the target table; a failure here is reported, not raised,
#    because the table may simply not have been created yet.
try:
    cnt_real = sf_count(REAL_TABLE)
except Exception as e:
    print(f"⚠️ No se pudo contar {REAL_TABLE} (quizá no existe aún):", e)
else:
    print(f"✅ COUNT real en Snowflake ({REAL_TABLE}):", cnt_real)


✅ Conexión Snowflake OK:
+--------+-----+--------+------+
|ACCT    |WH   |ROLE    |VER   |
+--------+-----+--------+------+
|XPC24435|WH_DM|SYSADMIN|9.32.1|
+--------+-----+--------+------+

✅ Lectura local OK (sample 1000): 1000 filas
✅ COUNT real en Snowflake (RAW.TRIPS_YELLOW_2015_01): 1000


In [10]:
# CELL 5 — Plan: overwrite YELLOW 2015-01 in full (no sample)

SERVICES, YEARS, MONTHS = ["yellow"], [2015], ["01"]

print("Plan:", SERVICES, YEARS, MONTHS, "→ overwrite completo")


Plan: ['yellow'] [2015] ['01'] → overwrite completo


In [11]:
# CELL 6 — Run the load described by the plan cell (SERVICES / YEARS / MONTHS)

for svc in SERVICES:
    for y in YEARS:
        for m in MONTHS:
            try:
                ok = load_month(svc, y, m)   # full load (no sample)
                if not ok:
                    # Source parquet missing; load_month already printed a warning.
                    continue
                table = f'{os.getenv("SNOWFLAKE_SCHEMA_RAW")}.TRIPS_{svc.upper()}_{y}_{m}'
                c = sf_count(table)
                print(f"✅ Cargado: {table}  |  COUNT = {c}")
            except Exception as e:
                # Report and move on so one bad month does not abort the batch.
                print(f"❌ Error en {svc} {y}-{m}:", e)


✅ Cargado: RAW.TRIPS_YELLOW_2015_01
✅ Cargado: RAW.TRIPS_YELLOW_2015_01  |  COUNT = 12741035


In [12]:


# Plan: YELLOW 2017, months 09..12
SERVICES = ["yellow"]
YEARS = [2017]
MONTHS = [f"{month_number:02d}" for month_number in range(9, 13)]
print("Plan:", SERVICES, YEARS, MONTHS)


Plan: ['yellow'] [2017] ['09', '10', '11', '12']


In [13]:
# CELL 6 — Run the load for the current plan (SERVICES / YEARS / MONTHS)

for svc in SERVICES:
    for y in YEARS:
        for m in MONTHS:
            try:
                ok = load_month(svc, y, m)   # full load (no sample)
                if not ok:
                    # Nothing was written (parquet not found); skip the count.
                    continue
                table = f'{os.getenv("SNOWFLAKE_SCHEMA_RAW")}.TRIPS_{svc.upper()}_{y}_{m}'
                c = sf_count(table)
                print(f"✅ Cargado: {table}  |  COUNT = {c}")
            except Exception as e:
                # Keep going: a failing month is reported but does not stop the rest.
                print(f"❌ Error en {svc} {y}-{m}:", e)


✅ Cargado: RAW.TRIPS_YELLOW_2017_09
✅ Cargado: RAW.TRIPS_YELLOW_2017_09  |  COUNT = 8945421
✅ Cargado: RAW.TRIPS_YELLOW_2017_10
✅ Cargado: RAW.TRIPS_YELLOW_2017_10  |  COUNT = 9768672
✅ Cargado: RAW.TRIPS_YELLOW_2017_11
✅ Cargado: RAW.TRIPS_YELLOW_2017_11  |  COUNT = 9284803
✅ Cargado: RAW.TRIPS_YELLOW_2017_12
✅ Cargado: RAW.TRIPS_YELLOW_2017_12  |  COUNT = 9508501


In [14]:
# Plan: YELLOW 2018, all twelve months
SERVICES = ["yellow"]
YEARS = [2018]
MONTHS = [str(month_number).zfill(2) for month_number in range(1, 13)]
print("Plan:", SERVICES, YEARS, MONTHS)

Plan: ['yellow'] [2018] ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']


In [15]:
# CELL 6 — Run the load for the current plan (SERVICES / YEARS / MONTHS)
for svc in SERVICES:
    for y in YEARS:
        for m in MONTHS:
            try:
                ok = load_month(svc, y, m)
                if not ok:
                    # Source file absent; load_month has already said so.
                    continue
                table = f'{os.getenv("SNOWFLAKE_SCHEMA_RAW")}.TRIPS_{svc.upper()}_{y}_{m}'
                c = sf_count(table)
                print(f"✅ Cargado: {table}  |  COUNT = {c}")
            except Exception as e:
                # One failing month must not abort the remaining ones.
                print(f"❌ Error en {svc} {y}-{m}:", e)


✅ Cargado: RAW.TRIPS_YELLOW_2018_01
✅ Cargado: RAW.TRIPS_YELLOW_2018_01  |  COUNT = 8760687
✅ Cargado: RAW.TRIPS_YELLOW_2018_02
✅ Cargado: RAW.TRIPS_YELLOW_2018_02  |  COUNT = 8492819
✅ Cargado: RAW.TRIPS_YELLOW_2018_03
✅ Cargado: RAW.TRIPS_YELLOW_2018_03  |  COUNT = 9431289
✅ Cargado: RAW.TRIPS_YELLOW_2018_04
✅ Cargado: RAW.TRIPS_YELLOW_2018_04  |  COUNT = 9306216
✅ Cargado: RAW.TRIPS_YELLOW_2018_05
✅ Cargado: RAW.TRIPS_YELLOW_2018_05  |  COUNT = 9224788
✅ Cargado: RAW.TRIPS_YELLOW_2018_06
✅ Cargado: RAW.TRIPS_YELLOW_2018_06  |  COUNT = 8714667
✅ Cargado: RAW.TRIPS_YELLOW_2018_07
✅ Cargado: RAW.TRIPS_YELLOW_2018_07  |  COUNT = 7851143
✅ Cargado: RAW.TRIPS_YELLOW_2018_08
✅ Cargado: RAW.TRIPS_YELLOW_2018_08  |  COUNT = 7855040
✅ Cargado: RAW.TRIPS_YELLOW_2018_09
✅ Cargado: RAW.TRIPS_YELLOW_2018_09  |  COUNT = 8049094
✅ Cargado: RAW.TRIPS_YELLOW_2018_10
✅ Cargado: RAW.TRIPS_YELLOW_2018_10  |  COUNT = 8834520
✅ Cargado: RAW.TRIPS_YELLOW_2018_11
✅ Cargado: RAW.TRIPS_YELLOW_2018_11  |  COUN

In [16]:
# CELL 5 — Plan for this batch (YELLOW 2019-02..12)
SERVICES = ["yellow"]
YEARS = [2019]
MONTHS = ["%02d" % month_number for month_number in range(2, 13)]  # 02..12
print("Plan:", SERVICES, YEARS, MONTHS)


Plan: ['yellow'] [2019] ['02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']


In [17]:
# CELL 6 — Run the load for the current plan (SERVICES / YEARS / MONTHS)
for svc in SERVICES:
    for y in YEARS:
        for m in MONTHS:
            try:
                ok = load_month(svc, y, m)
                if not ok:
                    # No parquet for this month; nothing to count.
                    continue
                table = f'{os.getenv("SNOWFLAKE_SCHEMA_RAW")}.TRIPS_{svc.upper()}_{y}_{m}'
                c = sf_count(table)
                print(f"✅ Cargado: {table}  |  COUNT = {c}")
            except Exception as e:
                # Report the failure and continue with the next month.
                print(f"❌ Error en {svc} {y}-{m}:", e)


✅ Cargado: RAW.TRIPS_YELLOW_2019_02
✅ Cargado: RAW.TRIPS_YELLOW_2019_02  |  COUNT = 7049370
✅ Cargado: RAW.TRIPS_YELLOW_2019_03
✅ Cargado: RAW.TRIPS_YELLOW_2019_03  |  COUNT = 7866620
✅ Cargado: RAW.TRIPS_YELLOW_2019_04
✅ Cargado: RAW.TRIPS_YELLOW_2019_04  |  COUNT = 7475949
✅ Cargado: RAW.TRIPS_YELLOW_2019_05
✅ Cargado: RAW.TRIPS_YELLOW_2019_05  |  COUNT = 7598445
✅ Cargado: RAW.TRIPS_YELLOW_2019_06
✅ Cargado: RAW.TRIPS_YELLOW_2019_06  |  COUNT = 6971560
✅ Cargado: RAW.TRIPS_YELLOW_2019_07
✅ Cargado: RAW.TRIPS_YELLOW_2019_07  |  COUNT = 6310419
✅ Cargado: RAW.TRIPS_YELLOW_2019_08
✅ Cargado: RAW.TRIPS_YELLOW_2019_08  |  COUNT = 6073357
✅ Cargado: RAW.TRIPS_YELLOW_2019_09
✅ Cargado: RAW.TRIPS_YELLOW_2019_09  |  COUNT = 6567788
✅ Cargado: RAW.TRIPS_YELLOW_2019_10
✅ Cargado: RAW.TRIPS_YELLOW_2019_10  |  COUNT = 7213891
✅ Cargado: RAW.TRIPS_YELLOW_2019_11
✅ Cargado: RAW.TRIPS_YELLOW_2019_11  |  COUNT = 6878111
✅ Cargado: RAW.TRIPS_YELLOW_2019_12
✅ Cargado: RAW.TRIPS_YELLOW_2019_12  |  COUN

In [19]:
# CELL 5 — YELLOW resume plan: 2023 (from month 06) through 2025 (01–08 only)
#
# NOTE(review): this cell appears to have been edited after the recorded run.
# The stored output lists 2020..2025, but the code below starts at 2023.
# The comments and printout now describe what the code actually does.

SERVICES = ["yellow"]
YEARS = list(range(2023, 2026))  # 2023..2025

MONTHS_2020_2024 = [f"{m:02d}" for m in range(1, 13)]  # 01..12 (window for full years)
MONTHS_2023 = [f"{m:02d}" for m in range(6, 13)]       # 06..12 (resume window for 2023)
MONTHS_2025 = [f"{m:02d}" for m in range(1, 9)]        # 01..08

print("Plan YELLOW:")
print("  Años:", YEARS)
print("  Meses 2020–2024:", MONTHS_2020_2024)
# Previously the 2023 window was defined but never shown in the plan.
print("  Meses 2023:", MONTHS_2023)
print("  Meses 2025:", MONTHS_2025)


Plan YELLOW:
  Años: [2020, 2021, 2022, 2023, 2024, 2025]
  Meses 2020–2024: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  Meses 2025: ['01', '02', '03', '04', '05', '06', '07', '08']


In [None]:
# CELL 6 — Run the YELLOW load for the planned YEARS, honouring the
# per-year month windows defined in the plan cell:
#   2025 -> MONTHS_2025, 2023 -> MONTHS_2023, any other year -> MONTHS_2020_2024.
# MONTHS_2023 used to be ignored here, so 2023 was always reloaded in full.

for svc in SERVICES:
    for y in YEARS:
        if y == 2025:
            months = MONTHS_2025
        elif y == 2023:
            months = MONTHS_2023
        else:
            months = MONTHS_2020_2024
        for m in months:
            try:
                ok = load_month(svc, y, m)
                if ok:
                    table = f'{os.getenv("SNOWFLAKE_SCHEMA_RAW")}.TRIPS_{svc.upper()}_{y}_{m}'
                    c = sf_count(table)
                    print(f"✅ Cargado: {table}  |  COUNT = {c}")
            except Exception as e:
                # Report and continue so one bad month does not abort the batch.
                print(f"❌ Error en {svc} {y}-{m}:", e)


✅ Cargado: RAW.TRIPS_YELLOW_2020_01
✅ Cargado: RAW.TRIPS_YELLOW_2020_01  |  COUNT = 6405008
✅ Cargado: RAW.TRIPS_YELLOW_2020_02
✅ Cargado: RAW.TRIPS_YELLOW_2020_02  |  COUNT = 6299367
✅ Cargado: RAW.TRIPS_YELLOW_2020_03
✅ Cargado: RAW.TRIPS_YELLOW_2020_03  |  COUNT = 3007687
✅ Cargado: RAW.TRIPS_YELLOW_2020_04
✅ Cargado: RAW.TRIPS_YELLOW_2020_04  |  COUNT = 238073
✅ Cargado: RAW.TRIPS_YELLOW_2020_05
✅ Cargado: RAW.TRIPS_YELLOW_2020_05  |  COUNT = 348415
✅ Cargado: RAW.TRIPS_YELLOW_2020_06
✅ Cargado: RAW.TRIPS_YELLOW_2020_06  |  COUNT = 549797
✅ Cargado: RAW.TRIPS_YELLOW_2020_07
✅ Cargado: RAW.TRIPS_YELLOW_2020_07  |  COUNT = 800412
✅ Cargado: RAW.TRIPS_YELLOW_2020_08
✅ Cargado: RAW.TRIPS_YELLOW_2020_08  |  COUNT = 1007286
✅ Cargado: RAW.TRIPS_YELLOW_2020_09
✅ Cargado: RAW.TRIPS_YELLOW_2020_09  |  COUNT = 1341017
✅ Cargado: RAW.TRIPS_YELLOW_2020_10
✅ Cargado: RAW.TRIPS_YELLOW_2020_10  |  COUNT = 1681132
✅ Cargado: RAW.TRIPS_YELLOW_2020_11
✅ Cargado: RAW.TRIPS_YELLOW_2020_11  |  COUNT = 

In [21]:
# CELL 2.5 — Quick Spark restart and Snowflake ping (read-only, writes nothing)

try:
    spark.stop()
except NameError:
    # No `spark` variable in this kernel yet; nothing to stop.
    pass
except Exception as exc:
    # Best effort: report the failure but still create a fresh session.
    # (A bare `except:` would also have trapped KeyboardInterrupt/SystemExit.)
    print("⚠️ No se pudo detener la sesión Spark previa:", exc)

from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

def sf_ping():
    """Run a trivial context query against Snowflake and display the result.

    Shows the current account, warehouse, role and server timestamp using
    the notebook-wide ``sfOptions``. Returns nothing.
    """
    df = (spark.read.format("snowflake")
          .options(**sfOptions)
          .option("query", "SELECT CURRENT_ACCOUNT() acct, CURRENT_WAREHOUSE() wh, CURRENT_ROLE() role, CURRENT_TIMESTAMP() ts")
          .load())
    df.show(truncate=False)

print("🔄 Spark reiniciado. Ping a Snowflake:")
sf_ping()


🔄 Spark reiniciado. Ping a Snowflake:
+--------+-----+--------+-----------------------+
|ACCT    |WH   |ROLE    |TS                     |
+--------+-----+--------+-----------------------+
|XPC24435|WH_DM|SYSADMIN|2025-10-22 04:32:20.757|
+--------+-----+--------+-----------------------+



In [22]:
# CELL 5 — Dynamic plan (YELLOW 2020→2025, with 2025 limited to 01..08): missing months only

from itertools import product

# Expected universe of (year, month) pairs.
years_all = list(range(2020, 2026))
months_full = [f"{m:02d}" for m in range(1, 13)]
months_2025 = [f"{m:02d}" for m in range(1, 9)]  # 01..08
expected = {(y, m) for y in years_all for m in (months_2025 if y == 2025 else months_full)}

# Look in the schema the loader writes to (SNOWFLAKE_SCHEMA_RAW) rather than
# a hard-coded 'RAW'. Upper-cased because the loader's unquoted identifiers
# are stored upper-case by Snowflake.
raw_schema = os.getenv("SNOWFLAKE_SCHEMA_RAW", "RAW").upper()

# Tables already present: pull year and month out of TRIPS_YELLOW_<YYYY>_<MM>.
q_loaded = (
    "WITH t AS ("
    " SELECT table_name,"
    " TRY_TO_NUMBER(REGEXP_SUBSTR(table_name, 'TRIPS_YELLOW_(\\\\d{4})_', 1, 1, 'e', 1)) AS yr,"
    " TRY_TO_NUMBER(REGEXP_SUBSTR(table_name, 'TRIPS_YELLOW_\\\\d{4}_(\\\\d{2})', 1, 1, 'e', 1)) AS mo"
    " FROM INFORMATION_SCHEMA.TABLES"
    f" WHERE table_schema='{raw_schema}' AND table_name ILIKE 'TRIPS_YELLOW_%'"
    ") SELECT yr, mo FROM t WHERE yr IS NOT NULL AND mo IS NOT NULL"
)
df_loaded = (spark.read.format("snowflake")
             .options(**sfOptions)
             .option("query", q_loaded)
             .load())

# Normalise to the same (int year, 'MM' string) shape used by `expected`.
loaded = {(int(r["YR"]), f"{int(r['MO']):02d}") for r in df_loaded.collect() if r["YR"] is not None and r["MO"] is not None}

missing = sorted(expected - loaded)

# Build the lists consumed by the load loop.
SERVICES = ["yellow"]
YEARS = sorted({y for y, _ in missing})
MONTHS_BY_YEAR = {y: [m for (yy, m) in missing if yy == y] for y in YEARS}

print("📋 Faltantes YELLOW (cuenta):", len(missing))
for y in YEARS:
    print(f"  {y}: {MONTHS_BY_YEAR[y]}")


📋 Faltantes YELLOW (cuenta): 0


In [23]:
# CELL 5 — Dynamic plan (GREEN 2015→2025, with 2025 limited to 01..08): missing months only

# Expected universe of (year, month) pairs.
years_all_g = list(range(2015, 2026))                      # 2015..2025
months_full  = [f"{m:02d}" for m in range(1, 13)]          # 01..12
months_2025g = [f"{m:02d}" for m in range(1, 9)]           # 01..08

expected_g = {(y, m) for y in years_all_g for m in (months_2025g if y == 2025 else months_full)}

# Look in the schema the loader writes to (SNOWFLAKE_SCHEMA_RAW) rather than
# a hard-coded 'RAW'. Upper-cased because the loader's unquoted identifiers
# are stored upper-case by Snowflake.
raw_schema_g = os.getenv("SNOWFLAKE_SCHEMA_RAW", "RAW").upper()

# GREEN tables already present: pull year and month out of TRIPS_GREEN_<YYYY>_<MM>.
q_loaded_g = (
    "WITH t AS ("
    " SELECT table_name,"
    " TRY_TO_NUMBER(REGEXP_SUBSTR(table_name, 'TRIPS_GREEN_(\\\\d{4})_', 1, 1, 'e', 1)) AS yr,"
    " TRY_TO_NUMBER(REGEXP_SUBSTR(table_name, 'TRIPS_GREEN_\\\\d{4}_(\\\\d{2})', 1, 1, 'e', 1)) AS mo"
    " FROM INFORMATION_SCHEMA.TABLES"
    f" WHERE table_schema='{raw_schema_g}' AND table_name ILIKE 'TRIPS_GREEN_%'"
    ") SELECT yr, mo FROM t WHERE yr IS NOT NULL AND mo IS NOT NULL"
)

df_loaded_g = (spark.read.format("snowflake")
               .options(**sfOptions)
               .option("query", q_loaded_g)
               .load())

# Normalise to the same (int year, 'MM' string) shape used by `expected_g`.
loaded_g = {(int(r["YR"]), f"{int(r['MO']):02d}") for r in df_loaded_g.collect() if r["YR"] is not None and r["MO"] is not None}
missing_g = sorted(expected_g - loaded_g)

# Build the lists consumed by the load loop.
SERVICES = ["green"]
YEARS = sorted({y for y, _ in missing_g})
MONTHS_BY_YEAR = {y: [m for (yy, m) in missing_g if yy == y] for y in YEARS}

print("📋 Faltantes GREEN (cuenta):", len(missing_g))
for y in YEARS:
    print(f"  {y}: {MONTHS_BY_YEAR[y]}")


📋 Faltantes GREEN (cuenta): 127
  2015: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2016: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2017: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2018: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2019: ['02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2020: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2021: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2022: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2023: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2024: ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
  2025: ['01', '02', '03', '04', '05', '06', '07', '08']


In [24]:
# Plan: GREEN 2015, all twelve months
SERVICES = ["green"]
YEARS = [2015]
MONTHS = ["{:02d}".format(month_number) for month_number in range(1, 13)]  # 01..12
print("Plan:", SERVICES, YEARS, MONTHS)

Plan: ['green'] [2015] ['01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']


In [25]:
# CELL 6 — Run the load for the current plan (SERVICES / YEARS / MONTHS)

for svc in SERVICES:
    for y in YEARS:
        for m in MONTHS:
            try:
                ok = load_month(svc, y, m)  # lpep_* datetimes are normalised inside
                if not ok:
                    # Parquet not found locally; nothing was written.
                    continue
                table = f'{os.getenv("SNOWFLAKE_SCHEMA_RAW")}.TRIPS_{svc.upper()}_{y}_{m}'
                c = sf_count(table)
                print(f"✅ Cargado: {table}  |  COUNT = {c}")
            except Exception as e:
                # Report the failure and carry on with the next month.
                print(f"❌ Error en {svc} {y}-{m}:", e)


✅ Cargado: RAW.TRIPS_GREEN_2015_01
✅ Cargado: RAW.TRIPS_GREEN_2015_01  |  COUNT = 1508493
✅ Cargado: RAW.TRIPS_GREEN_2015_02
✅ Cargado: RAW.TRIPS_GREEN_2015_02  |  COUNT = 1574830
✅ Cargado: RAW.TRIPS_GREEN_2015_03
✅ Cargado: RAW.TRIPS_GREEN_2015_03  |  COUNT = 1722574
✅ Cargado: RAW.TRIPS_GREEN_2015_04
✅ Cargado: RAW.TRIPS_GREEN_2015_04  |  COUNT = 1664394
✅ Cargado: RAW.TRIPS_GREEN_2015_05
✅ Cargado: RAW.TRIPS_GREEN_2015_05  |  COUNT = 1786848
✅ Cargado: RAW.TRIPS_GREEN_2015_06
✅ Cargado: RAW.TRIPS_GREEN_2015_06  |  COUNT = 1638868
✅ Cargado: RAW.TRIPS_GREEN_2015_07
✅ Cargado: RAW.TRIPS_GREEN_2015_07  |  COUNT = 1541671
✅ Cargado: RAW.TRIPS_GREEN_2015_08
✅ Cargado: RAW.TRIPS_GREEN_2015_08  |  COUNT = 1532343
✅ Cargado: RAW.TRIPS_GREEN_2015_09
✅ Cargado: RAW.TRIPS_GREEN_2015_09  |  COUNT = 1494927
✅ Cargado: RAW.TRIPS_GREEN_2015_10
✅ Cargado: RAW.TRIPS_GREEN_2015_10  |  COUNT = 1630536
✅ Cargado: RAW.TRIPS_GREEN_2015_11
✅ Cargado: RAW.TRIPS_GREEN_2015_11  |  COUNT = 1529984
✅ Cargado:

In [26]:
# FINAL CELL — Load every remaining GREEN month (2016 → 2025-08)

# Month windows per year: twelve months for 2016..2024, except 2019, which
# starts at 02, plus 2025 limited to 01..08. Re-assigning 2019 keeps its
# position, so iteration order stays chronological.
years_green = {year: [f"{m:02d}" for m in range(1, 13)] for year in range(2016, 2025)}
years_green[2019] = [f"{m:02d}" for m in range(2, 13)]
years_green[2025] = [f"{m:02d}" for m in range(1, 9)]

# One "<year>-<month>: <error>" entry per month that raised.
errores = []
for y, months in years_green.items():
    for m in months:
        try:
            ok = load_month("green", y, m)
            if not ok:
                # Source parquet missing; load_month already printed a warning.
                continue
            table = f'{os.getenv("SNOWFLAKE_SCHEMA_RAW")}.TRIPS_GREEN_{y}_{m}'
            c = sf_count(table)
            print(f"✅ Cargado: {table}  |  COUNT = {c}")
        except Exception as e:
            # Record the failure and continue with the next month.
            msg = f"{y}-{m}: {e}"
            errores.append(msg)
            print(f"❌ Error en GREEN {msg}")

# Final summary of the months that failed.
print("\nResumen:")
print("  Meses con error:", len(errores))
for e in errores:
    print("   -", e)


✅ Cargado: RAW.TRIPS_GREEN_2016_01
✅ Cargado: RAW.TRIPS_GREEN_2016_01  |  COUNT = 1445292
✅ Cargado: RAW.TRIPS_GREEN_2016_02
✅ Cargado: RAW.TRIPS_GREEN_2016_02  |  COUNT = 1510722
✅ Cargado: RAW.TRIPS_GREEN_2016_03
✅ Cargado: RAW.TRIPS_GREEN_2016_03  |  COUNT = 1576393
✅ Cargado: RAW.TRIPS_GREEN_2016_04
✅ Cargado: RAW.TRIPS_GREEN_2016_04  |  COUNT = 1543926
✅ Cargado: RAW.TRIPS_GREEN_2016_05
✅ Cargado: RAW.TRIPS_GREEN_2016_05  |  COUNT = 1536979
✅ Cargado: RAW.TRIPS_GREEN_2016_06
✅ Cargado: RAW.TRIPS_GREEN_2016_06  |  COUNT = 1404727
✅ Cargado: RAW.TRIPS_GREEN_2016_07
✅ Cargado: RAW.TRIPS_GREEN_2016_07  |  COUNT = 1332510
✅ Cargado: RAW.TRIPS_GREEN_2016_08
✅ Cargado: RAW.TRIPS_GREEN_2016_08  |  COUNT = 1247675
✅ Cargado: RAW.TRIPS_GREEN_2016_09
✅ Cargado: RAW.TRIPS_GREEN_2016_09  |  COUNT = 1162373
✅ Cargado: RAW.TRIPS_GREEN_2016_10
✅ Cargado: RAW.TRIPS_GREEN_2016_10  |  COUNT = 1252572
✅ Cargado: RAW.TRIPS_GREEN_2016_11
✅ Cargado: RAW.TRIPS_GREEN_2016_11  |  COUNT = 1148214
✅ Cargado: