In [2]:
from pyspark.sql import SparkSession

# Location of the PostgreSQL JDBC driver jar handed to Spark.
postgres_jdbc_jar = "/opt/jars/postgresql-42.7.4.jar"

# The same jar is registered both on the driver classpath and in spark.jars.
session_builder = SparkSession.builder.appName("PZ4-IngestaTest")
session_builder = session_builder.config("spark.driver.extraClassPath", postgres_jdbc_jar)
session_builder = session_builder.config("spark.jars", postgres_jdbc_jar)
spark = session_builder.getOrCreate()

# Last expression of the cell: display the session.
spark


In [2]:
import os

from pyspark.sql import functions as F

# JDBC connection settings for Postgres.
# They are read from the same PG_* environment variables that build_obt_month
# uses, falling back to the values that used to be hard-coded here, so the
# credentials can be supplied without editing the notebook.
pg_host = os.getenv("PG_HOST", "postgres")
pg_port = os.getenv("PG_PORT", "5432")
pg_db = os.getenv("PG_DB", "nyctaxi")
jdbc_url = f"jdbc:postgresql://{pg_host}:{pg_port}/{pg_db}"

db_user = os.getenv("PG_USER", "pset")
db_password = os.getenv("PG_PASSWORD", "pset_password")

connection_props = {
    "user": db_user,
    "password": db_password,
    "driver": "org.postgresql.Driver",
    "fetchsize": "10000",
}

# Pull ALL of yellow 2019-01 (no LIMIT), dropping trips whose dropoff is
# earlier than their pickup (negative duration).
yellow_2019_01_query = """
    (SELECT *
     FROM raw.yellow_taxi_trip
     WHERE service_type = 'yellow'
       AND source_year = 2019
       AND source_month = 1
       AND dropoff_datetime >= pickup_datetime) AS t
"""

yellow_2019_01_df = (
    spark.read
         .jdbc(url=jdbc_url, table=yellow_2019_01_query, properties=connection_props)
)

# Only print the schema; no show() or count(), to avoid triggering heavy actions
yellow_2019_01_df.printSchema()
print("DataFrame yellow_2019_01_df listo")


root
 |-- VendorID: long (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: integer (nullable = true)
 |-- airport_fee: integer (nullable = true)
 |-- service_type: string (nullable = true)
 |-- source_year: integer (nullable = true)
 |-- source_month: integer (nullable = true)
 |-- ingested_at_u

In [6]:
from pyspark.sql import functions as F

# 1) Read the zone lookup from the RAW schema in Postgres
zones_df = spark.read.jdbc(
    url=jdbc_url,
    table="raw.taxi_zone_lookup",
    properties=connection_props,
)


def _zones_with_prefix(lookup_df, prefix):
    """Project the zone lookup as <prefix>_location_id, <prefix>_zone, <prefix>_borough."""
    return lookup_df.select(
        F.col("locationid").alias(f"{prefix}_location_id"),
        F.col("zone").alias(f"{prefix}_zone"),
        F.col("borough").alias(f"{prefix}_borough"),
    )


def _code_description(code_column, descriptions):
    """Build a when/otherwise chain mapping integer codes to labels ("Other" as fallback)."""
    case_expr = None
    for code, label in descriptions:
        matches = F.col(code_column) == code
        case_expr = F.when(matches, label) if case_expr is None else case_expr.when(matches, label)
    return case_expr.otherwise("Other")


pu_zones_df = _zones_with_prefix(zones_df, "pu")
do_zones_df = _zones_with_prefix(zones_df, "do")

# 2) Start from the full yellow_2019_01_df DataFrame
base_df = yellow_2019_01_df

vendor_names = [
    (1, "Creative Mobile Technologies"),
    (2, "VeriFone Inc"),
]
rate_code_descriptions = [
    (1, "Standard rate"),
    (2, "JFK"),
    (3, "Newark"),
    (4, "Nassau or Westchester"),
    (5, "Negotiated fare"),
    (6, "Group ride"),
]
payment_type_descriptions = [
    (1, "Credit card"),
    (2, "Cash"),
    (3, "No charge"),
    (4, "Dispute"),
    (5, "Unknown"),
    (6, "Voided trip"),
]

null_double = F.lit(None).cast("double")
trip_seconds = F.unix_timestamp("dropoff_datetime") - F.unix_timestamp("pickup_datetime")
has_distance_and_duration = (F.col("trip_distance") > 0) & (F.col("trip_duration_min") > 0)

# Columns are added one by one, in this order: later expressions refer to
# columns created (or re-cast) by earlier entries.
derived_columns = [
    # time
    ("pickup_hour", F.hour("pickup_datetime").cast("smallint")),
    ("pickup_dow", F.dayofweek("pickup_datetime").cast("smallint")),
    ("month", F.col("source_month").cast("smallint")),
    ("year", F.col("source_year").cast("smallint")),
    # zone keys
    ("pu_location_id", F.col("PULocationID").cast("int")),
    ("do_location_id", F.col("DOLocationID").cast("int")),
    # vendor
    ("vendor_id", F.col("VendorID").cast("int")),
    ("vendor_name", _code_description("VendorID", vendor_names)),
    # rate code
    ("rate_code_id", F.col("RatecodeID").cast("int")),
    ("rate_code_desc", _code_description("RatecodeID", rate_code_descriptions)),
    # payment type + description
    ("payment_type", F.col("payment_type").cast("int")),
    ("payment_type_desc", _code_description("payment_type", payment_type_descriptions)),
    # trip_type (only applies to green)
    ("trip_type", F.lit(None).cast("int")),
    # derived measures
    ("trip_duration_min", trip_seconds / 60.0),
    (
        "avg_speed_mph",
        F.when(
            has_distance_and_duration,
            F.col("trip_distance") / (F.col("trip_duration_min") / 60.0),
        ).otherwise(null_double),
    ),
    (
        "tip_pct",
        F.when(
            F.col("fare_amount") > 0,
            (F.col("tip_amount") / F.col("fare_amount")) * 100.0,
        ).otherwise(null_double),
    ),
]

enriched_df = base_df
for column_name, column_expr in derived_columns:
    enriched_df = enriched_df.withColumn(column_name, column_expr)

# 3) Join with pickup and dropoff zones
enriched_df = enriched_df.join(pu_zones_df, on="pu_location_id", how="left")
enriched_df = enriched_df.join(do_zones_df, on="do_location_id", how="left")

enriched_df.printSchema()
print("enriched_df listo para escribir (yellow 2019-01 completo).")


root
 |-- do_location_id: integer (nullable = true)
 |-- pu_location_id: integer (nullable = true)
 |-- VendorID: long (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: integer (nullable = true)
 |-- airport_fee: integer (nullable = true)
 |-- service_type: string (nullable = true)
 |-- sourc

In [7]:
# Output columns, listed in the order of the analytics.obt_trips DDL
obt_columns = [
    # Time
    "pickup_datetime",
    "dropoff_datetime",
    "pickup_hour",
    "pickup_dow",
    "month",
    "year",
    # Location
    "pu_location_id",
    "pu_zone",
    "pu_borough",
    "do_location_id",
    "do_zone",
    "do_borough",
    # Service / codes
    "service_type",
    "vendor_id",
    "vendor_name",
    "rate_code_id",
    "rate_code_desc",
    "payment_type",
    "payment_type_desc",
    "trip_type",
    # Trip / amounts
    F.col("passenger_count").cast("int").alias("passenger_count"),
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    F.col("congestion_surcharge").cast("double").alias("congestion_surcharge"),
    F.col("airport_fee").cast("double").alias("airport_fee"),
    "total_amount",
    "store_and_fwd_flag",
    # Derived
    "trip_duration_min",
    "avg_speed_mph",
    "tip_pct",
    # Metadata
    "run_id",
    "source_year",
    "source_month",
    "ingested_at_utc",
]
obt_trips_df = enriched_df.select(*obt_columns)

write_props = dict(
    user=db_user,
    password=db_password,
    driver="org.postgresql.Driver",
    batchsize="10000",
)

# Append mode: every run of this cell adds the rows again; nothing is deleted first.
obt_writer = obt_trips_df.write.mode("append")
obt_writer.jdbc(url=jdbc_url, table="analytics.obt_trips", properties=write_props)

print("Escritura COMPLETA: yellow 2019-01 → analytics.obt_trips")


Escritura COMPLETA: yellow 2019-01 → analytics.obt_trips


In [3]:
def build_obt_month(service: str, year: int, month: int) -> int:
    """
    Build the analytics.obt_trips rows for one (service, year, month) partition
    from raw.<service>_taxi_trip.

    Steps, in order:
    1. Delete the rows of that partition already present in analytics.obt_trips,
       so that re-running the same partition does not duplicate rows.
    2. Read the partition from RAW over JDBC, keeping only trips with
       dropoff_datetime >= pickup_datetime.
    3. Left-join raw.taxi_zone_lookup twice (pickup and dropoff zones).
    4. Add derived columns (hour, day of week, duration, speed, tip_pct, ...).
    5. Append the result to analytics.obt_trips over JDBC.
    6. Count the partition's rows in Postgres and return that count.

    The delete (step 1) is committed before the append (step 5); they are not
    one transaction, so a failure in between leaves the partition deleted until
    the function is run again.

    Connection settings come from the PG_HOST, PG_PORT, PG_DB, PG_USER and
    PG_PASSWORD environment variables. The function uses the module-level
    `spark` session.

    Args:
        service: 'yellow' or 'green'.
        year: source_year of the partition.
        month: source_month of the partition (1-12).

    Returns:
        Number of rows in analytics.obt_trips for the partition after the
        write, or 0 when RAW has no rows for it.

    Raises:
        ValueError: if `service` is not 'yellow' or 'green', or `month` is
            outside 1-12.
    """
    # Validate before any SQL is built: `service` is interpolated into a table
    # name and a string literal of the RAW query below, so only the known
    # service names are accepted.
    valid_services = ("yellow", "green")
    if service not in valid_services:
        raise ValueError(f"service must be one of {valid_services}, got {service!r}")
    year = int(year)
    month = int(month)
    if not 1 <= month <= 12:
        raise ValueError(f"month must be between 1 and 12, got {month}")

    import os
    import time
    from contextlib import closing
    from datetime import timedelta

    import psycopg2
    from pyspark.sql import functions as F

    # --- Postgres settings from the environment ---
    PG_HOST = os.getenv("PG_HOST", "postgres")
    PG_PORT = os.getenv("PG_PORT", "5432")
    PG_DB   = os.getenv("PG_DB", "nyctaxi")
    PG_USER = os.getenv("PG_USER", "pset")
    PG_PWD  = os.getenv("PG_PASSWORD", "pset_password")

    jdbc_url = f"jdbc:postgresql://{PG_HOST}:{PG_PORT}/{PG_DB}?reWriteBatchedInserts=true"
    jdbc_props = {
        "driver": "org.postgresql.Driver",
        "user": PG_USER,
        "password": PG_PWD,
        "fetchsize": "10000",
    }

    def _pg_connect():
        """Open a new psycopg2 connection with the settings above."""
        return psycopg2.connect(
            host=PG_HOST,
            port=PG_PORT,
            dbname=PG_DB,
            user=PG_USER,
            password=PG_PWD,
        )

    partition_key = (service, year, month)

    print("")
    print("=" * 80)
    print(f"[OBT] Construyendo analytics.obt_trips para {service.upper()} {year}-{month:02d}")
    t0 = time.time()

    # ------------------------------------------------------------------
    # 1) Delete the partition from analytics.obt_trips first (idempotency)
    # ------------------------------------------------------------------
    # closing() guarantees cursor and connection are released even if the
    # statement or the commit raises.
    with closing(_pg_connect()) as conn, closing(conn.cursor()) as cur:
        cur.execute(
            """
            DELETE FROM analytics.obt_trips
            WHERE service_type = %s
              AND source_year  = %s
              AND source_month = %s;
            """,
            partition_key,
        )
        deleted = cur.rowcount
        conn.commit()
    print(f"[OBT] Borradas filas previas en analytics.obt_trips: {deleted}")

    # ------------------------------------------------------------------
    # 2) Read RAW for this (service, year, month) from raw.<service>_taxi_trip,
    #    filtering out negative durations (dropoff < pickup)
    # ------------------------------------------------------------------
    # Safe to interpolate: `service` was checked against the whitelist and
    # year/month are ints.
    raw_table = f"raw.{service}_taxi_trip"
    raw_query = f"""
        (SELECT *
         FROM {raw_table}
         WHERE service_type = '{service}'
           AND source_year  = {year}
           AND source_month = {month}
           AND dropoff_datetime >= pickup_datetime) AS t
    """

    raw_df = (
        spark.read
             .jdbc(url=jdbc_url, table=raw_query, properties=jdbc_props)
    )

    if raw_df.rdd.isEmpty():
        print(f"[OBT] No hay filas en RAW para {service} {year}-{month:02d}. Nada que hacer.")
        return 0

    # ------------------------------------------------------------------
    # 3) Read the zone lookup
    # ------------------------------------------------------------------
    zones_df = (
        spark.read
             .jdbc(
                 url=jdbc_url,
                 table="raw.taxi_zone_lookup",
                 properties=jdbc_props,
             )
    )

    pu_zones_df = (
        zones_df
        .select(
            F.col("locationid").alias("pu_location_id"),
            F.col("zone").alias("pu_zone"),
            F.col("borough").alias("pu_borough"),
        )
    )

    do_zones_df = (
        zones_df
        .select(
            F.col("locationid").alias("do_location_id"),
            F.col("zone").alias("do_zone"),
            F.col("borough").alias("do_borough"),
        )
    )

    # ------------------------------------------------------------------
    # 4) Enrich with derived columns and basic dimensions
    # ------------------------------------------------------------------
    df = (
        raw_df
        # time
        .withColumn("pickup_hour", F.hour("pickup_datetime").cast("smallint"))
        .withColumn("pickup_dow", F.dayofweek("pickup_datetime").cast("smallint"))
        .withColumn("month", F.col("source_month").cast("smallint"))
        .withColumn("year", F.col("source_year").cast("smallint"))

        # zone keys
        .withColumn("pu_location_id", F.col("PULocationID").cast("int"))
        .withColumn("do_location_id", F.col("DOLocationID").cast("int"))

        # vendor
        .withColumn("vendor_id", F.col("VendorID").cast("int"))
        .withColumn(
            "vendor_name",
            F.when(F.col("VendorID") == 1, "Creative Mobile Technologies")
             .when(F.col("VendorID") == 2, "VeriFone Inc")
             .otherwise("Other"),
        )

        # rate code
        .withColumn("rate_code_id", F.col("RatecodeID").cast("int"))
        .withColumn(
            "rate_code_desc",
            F.when(F.col("RatecodeID") == 1, "Standard rate")
             .when(F.col("RatecodeID") == 2, "JFK")
             .when(F.col("RatecodeID") == 3, "Newark")
             .when(F.col("RatecodeID") == 4, "Nassau or Westchester")
             .when(F.col("RatecodeID") == 5, "Negotiated fare")
             .when(F.col("RatecodeID") == 6, "Group ride")
             .otherwise("Other"),
        )

        # payment type + description
        .withColumn("payment_type", F.col("payment_type").cast("int"))
        .withColumn(
            "payment_type_desc",
            F.when(F.col("payment_type") == 1, "Credit card")
             .when(F.col("payment_type") == 2, "Cash")
             .when(F.col("payment_type") == 3, "No charge")
             .when(F.col("payment_type") == 4, "Dispute")
             .when(F.col("payment_type") == 5, "Unknown")
             .when(F.col("payment_type") == 6, "Voided trip")
             .otherwise("Other"),
        )

        # derived measures
        .withColumn(
            "trip_duration_min",
            (F.unix_timestamp("dropoff_datetime") - F.unix_timestamp("pickup_datetime")) / 60.0,
        )
        .withColumn(
            "avg_speed_mph",
            # NULL instead of a division by zero / meaningless speed
            F.when(
                (F.col("trip_distance") > 0) & (F.col("trip_duration_min") > 0),
                F.col("trip_distance") / (F.col("trip_duration_min") / 60.0),
            ).otherwise(F.lit(None).cast("double")),
        )
        .withColumn(
            "tip_pct",
            F.when(F.col("fare_amount") > 0,
                   (F.col("tip_amount") / F.col("fare_amount")) * 100.0
            ).otherwise(F.lit(None).cast("double")),
        )
    )

    # trip_type: only present for green; when missing, fill it with NULL
    if "trip_type" in df.columns:
        df = df.withColumn("trip_type", F.col("trip_type").cast("int"))
    else:
        df = df.withColumn("trip_type", F.lit(None).cast("int"))

    # Join with pickup and dropoff zones
    df = (
        df
        .join(pu_zones_df, on="pu_location_id", how="left")
        .join(do_zones_df, on="do_location_id", how="left")
    )

    # airport_fee: only present for yellow; for green it is filled with NULL
    if "airport_fee" not in df.columns:
        df = df.withColumn("airport_fee", F.lit(None).cast("double"))

    # ------------------------------------------------------------------
    # 5) Select the final columns in the order of the analytics.obt_trips DDL
    # ------------------------------------------------------------------
    obt_df = (
        df.select(
            # Time
            "pickup_datetime",
            "dropoff_datetime",
            "pickup_hour",
            "pickup_dow",
            "month",
            "year",

            # Location
            "pu_location_id",
            "pu_zone",
            "pu_borough",
            "do_location_id",
            "do_zone",
            "do_borough",

            # Service / codes
            "service_type",
            "vendor_id",
            "vendor_name",
            "rate_code_id",
            "rate_code_desc",
            "payment_type",
            "payment_type_desc",
            "trip_type",

            # Trip / amounts
            F.col("passenger_count").cast("int").alias("passenger_count"),
            "trip_distance",
            "fare_amount",
            "extra",
            "mta_tax",
            "tip_amount",
            "tolls_amount",
            "improvement_surcharge",
            F.col("congestion_surcharge").cast("double").alias("congestion_surcharge"),
            F.col("airport_fee").cast("double").alias("airport_fee"),
            "total_amount",
            "store_and_fwd_flag",

            # Derived
            "trip_duration_min",
            "avg_speed_mph",
            "tip_pct",

            # Metadata
            "run_id",
            "source_year",
            "source_month",
            "ingested_at_utc",
        )
    )

    # ------------------------------------------------------------------
    # 6) Write to analytics.obt_trips over JDBC (no count() beforehand)
    # ------------------------------------------------------------------
    write_props = {
        "user": PG_USER,
        "password": PG_PWD,
        "driver": "org.postgresql.Driver",
        "batchsize": "10000",
    }

    (
        obt_df
        .write
        .mode("append")
        .jdbc(
            url=jdbc_url,
            table="analytics.obt_trips",
            properties=write_props,
        )
    )

    # The row count is taken in Postgres after the write instead of calling
    # count() on the Spark DataFrame.
    with closing(_pg_connect()) as conn, closing(conn.cursor()) as cur:
        cur.execute(
            """
            SELECT COUNT(*)
            FROM analytics.obt_trips
            WHERE service_type = %s
              AND source_year  = %s
              AND source_month = %s;
            """,
            partition_key,
        )
        cnt = cur.fetchone()[0]

    elapsed = time.time() - t0
    print(f"[OBT] {service.upper()} {year}-{month:02d} COMPLETADO en {timedelta(seconds=int(elapsed))}")
    print(f"[OBT] Filas en analytics.obt_trips para {service} {year}-{month:02d}: {cnt}")

    return cnt


In [17]:
# Rebuild the yellow 2019-01 partition; the cell's output is the returned row count.
build_obt_month("yellow", 2019, 1)



[OBT] Construyendo analytics.obt_trips para YELLOW 2019-01
[OBT] Borradas filas previas en analytics.obt_trips: 7696613
[OBT] YELLOW 2019-01 COMPLETADO en 0:05:39
[OBT] Filas en analytics.obt_trips para yellow 2019-01: 7696613


7696613

In [16]:
# Build the green 2019-01 partition; the cell's output is the returned row count.
build_obt_month("green", 2019, 1)



[OBT] Construyendo analytics.obt_trips para GREEN 2019-01
[OBT] Borradas filas previas en analytics.obt_trips: 0
[OBT] GREEN 2019-01 COMPLETADO en 0:00:55
[OBT] Filas en analytics.obt_trips para green 2019-01: 672105


672105

In [4]:
from datetime import timedelta
import time

services = ["yellow", "green"]
year = 2019
months = range(4, 13)

resumen = []
start_global = time.time()

print(f"=== Construyendo analytics.obt_trips para año {year} (yellow + green) ===")

# Month-major order: for every month, each service in `services` order.
for m, svc in [(m, svc) for m in months for svc in services]:
    t0 = time.time()
    cnt = build_obt_month(svc, year, m)
    took = timedelta(seconds=int(time.time() - t0))
    resumen.append((svc, year, m, cnt, took))
    print(f"[RESUMEN PARCIAL] {svc} {year}-{m:02d}: {cnt} filas en {took}")
    print("-" * 60)

print("\n=== RESUMEN FINAL 2019 ===")
for row in resumen:
    svc, y, m, cnt, elapsed = row
    print(f"{svc} {y}-{m:02d}: {cnt} filas (t={elapsed})")

total_elapsed = timedelta(seconds=int(time.time() - start_global))
print("\nDuración total:", total_elapsed)


=== Construyendo analytics.obt_trips para año 2019 (yellow + green) ===

[OBT] Construyendo analytics.obt_trips para YELLOW 2019-04
[OBT] Borradas filas previas en analytics.obt_trips: 7475940
[OBT] YELLOW 2019-04 COMPLETADO en 0:10:31
[OBT] Filas en analytics.obt_trips para yellow 2019-04: 7475940
[RESUMEN PARCIAL] yellow 2019-04: 7475940 filas en 0:10:31
------------------------------------------------------------

[OBT] Construyendo analytics.obt_trips para GREEN 2019-04
[OBT] Borradas filas previas en analytics.obt_trips: 567851
[OBT] GREEN 2019-04 COMPLETADO en 0:01:31
[OBT] Filas en analytics.obt_trips para green 2019-04: 567851
[RESUMEN PARCIAL] green 2019-04: 567851 filas en 0:01:31
------------------------------------------------------------

[OBT] Construyendo analytics.obt_trips para YELLOW 2019-05
[OBT] Borradas filas previas en analytics.obt_trips: 7598435
[OBT] YELLOW 2019-05 COMPLETADO en 0:08:07
[OBT] Filas en analytics.obt_trips para yellow 2019-05: 7598435
[RESUMEN PA

In [None]:
years = list(range(2016, 2026))
months = list(range(1, 13))

# Year-major, then service, then month: the same order as three nested loops.
all_partitions = [
    (y, svc, m)
    for y in years
    for svc in ["yellow", "green"]
    for m in months
]
for y, svc, m in all_partitions:
    build_obt_month(svc, y, m)


In [None]:
import time
from datetime import timedelta

# Years to process (2019 is EXCLUDED because it is already complete).
# NOTE(review): 2016 is also absent from this list — confirm it is loaded separately.
years = [2015, 2017, 2018, 2020, 2021, 2022, 2023, 2024, 2025]
months = list(range(1, 13))
services = ["yellow", "green"]

# Maximum number of attempts per partition (read by run_with_retries)
MAX_RETRIES = 4

def run_with_retries(service, year, month, max_retries=None, backoff_seconds=10):
    """
    Run build_obt_month(service, year, month), retrying when it raises.

    Args:
        service, year, month: forwarded unchanged to build_obt_month.
        max_retries: maximum number of attempts. None (the default) uses the
            module-level MAX_RETRIES.
        backoff_seconds: base wait between attempts; after failed attempt k
            the function sleeps k * backoff_seconds. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        The row count returned by build_obt_month, or 0 when every attempt
        failed. A 0 can therefore mean either an empty partition or a failed
        one; the printed log tells them apart.

    Raises:
        ValueError: if the effective number of attempts is lower than 1
            (previously the loop was skipped and None was returned silently).
    """
    attempts = MAX_RETRIES if max_retries is None else max_retries
    if attempts < 1:
        raise ValueError(f"max_retries must be >= 1, got {attempts}")

    for attempt in range(1, attempts + 1):
        try:
            print(f"\n--- {service.upper()} {year}-{month:02d} | Intento {attempt}/{attempts} ---")
            cnt = build_obt_month(service, year, month)
            print(f"✔ ÉXITO: {service} {year}-{month:02d} → {cnt} filas")
            return cnt

        except Exception as e:
            # Deliberate best effort: log the error and retry so one bad
            # partition does not stop a long multi-partition run.
            print(f"❌ ERROR en {service} {year}-{month:02d}: {e}")
            if attempt < attempts:
                wait = attempt * backoff_seconds
                print(f"⏳ Reintentando en {wait} segundos...")
                time.sleep(wait)

    # Every attempt raised.
    print(f"💥 FALLÓ después de {attempts} intentos: {service} {year}-{month:02d}")
    return 0


# ==============================
#   FULL MULTI-YEAR LOOP
# ==============================
def _progress_bar(completed, total, width=40):
    """Render a fixed-width text bar showing completed/total."""
    filled_cells = int(width * completed / total)
    return "█" * filled_cells + "-" * (width - filled_cells)


start_global = time.time()
results = []

total_jobs = len(years) * len(months) * len(services)
done = 0

print("\n==============================")
print("   INICIANDO PROCESO OBT")
print("==============================")
print(f"Años a cargar: {years}")
print(f"Total particiones: {total_jobs}")
print("==============================\n")


for year in years:
    print("\n\n======================================")
    print(f"   PROCESANDO AÑO {year}")
    print("======================================\n")

    # Month-major order within the year: each month, every service.
    for month, svc in [(month, svc) for month in months for svc in services]:
        t0 = time.time()
        cnt = run_with_retries(svc, year, month)
        elapsed = timedelta(seconds=int(time.time() - t0))

        results.append((svc, year, month, cnt, str(elapsed)))

        done += 1
        # ETA = average wall time per finished partition * partitions left
        seconds_per_job = (time.time() - start_global) / max(done, 1)
        eta = timedelta(seconds=int(seconds_per_job * (total_jobs - done)))

        print(f"\nProgreso global: [{_progress_bar(done, total_jobs)}] {done}/{total_jobs}")
        print(f"Tiempo partición: {elapsed}")
        print(f"ETA restante aprox: {eta}\n")

    # PER-YEAR SUMMARY
    print(f"\n##### RESUMEN {year} #####")
    for row_svc, row_year, row_month, row_cnt, row_time in results:
        if row_year == year:
            print(f"{row_svc} {row_year}-{row_month:02d}: {row_cnt} filas (t={row_time})")
    print("#########################\n")


# =============================
#   FINAL GLOBAL SUMMARY
# =============================
total_time = timedelta(seconds=int(time.time() - start_global))

print("\n\n===================================")
print("     RESUMEN FINAL MULTIANUAL")
print("===================================\n")
for row in results:
    svc, y, m, cnt, tt = row
    print(f"{svc} {y}-{m:02d}: {cnt} filas (t={tt})")

print("\nDuración total:", total_time)
print("===================================\n")



   INICIANDO PROCESO OBT
Años a cargar: [2015, 2017, 2018, 2020, 2021, 2022, 2023, 2024, 2025]
Total particiones: 216



   PROCESANDO AÑO 2015


--- YELLOW 2015-01 | Intento 1/4 ---

[OBT] Construyendo analytics.obt_trips para YELLOW 2015-01
[OBT] Borradas filas previas en analytics.obt_trips: 0
[OBT] YELLOW 2015-01 COMPLETADO en 0:11:30
[OBT] Filas en analytics.obt_trips para yellow 2015-01: 12740740
✔ ÉXITO: yellow 2015-01 → 12740740 filas

Progreso global: [----------------------------------------] 1/216
Tiempo partición: 0:11:30
ETA restante aprox: 1 day, 17:13:56


--- GREEN 2015-01 | Intento 1/4 ---

[OBT] Construyendo analytics.obt_trips para GREEN 2015-01
[OBT] Borradas filas previas en analytics.obt_trips: 0
[OBT] GREEN 2015-01 COMPLETADO en 0:02:06
[OBT] Filas en analytics.obt_trips para green 2015-01: 1508493
✔ ÉXITO: green 2015-01 → 1508493 filas

Progreso global: [----------------------------------------] 2/216
Tiempo partición: 0:02:06
ETA restante aprox: 1 day, 0:17:33

In [None]:
import time
from datetime import timedelta

# Years to process in this run: 2016 only.
years = [ 2016]
months = list(range(1, 13))
services = ["yellow", "green"]

# Maximum number of attempts per partition (read by run_with_retries)
MAX_RETRIES = 4

def run_with_retries(service, year, month, max_retries=None, backoff_seconds=10):
    """
    Run build_obt_month(service, year, month), retrying when it raises.

    Args:
        service, year, month: forwarded unchanged to build_obt_month.
        max_retries: maximum number of attempts. None (the default) uses the
            module-level MAX_RETRIES.
        backoff_seconds: base wait between attempts; after failed attempt k
            the function sleeps k * backoff_seconds. Defaults to 10, the value
            that used to be hard-coded.

    Returns:
        The row count returned by build_obt_month, or 0 when every attempt
        failed. A 0 can therefore mean either an empty partition or a failed
        one; the printed log tells them apart.

    Raises:
        ValueError: if the effective number of attempts is lower than 1
            (previously the loop was skipped and None was returned silently).
    """
    attempts = MAX_RETRIES if max_retries is None else max_retries
    if attempts < 1:
        raise ValueError(f"max_retries must be >= 1, got {attempts}")

    for attempt in range(1, attempts + 1):
        try:
            print(f"\n--- {service.upper()} {year}-{month:02d} | Intento {attempt}/{attempts} ---")
            cnt = build_obt_month(service, year, month)
            print(f"✔ ÉXITO: {service} {year}-{month:02d} → {cnt} filas")
            return cnt

        except Exception as e:
            # Deliberate best effort: log the error and retry so one bad
            # partition does not stop a long multi-partition run.
            print(f"❌ ERROR en {service} {year}-{month:02d}: {e}")
            if attempt < attempts:
                wait = attempt * backoff_seconds
                print(f"⏳ Reintentando en {wait} segundos...")
                time.sleep(wait)

    # Every attempt raised.
    print(f"💥 FALLÓ después de {attempts} intentos: {service} {year}-{month:02d}")
    return 0


# ==============================
#   FULL MULTI-YEAR LOOP
# ==============================
def _progress_bar(completed, total, width=40):
    """Render a fixed-width text bar showing completed/total."""
    filled_cells = int(width * completed / total)
    return "█" * filled_cells + "-" * (width - filled_cells)


start_global = time.time()
results = []

total_jobs = len(years) * len(months) * len(services)
done = 0

print("\n==============================")
print("   INICIANDO PROCESO OBT")
print("==============================")
print(f"Años a cargar: {years}")
print(f"Total particiones: {total_jobs}")
print("==============================\n")


for year in years:
    print("\n\n======================================")
    print(f"   PROCESANDO AÑO {year}")
    print("======================================\n")

    # Month-major order within the year: each month, every service.
    for month, svc in [(month, svc) for month in months for svc in services]:
        t0 = time.time()
        cnt = run_with_retries(svc, year, month)
        elapsed = timedelta(seconds=int(time.time() - t0))

        results.append((svc, year, month, cnt, str(elapsed)))

        done += 1
        # ETA = average wall time per finished partition * partitions left
        seconds_per_job = (time.time() - start_global) / max(done, 1)
        eta = timedelta(seconds=int(seconds_per_job * (total_jobs - done)))

        print(f"\nProgreso global: [{_progress_bar(done, total_jobs)}] {done}/{total_jobs}")
        print(f"Tiempo partición: {elapsed}")
        print(f"ETA restante aprox: {eta}\n")

    # PER-YEAR SUMMARY
    print(f"\n##### RESUMEN {year} #####")
    for row_svc, row_year, row_month, row_cnt, row_time in results:
        if row_year == year:
            print(f"{row_svc} {row_year}-{row_month:02d}: {row_cnt} filas (t={row_time})")
    print("#########################\n")


# =============================
#   FINAL GLOBAL SUMMARY
# =============================
total_time = timedelta(seconds=int(time.time() - start_global))

print("\n\n===================================")
print("     RESUMEN FINAL MULTIANUAL")
print("===================================\n")
for row in results:
    svc, y, m, cnt, tt = row
    print(f"{svc} {y}-{m:02d}: {cnt} filas (t={tt})")

print("\nDuración total:", total_time)
print("===================================\n")



   INICIANDO PROCESO OBT
Años a cargar: [2016]
Total particiones: 24



   PROCESANDO AÑO 2016


--- YELLOW 2016-01 | Intento 1/4 ---

[OBT] Construyendo analytics.obt_trips para YELLOW 2016-01
[OBT] Borradas filas previas en analytics.obt_trips: 10905036
[OBT] YELLOW 2016-01 COMPLETADO en 0:21:42
[OBT] Filas en analytics.obt_trips para yellow 2016-01: 10905036
✔ ÉXITO: yellow 2016-01 → 10905036 filas

Progreso global: [█---------------------------------------] 1/24
Tiempo partición: 0:21:42
ETA restante aprox: 8:19:26


--- GREEN 2016-01 | Intento 1/4 ---

[OBT] Construyendo analytics.obt_trips para GREEN 2016-01
[OBT] Borradas filas previas en analytics.obt_trips: 0
[OBT] GREEN 2016-01 COMPLETADO en 0:06:54
[OBT] Filas en analytics.obt_trips para green 2016-01: 1445292
✔ ÉXITO: green 2016-01 → 1445292 filas

Progreso global: [███-------------------------------------] 2/24
Tiempo partición: 0:06:54
ETA restante aprox: 5:14:50


--- YELLOW 2016-02 | Intento 1/4 ---

[OBT] Construyendo