# In [0]:

from pyspark.sql import functions as F

# Fully qualified names of the silver-layer trip tables, keyed by the short
# service identifier used throughout this script.
SILVER = {
    "yellow": "workspace.nyc_taxi.yellow_trips_silver",
    "green": "workspace.nyc_taxi.green_trips_silver",
    "fhv": "workspace.nyc_taxi.fhv_trips_silver",
    "fhvhv": "workspace.nyc_taxi.fhvhv_trips_silver",
}

# Fully qualified names of the gold-layer tables written by this script.
GOLD = {
    # Yellow trips restricted to 2023-01 .. 2023-05.
    "yellow_2023_jan_may": "workspace.nyc_taxi.yellow_taxi_2023_jan_may_gold",
    # May/2023 trips unified across all silver sources.
    "may_unified": "workspace.nyc_taxi.may_2023_gold",
}

# ======================================================
# GOLD 1 — Yellow (2023-01 through 2023-05 only, requested columns)
# ======================================================
df_yellow_silver = spark.table(SILVER["yellow"])

# Keep the rows whose "anomes" lies between 202301 and 202305 (both bounds
# inclusive) and project only the columns published in this gold table.
# NOTE(review): the bounds are yyyymm strings, so the range check presumably
# relies on "anomes" being a STRING in that format — confirm against silver.
gold1 = df_yellow_silver.where(
    F.col("anomes").between("202301", "202305")
).select(
    "vendorid",
    "passenger_count",
    "total_amount",
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "anomes",
)

# Report how many rows are about to be published (count() runs a Spark job).
print(f"📥 Gold 1 (Yellow 2023/01–2023/05) – linhas: {gold1.count():,}")

# Replace the gold table with one overwrite instead of DROP TABLE + write.
# Dropping first leaves a window in which the table does not exist, leaves
# nothing behind if the write then fails, and throws away the table's Delta
# history. "overwriteSchema" lets the overwrite also replace the schema and
# partitioning of an existing table — presumably the reason the table used to
# be dropped beforehand.
(
    gold1
    # Bring rows with the same "anomes" together before the partitioned write.
    .repartition("anomes")
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("anomes")
    .saveAsTable(GOLD["yellow_2023_jan_may"])
)
print(f"✅ Gravado: {GOLD['yellow_2023_jan_may']}")

# ======================================================
# GOLD 2 — Unified May/2023 table (minimum required columns)
# Final schema:
#   pickup_datetime TIMESTAMP
#   passenger_count DOUBLE (nullable)
#   anomes STRING
# ======================================================

# "anomes" value that selects the May/2023 rows in every silver table.
_GOLD2_ANOMES = "202305"


def _gold2_slice(source: str, pickup_col: str, *, has_passenger_count: bool):
    """Project one silver table's May/2023 rows onto the Gold 2 columns.

    Args:
        source: Key into SILVER naming the table to read.
        pickup_col: Name of that table's pickup column; it is exposed as
            "pickup_datetime".
        has_passenger_count: True to carry the table's "passenger_count"
            column; False to publish a NULL in its place. Either way the
            resulting column is cast to double.

    Returns:
        A DataFrame with the columns pickup_datetime, passenger_count and
        anomes, in that order.
    """
    if has_passenger_count:
        passenger_count = F.col("passenger_count")
    else:
        # No such column in this source: publish NULL so every slice exposes
        # the same three columns.
        passenger_count = F.lit(None)

    # NOTE(review): the pickup column is passed through without a cast, so the
    # TIMESTAMP type in the schema above depends on the silver tables — confirm.
    return (
        spark.table(SILVER[source])
        .filter(F.col("anomes") == _GOLD2_ANOMES)
        .select(
            F.col(pickup_col).alias("pickup_datetime"),
            passenger_count.cast("double").alias("passenger_count"),
            F.col("anomes"),
        )
    )


# YELLOW and GREEN keep their passenger_count.
g_yellow = _gold2_slice("yellow", "tpep_pickup_datetime", has_passenger_count=True)
g_green = _gold2_slice("green", "lpep_pickup_datetime", has_passenger_count=True)

# FHV and FHVHV have no passenger_count, so it is published as NULL.
g_fhv = _gold2_slice("fhv", "pickup_datetime", has_passenger_count=False)
g_fhvhv = _gold2_slice("fhvhv", "pickup_datetime", has_passenger_count=False)

# Minimal union: stack the four May/2023 slices one at a time, matching
# columns by name (a column missing from one side would be filled with NULL).
gold2 = g_yellow.unionByName(g_green, allowMissingColumns=True)
gold2 = gold2.unionByName(g_fhv, allowMissingColumns=True)
gold2 = gold2.unionByName(g_fhvhv, allowMissingColumns=True)

# Report how many rows are about to be published (count() runs a Spark job).
print(f"📥 Gold 2 (May/2023) – linhas: {gold2.count():,}")

# Replace the gold table with one overwrite instead of DROP TABLE + write.
# Dropping first leaves a window in which the table does not exist, leaves
# nothing behind if the write then fails, and throws away the table's Delta
# history. "overwriteSchema" lets the overwrite also replace the schema and
# partitioning of an existing table — presumably the reason the table used to
# be dropped beforehand.
(
    gold2
    # NOTE(review): the Gold 2 sources are all filtered to anomes == "202305",
    # so repartitioning by "anomes" should place every row in one partition
    # and the write then runs through a single task — revisit if it is slow.
    .repartition("anomes")
    .write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .partitionBy("anomes")
    .saveAsTable(GOLD["may_unified"])
)
print(f"✅ Gravado: {GOLD['may_unified']}")