In [0]:
# Load every raw 2023 parquet file in one read and show the total row count.
df = spark.read.parquet("/Volumes/workspace/nyc_taxi/raw/2023/*.parquet")
display(df.count())

In [0]:
from pyspark.sql import functions as F
from functools import reduce

# Directory that holds the raw 2023 parquet files.
base_dir = "/Volumes/workspace/nyc_taxi/raw/2023"

# Target (standardized) schema: lower-case column name -> Spark type name.
# Insertion order defines the column order of the normalized output.
target_schema = dict(
    vendorid="long",
    tpep_pickup_datetime="timestamp",
    tpep_dropoff_datetime="timestamp",
    passenger_count="double",
    trip_distance="double",
    ratecodeid="double",
    store_and_fwd_flag="string",
    pulocationid="long",
    dolocationid="long",
    payment_type="long",
    fare_amount="double",
    extra="double",
    mta_tax="double",
    tip_amount="double",
    tolls_amount="double",
    improvement_surcharge="double",
    total_amount="double",
    congestion_surcharge="double",
    airport_fee="double",
)

target_cols = list(target_schema)

# Keep only the entries directly under base_dir whose path ends in ".parquet".
files = [
    entry.path
    for entry in dbutils.fs.ls(base_dir)
    if entry.path.endswith(".parquet")
]
# Fail early when the directory has no parquet files to process.
assert files, f"Nenhum parquet encontrado em {base_dir}"

def normalize_one(path: str):
    """Read one raw parquet file and project it onto the standardized schema.

    Source columns are matched to ``target_schema`` case-insensitively (so a
    file that spells a column ``Airport_fee`` still feeds ``airport_fee``).
    Every target column is cast to its declared type; a target column that is
    absent from the file is emitted as a typed NULL. An ``anomes`` column is
    derived from the ``_YYYY-MM.parquet`` suffix of the source file path, with
    the dash removed (``YYYYMM``). ``regexp_extract`` yields an empty string
    when the path does not match, so ``anomes`` is empty in that case.

    Args:
        path: Location of a single parquet file.

    Returns:
        A DataFrame with the columns of ``target_schema`` (in definition
        order) followed by ``anomes``.
    """
    raw = spark.read.parquet(path)

    # Lower-cased name -> actual name in this file, for case-insensitive lookup.
    source_by_lower = {name.lower(): name for name in raw.columns}

    # Build the whole projection up front and apply it with a single select:
    # chaining withColumn/withColumnRenamed in a loop adds one projection per
    # call and inflates the query plan.
    projected = []
    for name, dtype in target_schema.items():
        source = source_by_lower.get(name)
        value = F.lit(None) if source is None else F.col(source)
        projected.append(value.cast(dtype).alias(name))

    # _metadata.file_path is read directly from the file-source DataFrame
    # (supported in Unity Catalog).
    anomes = F.regexp_replace(
        F.regexp_extract(F.col("_metadata.file_path"), r".*_(\d{4}-\d{2})\.parquet$", 1),
        "-", ""
    ).alias("anomes")

    return raw.select(*projected, anomes)

# Normalize each file on its own, then stack the results matching columns by name.
dfs = list(map(normalize_one, files))
df_all = reduce(
    lambda acc, nxt: acc.unionByName(nxt, allowMissingColumns=True),
    dfs,
)

# Sanity checks: distinct anomes values present, and the resulting schema.
df_all.select("anomes").distinct().orderBy("anomes").show()
df_all.printSchema()

# Persist as a Delta table partitioned by anomes, overwriting existing data.
df_all.write.format("delta").mode("overwrite").partitionBy("anomes").saveAsTable(
    "workspace.nyc_taxi.yellowtaxi_trips_2023"
)


In [0]:
from pyspark.sql import functions as F

# Read the Bronze table created in the previous step.
df_bronze = spark.table("workspace.nyc_taxi.yellowtaxi_trips_2023")

# Period filter: anomes is compared against string bounds, 202301 through 202305 inclusive.
in_period = (F.col("anomes") >= "202301") & (F.col("anomes") <= "202305")

# Keep only the columns used downstream, restricted to the period above.
df_silver = df_bronze.select(
    "vendorid",
    "passenger_count",
    "total_amount",
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "anomes",
).filter(in_period)

# Write to a new Delta table partitioned by anomes, overwriting existing data.
df_silver.write.format("delta").mode("overwrite").partitionBy("anomes").saveAsTable(
    "workspace.nyc_taxi.yellowtaxi_trips_2023_silver"
)

# Preview a handful of rows.
display(df_silver.limit(10))

In [0]:
%sql
-- Row count per anomes in the Silver table.
SELECT
  anomes,
  COUNT(*) AS qtd_linhas
FROM workspace.nyc_taxi.yellowtaxi_trips_2023_silver
GROUP BY anomes
ORDER BY anomes ASC;


In [0]:
%sql
-- Average total_amount per anomes, rounded to 2 decimal places.
SELECT
  anomes,
  ROUND(AVG(total_amount), 2) AS media_total_amount
FROM workspace.nyc_taxi.yellowtaxi_trips_2023_silver
GROUP BY anomes
ORDER BY anomes ASC;


In [0]:
%sql
-- Average passenger_count per pickup hour for anomes 202305, rounded to 2 decimal places.
SELECT
  HOUR(tpep_pickup_datetime) AS hora_do_dia,
  ROUND(AVG(passenger_count), 2) AS media_passageiros
FROM workspace.nyc_taxi.yellowtaxi_trips_2023_silver
WHERE anomes = '202305'
GROUP BY HOUR(tpep_pickup_datetime)
ORDER BY hora_do_dia ASC;


In [0]:
from pyspark.sql import functions as F
import matplotlib.pyplot as plt

# 1. Aggregate in Spark: average passenger_count per pickup hour for anomes 202305.
hour_col = F.hour("tpep_pickup_datetime").alias("hora_do_dia")
avg_passengers_col = F.round(F.avg("passenger_count"), 2).alias("media_passageiros")

df_maio = (
    spark.table("workspace.nyc_taxi.yellowtaxi_trips_2023_silver")
    .filter(F.col("anomes") == "202305")
    .groupBy(hour_col)
    .agg(avg_passengers_col)
    .orderBy("hora_do_dia")
)

# Bring the aggregated result to pandas for plotting.
pdf = df_maio.toPandas()

# 2. Draw the line chart using the explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(pdf["hora_do_dia"], pdf["media_passageiros"], marker="o")
ax.set_title("Média de Passageiros por Hora - Maio/2023")
ax.set_xlabel("Hora do Dia")
ax.set_ylabel("Média de Passageiros")
ax.grid(True)
ax.set_xticks(range(0, 24))  # one tick per hour, 0 through 23
plt.show()
