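- Conversões: `ts_coleta` ➜ timestamp; `cod_linha` ➜ int; `lat/lng` arredondados (6 casas decimais) nas novas colunas `latitude/longitude`; descarte de registros com `prefixo` nulo e filtro pelos limites geográficos de SP (lat: [-24.1, -23.3], lng: [-47.1, -46.2]).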
- **Deduplicação por veículo**: janela `partitionBy(prefixo) orderBy(ts_coleta desc)` e manter `rn=1` (posição mais recente).
- **Objetivo**: base limpa e estável para *joins* e KPIs.

In [0]:
from pyspark.sql.functions import col, to_timestamp, round, row_number
from pyspark.sql.window import Window

# Source: raw vehicle-position records from the bronze layer.
bronze = spark.table("workspace.sptrans.bronze_posicao")

# Bounding box used to discard positions outside the city area.
LAT_MIN, LAT_MAX = -24.1, -23.3
LNG_MIN, LNG_MAX = -47.1, -46.2

# Row-level predicates, named so the final filter reads top-down.
# Both range checks use the raw `lat`/`lng` columns, not the rounded copies.
has_prefixo = col("prefixo").isNotNull()
lat_in_box = col("lat").between(LAT_MIN, LAT_MAX)
lng_in_box = col("lng").between(LNG_MIN, LNG_MAX)

# Type conversions and derived columns.
# NOTE: `round` is pyspark.sql.functions.round (imported above), not the builtin.
typed = bronze.withColumn("ts_coleta", to_timestamp(col("ts_coleta")))
typed = typed.withColumn("latitude", round(col("lat"), 6))
typed = typed.withColumn("longitude", round(col("lng"), 6))
typed = typed.withColumn("cod_linha", col("cod_linha").cast("int"))

# Keep only rows that identify a vehicle and fall inside the bounding box.
cleaned = typed.filter(has_prefixo).filter(lat_in_box & lng_in_box)

# Deduplicate: keep only the most recent row per vehicle (`prefixo`).
#
# Rows are ranked newest-first by collection time (`ts_coleta`). `ta_utc` is a
# secondary sort key so that rows of the same vehicle sharing one `ts_coleta`
# are not ranked arbitrarily by row_number(); without it the surviving row
# could differ between runs.
# Nulls are sent last explicitly so a row with a missing timestamp is never
# preferred over a row that has one.
# NOTE(review): assumes `ta_utc` sorts chronologically (a timestamp, or a
# fixed-format ISO-8601 UTC string as shown in the sample output) — confirm
# against the bronze schema.
w = Window.partitionBy("prefixo").orderBy(
    col("ts_coleta").desc_nulls_last(),
    col("ta_utc").desc_nulls_last(),
)

silver_posicao = (
    cleaned
    .withColumn("rn", row_number().over(w))  # rn == 1 is the newest row per vehicle
    .where(col("rn") == 1)
    .drop("rn")  # helper column is not part of the silver schema
)

# Persist the result as a Delta table in overwrite mode, with
# `overwriteSchema` enabled on the writer.
(
    silver_posicao.write
    .format("delta")
    .option("overwriteSchema", "true")
    .mode("overwrite")
    .saveAsTable("workspace.sptrans.silver_posicao")
)

print("✅ silver_posicao criada com sucesso!")

# Show a small sample of the DataFrame that was just written.
preview = silver_posicao.limit(10)
display(preview)

✅ silver_posicao criada com sucesso!


acessivel,cod_linha,destino,hr_api,lat,letreiro,lng,origem,prefixo,sentido,ta_utc,ts_coleta,latitude,longitude
False,35077,SÃO MATEUS,20:38,-23.6382095,4027-41,-46.437941,MORRO DO CRUZEIRO,3117,2,2025-11-09T23:38:17Z,2025-11-09T23:38:19.785Z,-23.63821,-46.437941
True,34026,TERM. PIRITUBA,20:38,-23.4754595,8008-10,-46.75646,HAB. TURÍSTICA,10202,2,2025-11-09T23:37:53Z,2025-11-09T23:38:19.787Z,-23.47546,-46.75646
True,34026,TERM. PIRITUBA,20:38,-23.4686685,8008-10,-46.753776,HAB. TURÍSTICA,10203,2,2025-11-09T23:37:45Z,2025-11-09T23:38:19.787Z,-23.468669,-46.753776
True,34191,METRÔ BARRA FUNDA,20:38,-23.4694535,948A-10,-46.7135935,VL. ZATT,10207,2,2025-11-09T23:37:46Z,2025-11-09T23:38:19.787Z,-23.469454,-46.713594
True,34191,METRÔ BARRA FUNDA,20:38,-23.4803435,948A-10,-46.713063,VL. ZATT,10209,2,2025-11-09T23:37:36Z,2025-11-09T23:38:19.787Z,-23.480344,-46.713063
True,33233,VL. OLÍMPIA,20:38,-23.48044,958P-10,-46.7584765,JD. NARDINI,10212,2,2025-11-09T23:37:45Z,2025-11-09T23:38:19.789Z,-23.48044,-46.758477
True,33233,VL. OLÍMPIA,20:38,-23.520723250000003,958P-10,-46.74084775,JD. NARDINI,10214,2,2025-11-09T23:37:59Z,2025-11-09T23:38:19.789Z,-23.520723,-46.740848
True,465,VL. OLÍMPIA,20:38,-23.5289875,958P-10,-46.7390515,JD. NARDINI,10215,1,2025-11-09T23:38:11Z,2025-11-09T23:38:19.788Z,-23.528988,-46.739052
True,1260,TERM. PIRITUBA,20:38,-23.452141,8009-31,-46.7186905,CID. D'ABRIL 3ª GLEBA,10226,1,2025-11-09T23:37:54Z,2025-11-09T23:38:19.789Z,-23.452141,-46.718691
True,34191,METRÔ BARRA FUNDA,20:38,-23.5261985,948A-10,-46.6684055,VL. ZATT,10228,2,2025-11-09T23:38:09Z,2025-11-09T23:38:19.787Z,-23.526199,-46.668406


In [0]:
%sql
-- Total number of rows currently stored in the silver table.
SELECT COUNT(*) AS total_registros
FROM workspace.sptrans.silver_posicao;

total_registros
4729


In [0]:
%sql
-- Number of distinct vehicle prefixes present in bronze.
-- COUNT(DISTINCT ...) does not count NULL prefixo values.
SELECT COUNT(DISTINCT prefixo) AS qtde_bronze
FROM workspace.sptrans.bronze_posicao;

In [0]:
%sql
-- Row count of silver, for comparison with the distinct-prefix count of bronze.
-- NOTE(review): the two only match if silver holds one row per prefixo and no
-- vehicle was dropped by the cleaning filters — confirm before relying on it.
SELECT COUNT(*) AS qtde_silver
FROM workspace.sptrans.silver_posicao;

In [0]:
%sql
-- Vehicle prefixes that exist in bronze but have no row in silver.
-- NOTE(review): NOT IN follows three-valued logic: bronze rows with a NULL
-- prefixo are never returned, and if the subquery ever yields a NULL prefixo
-- the whole query returns no rows. Confirm silver cannot contain NULL prefixo.
SELECT DISTINCT prefixo
FROM workspace.sptrans.bronze_posicao
WHERE prefixo NOT IN (
    SELECT prefixo FROM workspace.sptrans.silver_posicao
);