In [40]:
from pyspark.sql import SparkSession
from datetime import datetime, timezone

# UTC timestamp (YYYYMMDD_HHMMSS) used as a suffix for the Spark application name.
ts = f"{datetime.now(timezone.utc):%Y%m%d_%H%M%S}"

# Build the session handle used by every later cell; getOrCreate() returns
# the already-active session when there is one.
spark = (
    SparkSession.builder
    .appName(f"AAnalyses-{ts}")
    .getOrCreate()
)

In [41]:
from pyspark.sql.functions import current_timestamp, to_utc_timestamp, col
from pyspark.sql.types import DecimalType

In [42]:
# Load the three Silver-layer parquet datasets.
# Each DataFrame name matches the directory it reads: previously the
# `product_item` and `purchase_extra_info` paths were crossed, so the temp views
# registered from these frames exposed the wrong data under each name.
df_purchase_silver = spark.read.parquet("data_lake/silver/purchase")
df_purchase_extra_info_silver = spark.read.parquet("data_lake/silver/purchase_extra_info")
df_product_item_silver = spark.read.parquet("data_lake/silver/product_item")

In [43]:
# Expose each Silver DataFrame to Spark SQL under a session-scoped view name.
for _silver_df, _view_name in (
    (df_purchase_silver, "purchase_silver"),
    (df_purchase_extra_info_silver, "purchase_extra_info_silver"),
    (df_product_item_silver, "product_item_silver"),
):
    _silver_df.createOrReplaceTempView(_view_name)

In [44]:
# Print the first rows of every registered Silver view (show() defaults to 20 rows).
for _view_name in (
    "purchase_silver",
    "purchase_extra_info_silver",
    "product_item_silver",
):
    spark.sql(f"SELECT * FROM {_view_name}").show()

+--------------------+-----------+--------+------------+----------+------------+-----------+---------------+--------------------+---------------------+----------------+
|transaction_datetime|purchase_id|buyer_id|prod_item_id|order_date|release_date|producer_id|invoiced_status|     line_created_at|bronze_ingestion_date|transaction_date|
+--------------------+-----------+--------+------------+----------+------------+-----------+---------------+--------------------+---------------------+----------------+
| 2023-02-05 10:00:00|         55|  160001|           5|2023-01-20|  2023-01-20|     852852|       Invoiced|2026-01-14 10:40:...| 2026-01-14 10:24:...|      2023-02-05|
| 2023-01-20 22:00:00|         55|   15947|           5|2023-01-20|  2023-01-20|     852852|       Invoiced|2026-01-14 10:40:...| 2026-01-14 10:24:...|      2023-01-20|
| 2023-07-15 09:00:00|         55|  160001|           5|2023-01-20|  2023-03-01|     852852|       Invoiced|2026-01-14 10:40:...| 2026-01-14 10:24:...|    

In [45]:
# Build the Gold "gvm" dataset from the `purchase_silver` temp view.
#
# Steps performed by the query:
#   1. base_purchase: keep only rows whose invoiced_status is 'Invoiced' and
#      rank the rows of each purchase_id by transaction_datetime, newest first.
#   2. base_purchase_time_travel: flag the newest row per purchase_id as
#      is_current and stamp every row with current_timestamp() as snapshot_date.
#   3. Final projection: snapshot_date is set to NULL for rows whose
#      transaction_datetime is later than the snapshot timestamp.
#
# Fix: the original query had a dangling `,base_product_item` entry in the WITH
# list (a CTE name with no `AS (...)` body), which is a SQL parse error. It was
# never referenced, so it has been removed.
df_gvm_gold = spark.sql("""
WITH base_purchase AS (
    SELECT
        a.transaction_datetime,
        a.transaction_date,
        a.purchase_id,
        a.buyer_id,
        a.prod_item_id,
        a.order_date,
        a.release_date,
        a.producer_id,
        ROW_NUMBER() OVER (
            PARTITION BY a.purchase_id
            ORDER BY a.transaction_datetime DESC
        ) AS rn
    FROM purchase_silver a
    WHERE a.invoiced_status = 'Invoiced'
)
,base_purchase_time_travel AS (
    SELECT
        transaction_datetime,
        transaction_date,
        purchase_id,
        buyer_id,
        prod_item_id,
        order_date,
        release_date,
        producer_id,
        (rn = 1) AS is_current,
        current_timestamp() AS snapshot_date
    FROM base_purchase
)
SELECT
    transaction_datetime,
    transaction_date,
    purchase_id,
    buyer_id,
    prod_item_id,
    order_date,
    release_date,
    producer_id,
    is_current,
    CASE WHEN transaction_datetime > snapshot_date THEN NULL ELSE snapshot_date END AS snapshot_date
FROM base_purchase_time_travel
""")

df_gvm_gold.show()

+--------------------+----------------+-----------+--------+------------+----------+------------+-----------+----------+--------------------+
|transaction_datetime|transaction_date|purchase_id|buyer_id|prod_item_id|order_date|release_date|producer_id|is_current|       snapshot_date|
+--------------------+----------------+-----------+--------+------------+----------+------------+-----------+----------+--------------------+
| 2023-07-15 09:00:00|      2023-07-15|         55|  160001|           5|2023-01-20|  2023-03-01|     852852|      true|2026-01-14 22:15:...|
| 2023-02-05 10:00:00|      2023-02-05|         55|  160001|           5|2023-01-20|  2023-01-20|     852852|     false|2026-01-14 22:15:...|
| 2023-01-20 22:00:00|      2023-01-20|         55|   15947|           5|2023-01-20|  2023-01-20|     852852|     false|2026-01-14 22:15:...|
| 2023-02-26 03:00:00|      2023-02-26|         69|  160001|          18|2023-01-26|  2023-02-28|      96967|      true|2026-01-14 22:15:...|
+-----

In [46]:
# =====================================================
# 7. Final write of the Gold "gvm" dataset
# =====================================================
# Persists df_gvm_gold as parquet under data_lake/gold/gvm, with one directory
# level per snapshot_date value and, inside it, one per transaction_date.
# NOTE(review): mode("overwrite") replaces whatever already exists at the target
# path unless dynamic partition overwrite is configured, and snapshot_date is a
# full timestamp - confirm that earlier snapshots are not expected to be kept.
(
    df_gvm_gold.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("snapshot_date", "transaction_date")
    .save("data_lake/gold/gvm")
)