In [2]:
from pyspark.sql import SparkSession
from datetime import datetime, timezone

# UTC timestamp of this run (YYYYmmdd_HHMMSS); embedded in the Spark app name
# so that separate runs can be told apart.
ts = datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")

# getOrCreate() returns the already-active session when there is one,
# otherwise it starts a new session with this app name.
spark = (
    SparkSession.builder
    .appName(f"Gold-GVM-{ts}")
    .getOrCreate()
)

In [3]:
from pyspark.sql.functions import current_timestamp, to_utc_timestamp, col
from pyspark.sql.types import DecimalType

In [4]:
# Load the three bronze-layer tables from the data lake.
# Each DataFrame must be read from the folder that matches its name: the
# temp views registered from these frames are named after the variables, so a
# mismatched path silently serves the wrong table under the wrong view name.
df_purchase_bronze = spark.read.parquet("data_lake/bronze/purchase")
df_purchase_extra_info_bronze = spark.read.parquet("data_lake/bronze/purchase_extra_info")
df_product_item_bronze = spark.read.parquet("data_lake/bronze/product_item")

In [5]:
# Register every bronze DataFrame as a temporary view so it can be queried
# with spark.sql(); an existing view with the same name is replaced.
for view_name, frame in (
    ("purchase_bronze", df_purchase_bronze),
    ("purchase_extra_info_bronze", df_purchase_extra_info_bronze),
    ("product_item_bronze", df_product_item_bronze),
):
    frame.createOrReplaceTempView(view_name)

# Bronze

In [6]:
# Preview the bronze views (show() prints the first rows of each result).
for query in (
    "SELECT * FROM purchase_bronze",
    "SELECT * FROM purchase_extra_info_bronze",
    "SELECT * FROM product_item_bronze",
):
    spark.sql(query).show()

+--------------------+-----------+--------+------------+----------+------------+-----------+--------------------+----------------+
|transaction_datetime|purchase_id|buyer_id|prod_item_id|order_date|release_date|producer_id|      ingestion_date|transaction_date|
+--------------------+-----------+--------+------------+----------+------------+-----------+--------------------+----------------+
| 2023-02-05 10:00:00|         55|  160001|           5|2023-01-20|  2023-01-20|     852852|2026-01-14 23:08:...|      2023-02-05|
| 2023-02-05 10:00:00|         55|  160001|           5|2023-01-20|  2023-01-20|     852852|2026-01-15 00:26:...|      2023-02-05|
| 2023-07-15 09:00:00|         55|  160001|           5|2023-01-20|  2023-03-01|     852852|2026-01-14 23:08:...|      2023-07-15|
| 2023-07-15 09:00:00|         55|  160001|           5|2023-01-20|  2023-03-01|     852852|2026-01-15 00:26:...|      2023-07-15|
| 2023-01-20 22:00:00|         55|   15947|           5|2023-01-20|  2023-01-20|   

# Silver

In [7]:
# Load the three silver-layer tables from the data lake.
# Each DataFrame must be read from the folder that matches its name: the
# temp views registered from these frames are named after the variables, so a
# mismatched path silently serves the wrong table under the wrong view name.
df_purchase_silver = spark.read.parquet("data_lake/silver/purchase")
df_purchase_extra_info_silver = spark.read.parquet("data_lake/silver/purchase_extra_info")
df_product_item_silver = spark.read.parquet("data_lake/silver/product_item")

In [8]:
# Register every silver DataFrame as a temporary view so it can be queried
# with spark.sql(); an existing view with the same name is replaced.
for view_name, frame in (
    ("purchase_silver", df_purchase_silver),
    ("purchase_extra_info_silver", df_purchase_extra_info_silver),
    ("product_item_silver", df_product_item_silver),
):
    frame.createOrReplaceTempView(view_name)

In [9]:
# Preview the silver views, grouped by purchase and newest transaction first
# within each purchase.
for query in (
    "SELECT * FROM purchase_silver ORDER BY purchase_id, transaction_datetime DESC",
    "SELECT * FROM purchase_extra_info_silver ORDER BY purchase_id, transaction_datetime DESC",
    "SELECT * FROM product_item_silver ORDER BY purchase_id, transaction_datetime DESC",
):
    spark.sql(query).show()

+--------------------+-----------+--------+------------+----------+------------+-----------+---------------+--------------------+---------------------+---------+----------------+
|transaction_datetime|purchase_id|buyer_id|prod_item_id|order_date|release_date|producer_id|invoiced_status|     line_created_at|bronze_ingestion_date|is_latest|transaction_date|
+--------------------+-----------+--------+------------+----------+------------+-----------+---------------+--------------------+---------------------+---------+----------------+
| 2023-07-15 09:00:00|         55|  160001|           5|2023-01-20|  2023-03-01|     852852|       Invoiced|2026-01-14 23:17:...| 2026-01-14 23:08:...|     true|      2023-07-15|
| 2023-02-05 10:00:00|         55|  160001|           5|2023-01-20|  2023-01-20|     852852|       Invoiced|2026-01-14 23:17:...| 2026-01-14 23:08:...|    false|      2023-02-05|
| 2023-01-20 22:00:00|         55|   15947|           5|2023-01-20|  2023-01-20|     852852|       Invoic

In [10]:
# Load the gold GVM table (parquet) and expose it to Spark SQL as "gvm_gold".
df_gvm_gold = spark.read.format("parquet").load("data_lake/gold/gvm")
df_gvm_gold.createOrReplaceTempView("gvm_gold")

In [15]:

# GVM (sum of item_quantity * purchase_value) per transaction date and
# subsidiary, computed only over rows flagged as the current snapshot.
# Keep the DataFrame in df_new_gvm and display it in a separate call:
# DataFrame.show() returns None, so chaining it onto the assignment would
# leave df_new_gvm unusable for any later step.
df_new_gvm = spark.sql("""
    SELECT 
        transaction_date,
        subsidiary,
        SUM(item_quantity * purchase_value) AS GVM
    FROM gvm_gold 
    WHERE current_snapshot = TRUE
    GROUP BY 1,2
""")
df_new_gvm.show()

+----------------+-------------+-------+
|transaction_date|   subsidiary|    GVM|
+----------------+-------------+-------+
|      2023-02-26|internacional|4000.00|
|      2023-07-15|     nacional| 550.00|
+----------------+-------------+-------+

