In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import *

In [0]:
# Remove every widget currently attached to this notebook before the widgets
# are (re)declared in the following cell.
# NOTE(review): Databricks documents that a widget cannot be created in the same
# cell that removes one, which is presumably why this call has its own cell — confirm.
dbutils.widgets.removeAll()

In [0]:
# Notebook parameters.
# Each text widget is declared with (name, default value, label) and its value
# is read back right away so the rest of the notebook works with plain strings.

dbutils.widgets.text("silver_schema", "adventureworks.silver", "Silver Schema")
dbutils.widgets.text("gold_schema", "adventureworks.gold", "Gold Schema")

silver_schema = dbutils.widgets.get("silver_schema")
gold_schema = dbutils.widgets.get("gold_schema")

# Qualified names of the input tables, all resolved under the silver schema.
sales_detail = f"{silver_schema}.sales_order_detail"
sales_header = f"{silver_schema}.sales_order_header"
sales_product = f"{silver_schema}.products"

# Qualified name of the output table under the gold schema.
gold_table = f"{gold_schema}.sales"

In [0]:
# Load the three input tables and preview each one as soon as it is read.

# Order detail lines.
df_detail = spark.table(sales_detail)
display(df_detail)

# Order headers.
df_header = spark.table(sales_header)
display(df_header)

# Product table.
df_product = spark.table(sales_product)
display(df_product)

In [0]:
# Example aggregation: per-product sales totals.
#
# Detail lines are joined to their order header (for customer_id) and to the
# product table (for the product name). Both joins are inner, so detail rows
# without a matching header or product are dropped. The result has one row per
# (product_id, name) pair with:
#   total_sales_amount   sum of order_qty * unit_price, rounded to 2 decimals
#   total_quantity_sold  sum of order_qty
#   unique_customers     distinct customer_id count across the matching orders
#
# NOTE(review): the amount is order_qty * unit_price exactly as written; if the
# detail table also carries a discount column it is not applied here — confirm
# that this is intended.
df_joined = (
    df_detail.alias("d")
    .join(df_header.alias("h"), F.col("d.sales_order_id") == F.col("h.sales_order_id"), "inner")
    .join(df_product.alias("p"), F.col("d.product_id") == F.col("p.product_id"), "inner")
    .groupBy(F.col("d.product_id"), F.col("p.name"))
    .agg(
        F.round(F.sum(F.col("d.order_qty") * F.col("d.unit_price")), 2).alias("total_sales_amount"),
        F.sum(F.col("d.order_qty")).alias("total_quantity_sold"),
        F.countDistinct(F.col("h.customer_id")).alias("unique_customers"),
    )
    # Load timestamp. The previous to_utc_timestamp(..., "UTC") wrapper converted
    # from UTC to UTC and therefore changed nothing, so current_timestamp() is
    # used directly; how the value is rendered depends on the session time zone.
    .withColumn("ingestion_timestamp_utc", F.current_timestamp())
)

# Sort only for the preview: df_joined is later used as a MERGE source, where
# row order has no effect, so the global sort is kept out of that plan.
display(df_joined.orderBy(F.col("total_sales_amount").desc()))

In [0]:
from delta.tables import DeltaTable

# Handle on the gold Delta table, looked up by its qualified name.
deltaTable = DeltaTable.forName(spark, gold_table)

# Row count ahead of the upsert, printed so it can be compared with the
# count taken afterwards.
before_count = spark.table(gold_table).count()
print(f"Rows before merge: {before_count}")

# Upsert keyed on product_id: a target row whose product_id matches a source
# row has all its columns overwritten from the source; source rows without a
# match are inserted. Target rows with no matching source row are left as is.
(
    deltaTable.alias("target")
    .merge(df_joined.alias("source"), "target.product_id = source.product_id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Row count once the upsert has completed.
after_count = spark.table(gold_table).count()
print(f"Rows after merge: {after_count}")

In [0]:
from pyspark.sql.functions import desc

# Sanity check: show one row carrying the latest ingestion timestamp in the
# gold table. Rows that share that timestamp are not ordered any further, so
# which of them is shown is not determined by this query.
gold_rows = spark.table(gold_table)
newest_first = gold_rows.orderBy(desc("ingestion_timestamp_utc"))
most_recent_row = newest_first.limit(1)
display(most_recent_row)