In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import *

In [0]:
# Set adventureworks.silver as the session's current catalog and schema.
for use_statement in ("USE CATALOG adventureworks", "USE SCHEMA silver"):
    spark.sql(use_statement)

In [0]:
# Clear any widgets already attached to this notebook before they are (re)created.
# NOTE(review): this call sits in its own cell, apart from the widget definitions —
# presumably deliberate; confirm before merging the two cells.
dbutils.widgets.removeAll()

In [0]:
# Source layer: a text widget (default "adventureworks.bronze") selects the schema,
# and the bronze table name is built from its value.
dbutils.widgets.text("bronze_schema", "adventureworks.bronze", "Bronze Schema")
bronze_schema = dbutils.widgets.get("bronze_schema")
bronze_table = f"{bronze_schema}.sales_order_detail"

# Target layer: same pattern, default "adventureworks.silver".
dbutils.widgets.text("silver_schema", "adventureworks.silver", "Silver Schema")
silver_schema = dbutils.widgets.get("silver_schema")
silver_table = f"{silver_schema}.sales_order_detail"

In [0]:
# Load the bronze sales_order_detail table and preview it.
df = spark.table(bronze_table)
display(df)

## Data Cleaning

### Drop Nulls

Remove rows that are entirely null, as well as rows with no `sales_order_detail_id`.

In [0]:
# Discard rows where every column is null, then rows whose
# sales_order_detail_id key is null.
df = (
    df.na.drop(how="all")
    .where(F.col("sales_order_detail_id").isNotNull())
)

### Drop Duplicates

Keep a single row per `sales_order_detail_id`.

In [0]:
# Keep one row per sales_order_detail_id.
# NOTE(review): no ordering is applied beforehand, so which duplicate survives
# is presumably arbitrary — confirm that is acceptable.
df = df.dropDuplicates(subset=["sales_order_detail_id"])

## Data Enrichment

In [0]:
# Expose unit_price_discount under the name product_discount_percentage.
df = df.withColumnRenamed(
    existing="unit_price_discount",
    new="product_discount_percentage",
)

In [0]:
# Attach to every row the count of product_id values found in its sales order
# (F.count skips nulls), computed over a window partitioned by sales_order_id.
order_window = Window.partitionBy("sales_order_id")
df = df.withColumn("order_total_products", F.count("product_id").over(order_window))

In [0]:
# Per-line discount: unit_price * order_qty * product_discount_percentage,
# rounded to 4 decimal places.
gross_amount = F.col("unit_price") * F.col("order_qty")
discount_amount = gross_amount * F.col("product_discount_percentage")
df = df.withColumn("product_discount", F.round(discount_amount, 4))


In [0]:
# Sum line_total across each sales order and repeat that sum on every row
# belonging to the order.
order_window = Window.partitionBy("sales_order_id")
df = df.withColumn("order_total", F.sum("line_total").over(order_window))

In [0]:
# Sum the per-line product_discount across each sales order and repeat that
# sum on every row belonging to the order.
order_window = Window.partitionBy("sales_order_id")
df = df.withColumn("order_total_discount", F.sum("product_discount").over(order_window))

In [0]:
# Stamp every row with the time of this run.
# current_timestamp() is used on its own: wrapping it in
# to_utc_timestamp(..., "UTC") would convert from UTC to UTC, which shifts nothing
# and only suggests a timezone conversion that never happens.
df = df.withColumn("ingestion_timestamp_utc", F.current_timestamp())

## Upsert

In [0]:
from delta.tables import DeltaTable

# Handle on the silver Delta table that receives the upsert
deltaTable = DeltaTable.forName(spark, silver_table)

# Row count of the silver table prior to the merge
before_count = spark.read.table(silver_table).count()
print(f"Rows before merge: {before_count}")

# Upsert keyed on sales_order_detail_id: rows whose key already exists in the
# target have all columns updated from the source; the rest are inserted.
merge_condition = "target.sales_order_detail_id = source.sales_order_detail_id"
(
    deltaTable.alias("target")
    .merge(df.alias("source"), merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Row count of the silver table once the merge has run
after_count = spark.read.table(silver_table).count()
print(f"Rows after merge: {after_count}")

In [0]:
from pyspark.sql.functions import desc

# Show one silver row carrying the latest ingestion_timestamp_utc.
# NOTE(review): rows written by the same run presumably share one timestamp,
# so the row shown may be an arbitrary pick among ties — confirm if that matters.
most_recent_row = (
    spark.table(silver_table)
    .sort(desc("ingestion_timestamp_utc"))
    .limit(1)
)
display(most_recent_row)