In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.window import *

In [0]:
# Point the session at the `adventureworks` catalog and its `silver` schema.
# The widget-driven table names used later are already fully qualified, so this
# mainly affects any unqualified table references.
use_catalog_stmt = "USE CATALOG adventureworks"
use_schema_stmt = "USE SCHEMA silver"

spark.sql(use_catalog_stmt)
spark.sql(use_schema_stmt)

In [0]:
# Clear every widget currently attached to the notebook so the definitions in
# the next cell start from a clean slate.
# NOTE(review): kept in its own cell on purpose — Databricks documents that a
# widget cannot be created in the same cell that removes widgets; confirm
# before merging this with the cell below.
dbutils.widgets.removeAll()

In [0]:
# Notebook parameters: (widget name, default value, UI label).
# Each widget holds a fully qualified `<catalog>.<schema>` prefix.
for widget_name, default_value, widget_label in (
    ("bronze_schema", "adventureworks.bronze", "Bronze Schema"),
    ("silver_schema", "adventureworks.silver", "Silver Schema"),
):
    dbutils.widgets.text(widget_name, default_value, widget_label)

# Read back the effective values (defaults unless overridden by the caller).
bronze_schema = dbutils.widgets.get("bronze_schema")
silver_schema = dbutils.widgets.get("silver_schema")

# Source and target both use the same table name, in different schemas.
bronze_table = f"{bronze_schema}.sales_order_header"
silver_table = f"{silver_schema}.sales_order_header"

In [0]:
# Load the bronze sales-order-header table and preview it.
df = spark.table(bronze_table)
display(df)

## Data Cleaning

Drop Nulls: remove rows that are entirely empty and rows that have no `sales_order_id`.

In [0]:
# Remove unusable rows in two steps:
#   1. rows in which every column is missing;
#   2. rows without a `sales_order_id`, which is the key used for
#      de-duplication and for the merge further down.
df = (
    df.na.drop(how="all")
    .where(F.col("sales_order_id").isNotNull())
)

Drop Duplicates: keep a single row per `sales_order_id`.

In [0]:
# Keep one row per `sales_order_id`.
# NOTE(review): no ordering is applied first, so which of several duplicate
# rows survives is presumably arbitrary — confirm that is acceptable.
df = df.dropDuplicates(["sales_order_id"])

## Data Enrichment

In [0]:
# Attach to every row the number of orders (non-null `sales_order_id` values)
# that share its `customer_id`. The count is taken over the rows currently in
# `df`, i.e. the cleaned bronze data, not over the silver table.
orders_per_customer_window = Window.partitionBy("customer_id")
orders_per_customer = F.count("sales_order_id").over(orders_per_customer_window)

df = df.withColumn("total_orders_by_customer", orders_per_customer)

In [0]:
# Days elapsed from `order_date` to `ship_date` (ship_date minus order_date).
order_to_ship_days = F.datediff(F.col("ship_date"), F.col("order_date"))

df = df.withColumn("days_between_order_and_ship", order_to_ship_days)

In [0]:
# Stamp every row with the time this run processed it.
# NOTE(review): `to_utc_timestamp(..., "UTC")` treats its input as already being
# in UTC, so it looks like a no-op on top of `current_timestamp()` — confirm
# whether a conversion from the session time zone was intended.
ingestion_ts = F.to_utc_timestamp(F.current_timestamp(), "UTC")

df = df.withColumn("ingestion_timestamp_utc", ingestion_ts)

## Upsert

In [0]:
from delta.tables import DeltaTable

# Handle on the silver Delta table that receives the upsert.
deltaTable = DeltaTable.forName(spark, silver_table)

# Row count before the merge, printed for a quick before/after comparison.
before_count = spark.read.table(silver_table).count()
print(f"Rows before merge: {before_count}")

# Upsert keyed on `sales_order_id`:
#   - key already present in the target -> overwrite all columns from source;
#   - key not present in the target     -> insert the source row.
# NOTE(review): the *All() variants presumably require the source columns
# (including the enrichment columns added above) to line up with the target
# schema — verify against the silver table definition.
upsert_builder = (
    deltaTable.alias("target")
    .merge(
        df.alias("source"),
        "target.sales_order_id = source.sales_order_id",
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
)
upsert_builder.execute()

# Row count after the merge; the difference is the number of inserted rows
# (updates leave the count unchanged).
after_count = spark.read.table(silver_table).count()
print(f"Rows after merge: {after_count}")

In [0]:
# Sanity check: show the silver row with the latest ingestion timestamp.
newest_first = F.desc("ingestion_timestamp_utc")
most_recent_row = spark.table(silver_table).orderBy(newest_first).limit(1)

display(most_recent_row)