In [0]:
# Point the Spark session at the adventureworks catalog and its bronze schema,
# issuing the two USE statements in order (catalog first, then schema).
for use_statement in ("USE CATALOG adventureworks", "USE SCHEMA bronze"):
    spark.sql(use_statement)

In [0]:
# Look up the external storage location; it is kept as a secret in the
# "adventureworks-secret-scope" scope rather than hard-coded in the notebook.
blob_path = dbutils.secrets.get(
    scope="adventureworks-secret-scope",
    key="adventureworks-external-location",
)

# Full path of the sales order header CSV under that external location (Data Lake).
# NOTE(review): assumes the stored location has no trailing "/" — confirm.
source_path = "/".join([blob_path, "Sales SalesOrderHeader.csv"])

In [0]:
# Drop every widget currently attached to this notebook so the widget
# definitions in the next cell are created fresh.
# NOTE(review): Databricks documents that widgets cannot be created in the same
# cell that calls removeAll(), which is presumably why this call sits alone.
dbutils.widgets.removeAll()

In [0]:
# Expose the target schema as a notebook text widget:
# name "bronze_schema", default "adventureworks.bronze", label "Bronze Schema".
dbutils.widgets.text(
    "bronze_schema",
    "adventureworks.bronze",
    "Bronze Schema",
)

# Read the widget's current value and derive the fully qualified target table name.
bronze_schema = dbutils.widgets.get("bronze_schema")
bronze_table = f"{bronze_schema}.sales_order_header"

## Read CSV file into a DataFrame

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from pyspark.sql.functions import current_timestamp, to_utc_timestamp

# Explicit schema for the sales order header file: (column name, Spark type),
# listed in the order the columns are applied to the CSV.
sales_columns = [
    ("sales_order_id", IntegerType()),
    ("revision_number", IntegerType()),
    ("order_date", TimestampType()),
    ("due_date", TimestampType()),
    ("ship_date", TimestampType()),
    ("status", IntegerType()),
    ("online_order_flag", IntegerType()),
    ("sales_order_number", StringType()),
    ("purchase_order_number", StringType()),
    ("account_number", StringType()),
    ("customer_id", IntegerType()),
    ("sales_person_id", IntegerType()),
    ("territory_id", IntegerType()),
    ("bill_to_address_id", IntegerType()),
    ("ship_to_address_id", IntegerType()),
    ("ship_method_id", IntegerType()),
    ("credit_card_id", IntegerType()),
    ("credit_card_approval_code", StringType()),
    ("currency_rate_id", IntegerType()),
    ("sub_total", DoubleType()),
    ("tax_amt", DoubleType()),
    ("freight", DoubleType()),
    ("total_due", DoubleType()),
    ("comment", StringType()),
    ("rowguid", StringType()),
    ("modified_date", TimestampType()),
]

# Every field is declared non-nullable (third StructField argument is False).
# NOTE(review): Spark file sources may treat a user-supplied schema as nullable
# regardless of this flag — confirm whether it has any effect here, and whether
# optional-looking columns (e.g. ship_date, comment) can actually be empty.
sales_schema = StructType(
    [
        StructField(column_name, column_type, False)
        for column_name, column_type in sales_columns
    ]
)

# Configure a CSV reader: first line is a header, fields are comma-separated,
# and the explicit schema above is used instead of schema inference.
csv_reader = (
    spark.read.format("csv")
    .option("header", True)
    .option("delimiter", ",")
    .schema(sales_schema)
)
raw_df = csv_reader.load(source_path)

# Stamp each row with the load time.
# NOTE(review): to_utc_timestamp(..., "UTC") looks like a no-op conversion on top
# of current_timestamp() — confirm whether the wrapper is needed.
ingestion_ts = to_utc_timestamp(current_timestamp(), "UTC")
df = raw_df.withColumn("ingestion_timestamp_utc", ingestion_ts)

display(df)

## Upsert Data

In [0]:
from delta.tables import DeltaTable

# Rows are matched between the bronze table ("target") and the freshly read
# DataFrame ("source") on the sales order id.
merge_condition = "target.sales_order_id = source.sales_order_id"

# Handle to the existing Delta table named by bronze_table
delta_table = DeltaTable.forName(spark, bronze_table)

# Row count of the table prior to the upsert, for comparison afterwards
before_count = spark.table(bronze_table).count()
print(f"Rows before merge: {before_count}")

# Upsert: matched rows have all columns overwritten from the source,
# unmatched source rows are inserted as new rows.
# NOTE(review): Delta documents that a merge can fail when several source rows
# match the same target row — assumes sales_order_id is unique in the file; confirm.
merge_builder = (
    delta_table.alias("target")
    .merge(df.alias("source"), merge_condition)
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
)
merge_builder.execute()

# Row count of the table once the upsert has completed
after_count = spark.table(bronze_table).count()
print(f"Rows after merge: {after_count}")

In [0]:
from pyspark.sql.functions import desc

# Show one row carrying the latest ingestion timestamp in the bronze table.
# NOTE(review): rows written by the same merge presumably share one timestamp,
# so which of those rows is returned is not deterministic — confirm if that matters.
bronze_df = spark.table(bronze_table)
most_recent_row = bronze_df.sort(desc("ingestion_timestamp_utc")).limit(1)
display(most_recent_row)