In [0]:
# Make adventureworks.bronze the session's default namespace so that
# unqualified table names used later resolve against it.
for _use_statement in ("USE CATALOG adventureworks", "USE SCHEMA bronze"):
    spark.sql(_use_statement)

In [0]:
# Fetch the external-storage base path from the Databricks secret scope
# (presumably Key Vault-backed, per the original comment — TODO confirm).
blob_path = dbutils.secrets.get(
    scope="adventureworks-secret-scope",
    key="adventureworks-external-location",
)

# Build the full path of the product CSV; the file itself is read later.
source_path = "{}/Production Product.csv".format(blob_path)

In [0]:
# Remove every widget currently attached to this notebook so the widget
# definitions in the next cell start from a clean slate.
# NOTE(review): kept in its own cell, presumably because Databricks does not
# allow creating a widget in the same cell that removed it — confirm.
dbutils.widgets.removeAll()

In [0]:
# Expose the target schema as a text widget so it can be overridden per run;
# the default points at adventureworks.bronze.
dbutils.widgets.text(
    "bronze_schema",
    "adventureworks.bronze",
    "Bronze Schema",
)

# Resolve the widget value and derive the fully qualified products table name.
bronze_schema = dbutils.widgets.get("bronze_schema")
bronze_table = f"{bronze_schema}.products"

## Read the CSV file into a DataFrame

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
from pyspark.sql.functions import current_timestamp, to_utc_timestamp

# Explicit column layout for the product CSV, so no schema inference is needed.
# Only the first three columns are declared non-nullable; the rest use the
# default (nullable).
# NOTE(review): the name `sales_schema` is kept for compatibility, although the
# columns describe products — consider renaming once all usages are known.
sales_schema = (
    StructType()
    .add("product_id", IntegerType(), False)
    .add("name", StringType(), False)
    .add("product_number", StringType(), False)
    .add("make_flag", IntegerType())
    .add("finished_goods_flag", IntegerType())
    .add("color", StringType())
    .add("safety_stock_level", IntegerType())
    .add("reorder_point", IntegerType())
    .add("standard_cost", DoubleType())
    .add("list_price", DoubleType())
    .add("size", StringType())
    .add("size_unit_measure_code", StringType())
    .add("weight_unit_measure_code", StringType())
    .add("weight", DoubleType())
    .add("days_to_manufacture", IntegerType())
    .add("product_line", StringType())
    .add("class", StringType())
    .add("style", StringType())
    .add("product_subcategory_id", IntegerType())
    .add("product_model_id", IntegerType())
    .add("sell_start_date", TimestampType())
    .add("sell_end_date", TimestampType())
    .add("discontinued_date", TimestampType())
    .add("rowguid", StringType())
    .add("modified_date", TimestampType())
)

# Parse the comma-delimited CSV (first line is the header) with the schema
# above, then stamp every row with the load time.
df = (
    spark.read
    .schema(sales_schema)
    .options(header=True, delimiter=",")
    .csv(source_path)
    .withColumn(
        "ingestion_timestamp_utc",
        to_utc_timestamp(current_timestamp(), "UTC"),
    )
)

display(df)

## Upsert Data

In [0]:
from delta.tables import DeltaTable

# Upsert the freshly loaded product rows into the bronze Delta table, keyed on
# product_id, and report the table's row count before and after.

# Get a reference to the target Delta table
delta_table = DeltaTable.forName(spark, bronze_table)

# Sanitise the merge source on its key before merging:
#  - a NULL product_id can never satisfy the merge condition (NULL = NULL is
#    not true), so such rows would be inserted again on every run;
#  - duplicate product_ids either abort the merge (several source rows matching
#    one target row) or insert duplicate keys into the target.
# NOTE(review): dropDuplicates keeps an arbitrary row per product_id — replace
# with a deterministic rule (e.g. latest modified_date) if that matters.
merge_source = df.dropna(subset=["product_id"]).dropDuplicates(["product_id"])

# Make the sanitising visible instead of silently dropping rows
skipped_rows = df.count() - merge_source.count()
if skipped_rows:
    print(f"Skipped {skipped_rows} source rows with NULL or duplicate product_id")

# Count rows before merge
before_count = spark.table(bronze_table).count()
print(f"Rows before merge: {before_count}")

# Perform merge (upsert) operation: update rows whose product_id already
# exists in the target, insert the rest
(
    delta_table.alias("target")
    .merge(
        merge_source.alias("source"),
        "target.product_id = source.product_id"
    )
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Count rows after merge
after_count = spark.table(bronze_table).count()
print(f"Rows after merge: {after_count}")

In [0]:
from pyspark.sql.functions import desc

# Sanity check: show one row carrying the latest ingestion timestamp.
# NOTE(review): rows written by the same load presumably share one timestamp,
# so which of them is returned is arbitrary — confirm if that matters.
most_recent_row = (
    spark.table(bronze_table)
    .orderBy("ingestion_timestamp_utc", ascending=False)
    .limit(1)
)
display(most_recent_row)