In [None]:
from datetime import datetime, timezone

from pyspark.sql import functions as F
from pyspark.sql.window import Window


# Table names are fully qualified as <catalog>.<schema>.<table>.
CATALOG = "main"
SCHEMA = "retail_p1"
NAMESPACE = ".".join((CATALOG, SCHEMA))

# Source (bronze) tables.
BRONZE_ORDERS_TABLE = NAMESPACE + ".bronze_orders"
BRONZE_CUSTOMERS_TABLE = NAMESPACE + ".bronze_customers"
BRONZE_PRODUCTS_TABLE = NAMESPACE + ".bronze_products"

# Target (silver) tables.
SILVER_ORDERS_TABLE = NAMESPACE + ".silver_orders_clean"
SILVER_ORDERS_REJECTS_TABLE = NAMESPACE + ".silver_orders_rejects"
SILVER_CUSTOMERS_SCD2_TABLE = NAMESPACE + ".silver_customers_scd2"
SILVER_PRODUCTS_LATEST_TABLE = NAMESPACE + ".silver_products_latest"


def get_widget(name: str, default: str) -> str:
    """Return the notebook widget value for ``name``, falling back to ``default``.

    A text widget called ``name`` is registered (pre-filled with ``default``)
    and then read back. ``default`` is returned when the stripped value is
    empty, or when any step raises -- for example when ``dbutils`` is not
    defined in the current environment.

    Args:
        name: Widget name to register and read.
        default: Value used to pre-fill the widget and as the fallback.

    Returns:
        The stripped widget value, or ``default``.
    """
    try:
        dbutils.widgets.text(name, default)
        raw = dbutils.widgets.get(name)
        cleaned = raw.strip()
    except Exception:
        # Best-effort lookup: any failure falls back to the default.
        return default
    return cleaned if cleaned else default


# The batch to process comes from the "batch_id" widget; when that yields
# nothing, today's UTC date (YYYY-MM-DD) is used instead.
_default_batch_id = datetime.now(timezone.utc).strftime("%Y-%m-%d")
BATCH_ID = get_widget("batch_id", _default_batch_id)

# Make sure the target schema exists before any table is written.
_create_schema_sql = f"CREATE SCHEMA IF NOT EXISTS {NAMESPACE}"
spark.sql(_create_schema_sql)




In [None]:
# --- Silver orders: cast, validate, split off rejects, deduplicate, write ---

# Cast the bronze columns to their working types; status is lower-cased.
_bronze_orders = spark.table(BRONZE_ORDERS_TABLE)
orders_typed = _bronze_orders.select(
    F.col("order_id"),
    F.col("customer_id"),
    F.col("product_id"),
    F.to_timestamp("order_ts").alias("order_ts"),
    F.col("quantity").cast("int").alias("quantity"),
    F.col("price").cast("double").alias("price"),
    F.lower(F.col("status")).alias("status"),
    F.col("channel").cast("string").alias("channel"),
    F.col("_batch_id").cast("string").alias("_batch_id"),
    F.to_timestamp("_ingest_ts").alias("_ingest_ts"),
)

# Validation rules as (failure condition, reason label) pairs. The order here
# is the order in which labels appear inside reject_reason.
_order_rules = [
    (F.col("order_id").isNull(), "order_id_null"),
    (F.col("customer_id").isNull(), "customer_id_null"),
    (F.col("product_id").isNull(), "product_id_null"),
    (F.col("order_ts").isNull(), "order_ts_null_or_invalid"),
    (F.col("quantity").isNull(), "quantity_null"),
    (F.col("quantity") <= 0, "quantity_non_positive"),
    (F.col("price").isNull(), "price_null"),
    (F.col("price") < 0, "price_negative"),
]

# A row is invalid as soon as any single rule fires (left-to-right OR chain).
invalid_conditions = _order_rules[0][0]
for _rule_condition, _rule_label in _order_rules[1:]:
    invalid_conditions = invalid_conditions | _rule_condition

# Each rule yields its label when it fires and NULL otherwise; concat_ws
# skips the NULLs, leaving a ';'-separated list of the reasons that applied.
_reason_parts = [F.when(condition, F.lit(label)) for condition, label in _order_rules]
_flagged = orders_typed.withColumn("_is_invalid", invalid_conditions)
orders_validated = _flagged.withColumn("reject_reason", F.concat_ws(";", *_reason_parts))

# Invalid rows keep their reject_reason and get a rejection timestamp.
_invalid_rows = orders_validated.where(F.col("_is_invalid"))
orders_rejects = _invalid_rows.drop("_is_invalid").withColumn(
    "rejected_at", F.current_timestamp()
)

# Valid rows lose both helper columns.
_valid_rows = orders_validated.where(~F.col("_is_invalid"))
orders_valid = _valid_rows.drop("_is_invalid", "reject_reason")

# Keep one row per (order_id, product_id): the one with the latest ingest
# timestamp, then the highest batch id; NULLs sort last in both keys.
dedup_window = Window.partitionBy("order_id", "product_id").orderBy(
    F.col("_ingest_ts").desc_nulls_last(),
    F.col("_batch_id").desc_nulls_last(),
)
_ranked_orders = orders_valid.withColumn("_rn", F.row_number().over(dedup_window))
silver_orders_clean = _ranked_orders.where(F.col("_rn") == 1).drop("_rn")

# Both outputs fully replace their Delta tables, schema included.
# Clean orders are written first, then the rejects.
for _frame, _target_table in (
    (silver_orders_clean, SILVER_ORDERS_TABLE),
    (orders_rejects, SILVER_ORDERS_REJECTS_TABLE),
):
    _writer = _frame.write.format("delta").mode("overwrite")
    _writer.option("overwriteSchema", "true").saveAsTable(_target_table)




In [None]:
# --- Silver products: cast, keep the latest row per product, write ---

# Cast the bronze columns to their working types.
_bronze_products = spark.table(BRONZE_PRODUCTS_TABLE)
_product_columns = [
    F.col("product_id"),
    *(F.col(name).cast("string").alias(name) for name in ("category", "brand")),
    F.col("list_price").cast("double").alias("list_price"),
    F.to_timestamp("updated_at").alias("updated_at"),
    F.col("_batch_id").cast("string").alias("_batch_id"),
    F.to_timestamp("_ingest_ts").alias("_ingest_ts"),
]
products_typed = _bronze_products.select(*_product_columns)

# Rank each product's rows newest-first: by updated_at, then by ingest
# timestamp, with NULLs sorted last in both keys.
_newest_first = (
    F.col("updated_at").desc_nulls_last(),
    F.col("_ingest_ts").desc_nulls_last(),
)
products_window = Window.partitionBy("product_id").orderBy(*_newest_first)

# Only the top-ranked row per product_id survives.
_ranked_products = products_typed.withColumn(
    "_rn", F.row_number().over(products_window)
)
silver_products_latest = _ranked_products.where(F.col("_rn") == 1).drop("_rn")

# Fully replace the Delta table, schema included.
_products_writer = silver_products_latest.write.format("delta").mode("overwrite")
_products_writer.option("overwriteSchema", "true").saveAsTable(
    SILVER_PRODUCTS_LATEST_TABLE
)




In [None]:
# --- Silver customers: history table with valid_from / valid_to / is_current ---
#
# Flow:
#   1. Ensure the target Delta table exists.
#   2. Stage the latest row per customer_id from bronze and hash its tracked
#      attributes (email, city, country).
#   3. MERGE: close the current target row when its hash differs from staging.
#   4. INSERT: add a new current row for every staged customer that has no
#      current target row with the same hash (new or just-closed customers).
#
# Steps 3 and 4 are separate statements. They share ONE effective timestamp so
# that the closed row's valid_to equals the replacement row's valid_from.
# Calling current_timestamp() in each statement would leave a gap between the
# two versions in which point-in-time lookups match no row.

spark.sql(
    f"""
    CREATE TABLE IF NOT EXISTS {SILVER_CUSTOMERS_SCD2_TABLE} (
      customer_sk STRING,
      customer_id STRING,
      email STRING,
      city STRING,
      country STRING,
      valid_from TIMESTAMP,
      valid_to TIMESTAMP,
      is_current BOOLEAN,
      change_hash STRING
    ) USING DELTA
    """
)

# Single effective timestamp for this run, captured in UTC and rendered with
# an explicit +00:00 offset so its meaning does not depend on the Spark
# session time zone.
effective_ts = datetime.now(timezone.utc).isoformat(sep=" ", timespec="microseconds")
effective_ts_sql = f"TIMESTAMP '{effective_ts}'"

# Prefer the rows of the requested batch; when that batch has no rows at all,
# fall back to the full bronze table.
customers_bronze = spark.table(BRONZE_CUSTOMERS_TABLE)
customers_stage = customers_bronze.filter(F.col("_batch_id") == BATCH_ID)
if customers_stage.limit(1).count() == 0:
    customers_stage = customers_bronze

customers_stage_latest = (
    customers_stage.select(
        "customer_id",
        "email",
        "city",
        "country",
        F.to_timestamp("updated_at").alias("updated_at"),
        F.to_timestamp("_ingest_ts").alias("_ingest_ts"),
    )
    # Rows without a customer_id cannot be matched against the target.
    .filter(F.col("customer_id").isNotNull())
    # Keep the newest row per customer: by updated_at, then by ingest
    # timestamp, NULLs last in both keys.
    .withColumn(
        "_rn",
        F.row_number().over(
            Window.partitionBy("customer_id").orderBy(
                F.col("updated_at").desc_nulls_last(),
                F.col("_ingest_ts").desc_nulls_last(),
            )
        ),
    )
    .filter(F.col("_rn") == 1)
    .drop("_rn", "updated_at", "_ingest_ts")
    # SHA-256 over the tracked attributes; NULLs are hashed as empty strings
    # so every attribute always contributes a position to the hash input.
    .withColumn(
        "change_hash",
        F.sha2(
            F.concat_ws(
                "||",
                F.coalesce(F.col("email"), F.lit("")),
                F.coalesce(F.col("city"), F.lit("")),
                F.coalesce(F.col("country"), F.lit("")),
            ),
            256,
        ),
    )
)

customers_stage_latest.createOrReplaceTempView("stg_customers_batch")

# Step 3: close the current version of every customer whose hash changed.
spark.sql(
    f"""
    MERGE INTO {SILVER_CUSTOMERS_SCD2_TABLE} tgt
    USING stg_customers_batch src
    ON tgt.customer_id = src.customer_id
      AND tgt.is_current = true
    WHEN MATCHED AND coalesce(tgt.change_hash, '') <> coalesce(src.change_hash, '')
      THEN UPDATE SET
        tgt.valid_to = {effective_ts_sql},
        tgt.is_current = false
    """
)

# Step 4: insert a new current version for each staged customer that has no
# current row with an identical hash. After step 3 that covers brand-new
# customers and customers whose previous version was just closed; unchanged
# customers still join to their current row and are filtered out.
spark.sql(
    f"""
    INSERT INTO {SILVER_CUSTOMERS_SCD2_TABLE} (
      customer_sk,
      customer_id,
      email,
      city,
      country,
      valid_from,
      valid_to,
      is_current,
      change_hash
    )
    SELECT
      uuid() AS customer_sk,
      src.customer_id,
      src.email,
      src.city,
      src.country,
      {effective_ts_sql} AS valid_from,
      CAST(NULL AS TIMESTAMP) AS valid_to,
      true AS is_current,
      src.change_hash
    FROM stg_customers_batch src
    LEFT JOIN {SILVER_CUSTOMERS_SCD2_TABLE} tgt
      ON tgt.customer_id = src.customer_id
      AND tgt.is_current = true
      AND coalesce(tgt.change_hash, '') = coalesce(src.change_hash, '')
    WHERE tgt.customer_id IS NULL
    """
)

print(f"[DONE] Built Silver tables for batch_id={BATCH_ID}")
