In [None]:
from datetime import datetime, timezone

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Unity Catalog coordinates shared by every object this notebook touches.
CATALOG = "main"
SCHEMA = "retail_p1"
NAMESPACE = ".".join((CATALOG, SCHEMA))

# Root folder of the raw CSV files, inside a volume of the same catalog/schema.
RAW_BASE_PATH = "/".join(("/Volumes", CATALOG, SCHEMA, "raw"))

# Fully qualified names of the bronze tables written at the end of the notebook.
BRONZE_ORDERS_TABLE = NAMESPACE + ".bronze_orders"
BRONZE_CUSTOMERS_TABLE = NAMESPACE + ".bronze_customers"
BRONZE_PRODUCTS_TABLE = NAMESPACE + ".bronze_products"


def get_widget(name: str, default: str) -> str:
    """Read a notebook text widget, falling back to ``default``.

    The widget is registered through ``dbutils.widgets.text`` with ``default``
    as its initial value, then read back and stripped of surrounding whitespace.

    Args:
        name: Widget name to register and read.
        default: Value used to seed the widget and returned as the fallback.

    Returns:
        The stripped widget value, or ``default`` when that value is empty or
        when any step of the widget lookup raises (for example because
        ``dbutils`` is not defined in the current environment).
    """
    try:
        dbutils.widgets.text(name, default)
        cleaned = dbutils.widgets.get(name).strip()
    except Exception:
        # Best-effort by design: any widget failure silently yields the default.
        return default

    if cleaned:
        return cleaned
    return default


# Run parameters. The batch id defaults to today's UTC date (YYYY-MM-DD) and the
# source prefix to "olist"; a non-empty widget value overrides either default.
BATCH_ID = get_widget("batch_id", f"{datetime.now(timezone.utc):%Y-%m-%d}")
SOURCE_PREFIX = get_widget("source_prefix", "olist")

# Make sure the target schema exists before any table is written into it.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {NAMESPACE}")


def read_csv(file_name: str):
    """Load one raw CSV file as a Spark DataFrame.

    The file is looked up at ``RAW_BASE_PATH/SOURCE_PREFIX/file_name`` and read
    with the first line treated as a header and column types inferred.

    Args:
        file_name: Name of the CSV file inside the source-prefix folder.

    Returns:
        The DataFrame produced by the Spark CSV reader for that path.
    """
    source_path = "/".join((RAW_BASE_PATH, SOURCE_PREFIX, file_name))
    reader = spark.read.options(header=True, inferSchema=True)
    return reader.csv(source_path)


def table_exists(table_name: str) -> bool:
    """Report whether ``table_name`` is a persisted table in its schema.

    Args:
        table_name: Qualified table name. Everything before the last dot is
            used as the namespace for ``SHOW TABLES``; the last part is used
            as the ``LIKE`` pattern.

    Returns:
        True when ``SHOW TABLES`` lists at least one non-temporary match.

    Raises:
        ValueError: If ``table_name`` contains no dot to split on.
    """
    namespace, object_name = table_name.rsplit(".", 1)
    matches = spark.sql(f"SHOW TABLES IN {namespace} LIKE '{object_name}'")
    # SHOW TABLES can also list session temp views whose name matches the
    # pattern. Those are not tables in `namespace`, so counting them would make
    # callers read a qualified table that does not exist.
    return matches.filter("NOT isTemporary").limit(1).count() > 0


def append_without_same_batch(df, table_name: str, key_columns: list[str]) -> None:
    """Append ``df`` to a Delta table, skipping keys already loaded for this batch.

    Duplicate keys inside ``df`` are collapsed first. If the target table
    exists, rows whose key is already stored under the current ``BATCH_ID`` are
    removed with an anti-join; keys stored under other batch ids are not
    considered. A one-line ``[SKIP]`` or ``[WRITE]`` summary is printed.

    Args:
        df: Incoming DataFrame; must contain every column in ``key_columns``.
        table_name: Qualified name of the target table.
        key_columns: Columns that identify a row within a batch.

    Returns:
        None. Nothing is written when no new rows remain.
    """
    # One row per key within the incoming batch, before comparing with the table.
    candidates = df.dropDuplicates(key_columns)

    if table_exists(table_name):
        loaded_keys = (
            spark.table(table_name)
            .filter(F.col("_batch_id") == BATCH_ID)
            .select(*key_columns)
            .dropDuplicates()
        )
        candidates = candidates.join(loaded_keys, on=key_columns, how="left_anti")

    row_count = candidates.count()
    if row_count > 0:
        writer = candidates.write.format("delta").mode("append")
        writer.saveAsTable(table_name)
        print(f"[WRITE] {table_name}: wrote {row_count} rows for batch_id={BATCH_ID}")
    else:
        print(f"[SKIP] {table_name}: no new rows for batch_id={BATCH_ID}")




In [None]:
# Load the public Olist CSV extracts from the Unity Catalog volume.
# The target names and the file names below are paired by position.
(
    orders_raw,
    order_items_raw,
    payments_raw,
    customers_raw,
    products_raw,
    category_translation_raw,
) = (
    read_csv(csv_name)
    for csv_name in (
        "olist_orders_dataset.csv",
        "olist_order_items_dataset.csv",
        "olist_order_payments_dataset.csv",
        "olist_customers_dataset.csv",
        "olist_products_dataset.csv",
        "product_category_name_translation.csv",
    )
)




In [None]:
# Build bronze_orders: one row per (order_id, product_id) carrying the order
# header fields, the summed item quantity and price, and the payment type of the
# order's top-ranked payment as "channel".

# Rank an order's payments by value, then by payment_sequential, both descending
# with nulls last, so that rank 1 is the payment that defines the channel.
payment_rank_window = Window.partitionBy("order_id").orderBy(
    F.col("payment_value").desc_nulls_last(),
    F.col("payment_sequential").desc_nulls_last(),
)

payments_primary = (
    payments_raw.select("order_id", "payment_type", "payment_value", "payment_sequential")
    .withColumn("_rn", F.row_number().over(payment_rank_window))
    .filter(F.col("_rn") == 1)
    .select(
        "order_id",
        # A top-ranked payment without a type still maps to "online".
        F.coalesce(F.col("payment_type"), F.lit("online")).alias("channel"),
    )
)

# Each order-item row counts as one unit of its product.
order_lines = order_items_raw.select(
    "order_id",
    "product_id",
    F.col("price").cast("double").alias("item_price"),
    F.lit(1).alias("quantity"),
)

bronze_orders = (
    orders_raw.select(
        "order_id",
        "customer_id",
        F.col("order_purchase_timestamp").cast("timestamp").alias("order_ts"),
        F.col("order_status").alias("status"),
    )
    # Inner join: orders without any item row are dropped.
    .join(order_lines, "order_id", "inner")
    # Left join: orders without a payment row get a null channel, defaulted next.
    .join(payments_primary, "order_id", "left")
    .withColumn("channel", F.coalesce(F.col("channel"), F.lit("online")))
    .groupBy("order_id", "customer_id", "product_id", "order_ts", "status", "channel")
    .agg(
        F.sum("quantity").cast("int").alias("quantity"),
        F.round(F.sum("item_price"), 2).cast("double").alias("price"),
    )
    # Fix the output column order and stamp the batch metadata in one projection.
    .select(
        "order_id",
        "customer_id",
        "product_id",
        "order_ts",
        "quantity",
        "price",
        "status",
        "channel",
        F.lit(BATCH_ID).alias("_batch_id"),
        F.current_timestamp().alias("_ingest_ts"),
    )
)




In [None]:
# Olist has no email or country column, so both are synthesized to keep a stable
# bronze contract: email is "<customer_unique_id>@unknown.local" and country is
# the constant "Brazil". Batch metadata is stamped in the same projection.
bronze_customers = customers_raw.select(
    "customer_id",
    F.concat(F.col("customer_unique_id"), F.lit("@unknown.local")).alias("email"),
    F.col("customer_city").alias("city"),
    F.lit("Brazil").alias("country"),
    F.current_timestamp().alias("updated_at"),
    F.lit(BATCH_ID).alias("_batch_id"),
    F.current_timestamp().alias("_ingest_ts"),
)




In [None]:
# Build bronze_products: one row per product row in the source, with an English
# category where a translation exists and a list price derived from order items.

# Raw category name -> English category name.
category_lookup = category_translation_raw.select(
    F.col("product_category_name").alias("raw_category"),
    F.col("product_category_name_english").alias("category"),
)

# Average item price per product, rounded to 2 decimals, used as the list price.
price_lookup = order_items_raw.groupBy("product_id").agg(
    F.round(F.avg(F.col("price").cast("double")), 2).alias("list_price")
)

bronze_products = (
    products_raw.select("product_id", "product_category_name")
    .join(
        category_lookup,
        F.col("product_category_name") == F.col("raw_category"),
        "left",
    )
    .join(price_lookup, "product_id", "left")
    .select(
        "product_id",
        # Fall back to the raw category name when no translation matched.
        F.coalesce(F.col("category"), F.col("product_category_name")).alias("category"),
        F.lit("unknown").alias("brand"),
        # Products that never appear in order items get a list price of 0.0.
        F.coalesce(F.col("list_price"), F.lit(0.0)).alias("list_price"),
        F.current_timestamp().alias("updated_at"),
        F.lit(BATCH_ID).alias("_batch_id"),
        F.current_timestamp().alias("_ingest_ts"),
    )
)




In [None]:
# Write each bronze frame to its table, deduplicating on the listed key columns.
for frame, target_table, keys in (
    (bronze_orders, BRONZE_ORDERS_TABLE, ["order_id", "product_id"]),
    (bronze_customers, BRONZE_CUSTOMERS_TABLE, ["customer_id"]),
    (bronze_products, BRONZE_PRODUCTS_TABLE, ["product_id"]),
):
    append_without_same_batch(frame, target_table, keys)
