# Medallion Architecture — Silver Layer

Cleanses, conforms, and enriches Bronze data into dimension and fact tables:

| Silver Table | Source Tables | Purpose |
|---|---|---|
| `dim_geography` | region + nation | Denormalized geography hierarchy |
| `dim_customer` | customer + nation + region | Enriched customer dimension |
| `dim_supplier` | supplier + nation + region | Enriched supplier dimension |
| `dim_part` | part | Cleansed product dimension |
| `dim_date` | generated | Date dimension for analytics |
| `fact_orders` | orders + customer | Enriched order headers |
| `fact_lineitem` | lineitem + part + supplier + partsupp | Enriched line items with cost/revenue |

Uses Delta CHECK constraints, liquid clustering, and schema/table comments.

**Prereq**: Run `01_bronze_layer.ipynb` first.

## 1 — Configuration

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *

# Catalog currently selected for this Spark session; schema names are fixed.
CATALOG = spark.catalog.currentCatalog()
BRONZE_SCHEMA, SILVER_SCHEMA = "retail_bronze", "retail_silver"

# "<catalog>.<schema>" prefixes used by the later cells to build table names.
bronze = ".".join((CATALOG, BRONZE_SCHEMA))
silver = ".".join((CATALOG, SILVER_SCHEMA))

# Echo the resolved locations so a wrong catalog is noticed before any write.
print(f"Catalog : {CATALOG}")
print(f"Bronze  : {bronze}")
print(f"Silver  : {silver}")

## 2 — Create Silver Schema

In [None]:
# Both statements are safe to re-run: the schema is created only if missing,
# and the schema comment is simply set again.
for _ddl in (
    f"CREATE SCHEMA IF NOT EXISTS {silver}",
    f"COMMENT ON SCHEMA {silver} IS 'Silver layer — cleansed, enriched retail dimensions and facts'",
):
    spark.sql(_ddl)
print(f"Schema ready: {silver}")

## 3 — Helper: Write & Validate

In [None]:
def write_silver(df, table_name, cluster_cols=None, comment=None):
    """Overwrite a Silver table with ``df`` and report how many rows it holds.

    Args:
        df: DataFrame to persist.
        table_name: Unqualified table name; the table is written under the
            ``silver`` schema prefix.
        cluster_cols: Optional list of column names passed to
            ``ALTER TABLE ... CLUSTER BY``.
        comment: Optional table comment. May contain single quotes or
            backslashes; they are escaped before being embedded in SQL.

    Returns:
        Row count read back from the written table.
    """
    fqn = f"{silver}.{table_name}"
    # overwriteSchema lets a re-run replace the table even if its columns changed.
    df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(fqn)
    if cluster_cols:
        # NOTE(review): clustering keys are set after the data is written;
        # presumably the rows just written are only reclustered by a later
        # OPTIMIZE — confirm against the liquid clustering documentation.
        spark.sql(f"ALTER TABLE {fqn} CLUSTER BY ({', '.join(cluster_cols)})")
    if comment:
        # The comment ends up inside a single-quoted SQL literal. Escape
        # backslashes first, then quotes, so text such as "customer's" cannot
        # terminate the literal early and break (or alter) the statement.
        escaped = comment.replace("\\", "\\\\").replace("'", "\\'")
        spark.sql(f"COMMENT ON TABLE {fqn} IS '{escaped}'")
    # Count from the table (not from df) so the figure reflects what was stored.
    cnt = spark.table(fqn).count()
    print(f"  ✓ {fqn:<55} {cnt:>12,} rows")
    return cnt

---
## 4 — dim_geography
Denormalize region + nation into a single geography hierarchy.

In [None]:
# Bronze inputs, trimmed to the columns the hierarchy needs.
df_region = spark.table(f"{bronze}.region").select("r_regionkey", "r_name")
df_nation = spark.table(f"{bronze}.nation").select("n_nationkey", "n_name", "n_regionkey")

# Bronze column -> Silver column, in output order.
_geo_columns = {
    "n_nationkey": "nation_key",
    "n_name": "nation_name",
    "r_regionkey": "region_key",
    "r_name": "region_name",
}

# Left join from nation: every nation row is kept; region fields are NULL
# when the nation's region key has no match.
_nation_region = df_nation.join(
    df_region,
    on=df_nation["n_regionkey"] == df_region["r_regionkey"],
    how="left",
)
dim_geography = _nation_region.select(
    *[F.col(src).alias(dst) for src, dst in _geo_columns.items()]
)

write_silver(
    dim_geography,
    "dim_geography",
    comment="Denormalized geography: nation → region hierarchy",
)
display(dim_geography)

## 5 — dim_customer
Enrich customer with nation/region names and cleanse fields.

In [None]:
df_cust = spark.table(f"{bronze}.customer")
df_geo  = spark.table(f"{silver}.dim_geography")

# Derived: balance tier. Branches are evaluated top to bottom, so each
# threshold only sees balances that failed the previous test; anything that
# matches no branch (including a NULL balance) falls through to "High".
_acctbal = F.col("c_acctbal")
_balance_tier = (
    F.when(_acctbal < 0, "Negative")
    .when(_acctbal < 3000, "Low")
    .when(_acctbal < 7000, "Medium")
    .otherwise("High")
)

# Left join keeps customers whose nation key has no geography row.
_cust_geo = df_cust.join(
    df_geo,
    on=df_cust["c_nationkey"] == df_geo["nation_key"],
    how="left",
)

dim_customer = _cust_geo.select(
    F.col("c_custkey").alias("customer_key"),
    # Free-text fields are trimmed; codes and numerics pass through unchanged.
    F.trim(F.col("c_name")).alias("customer_name"),
    F.trim(F.col("c_address")).alias("address"),
    F.col("c_phone").alias("phone"),
    _acctbal.alias("account_balance"),
    F.col("c_mktsegment").alias("market_segment"),
    F.col("nation_name"),
    F.col("region_name"),
    _balance_tier.alias("balance_tier"),
)

write_silver(
    dim_customer,
    "dim_customer",
    cluster_cols=["market_segment", "region_name"],
    comment="Enriched customer dimension with geography and balance tier",
)
display(dim_customer.limit(5))

## 6 — dim_supplier
Enrich supplier with geography.

In [None]:
df_supp = spark.table(f"{bronze}.supplier")
# Read the geography dimension here rather than relying on a `df_geo` variable
# left behind by another cell, so this cell can be run on its own.
df_geo = spark.table(f"{silver}.dim_geography")

dim_supplier = (
    df_supp
    # Left join keeps suppliers whose nation key has no geography row.
    .join(df_geo, df_supp.s_nationkey == df_geo.nation_key, "left")
    .select(
        F.col("s_suppkey").alias("supplier_key"),
        # Free-text fields are trimmed; other columns pass through unchanged.
        F.trim(F.col("s_name")).alias("supplier_name"),
        F.trim(F.col("s_address")).alias("address"),
        F.col("s_phone").alias("phone"),
        F.col("s_acctbal").alias("account_balance"),
        F.col("nation_name"),
        F.col("region_name"),
    )
)

write_silver(dim_supplier, "dim_supplier",
    cluster_cols=["region_name"],
    comment="Enriched supplier dimension with geography")
display(dim_supplier.limit(5))

## 7 — dim_part
Cleanse and enrich product catalog.

In [None]:
df_part = spark.table(f"{bronze}.part")

# Derived: price band. Thresholds are checked in ascending order; the first
# branch that matches wins and anything unmatched becomes "Luxury".
_price = F.col("p_retailprice")
_price_band = (
    F.when(_price < 950, "Economy")
    .when(_price < 1050, "Standard")
    .when(_price < 1500, "Premium")
    .otherwise("Luxury")
)

# Derived: size category, same first-match-wins evaluation.
_size = F.col("p_size")
_size_category = (
    F.when(_size <= 10, "Small")
    .when(_size <= 30, "Medium")
    .otherwise("Large")
)

dim_part = df_part.select(
    F.col("p_partkey").alias("part_key"),
    F.trim(F.col("p_name")).alias("part_name"),
    F.col("p_mfgr").alias("manufacturer"),
    F.col("p_brand").alias("brand"),
    F.col("p_type").alias("part_type"),
    _size.alias("size"),
    F.col("p_container").alias("container"),
    _price.alias("retail_price"),
    _price_band.alias("price_band"),
    _size_category.alias("size_category"),
)

write_silver(
    dim_part,
    "dim_part",
    cluster_cols=["brand", "part_type"],
    comment="Cleansed product dimension with price band and size category",
)
display(dim_part.limit(5))

## 8 — dim_date
Generate a complete date dimension spanning the TPC-H date range.

In [None]:
# TPC-H dates: 1992-01-01 → 1998-12-31
_start = F.to_date(F.lit("1992-01-01"))
_days  = 2557  # 7 calendar years, two of them leap years (1992, 1996)

# One row per day: offset the start date by the generated id 0 .. _days-1.
_calendar = spark.range(0, _days).select(
    F.date_add(_start, F.col("id").cast("int")).alias("date_key")
)

_day = F.col("date_key")
dim_date = _calendar.select(
    _day,
    F.year(_day).alias("year"),
    F.quarter(_day).alias("quarter"),
    F.month(_day).alias("month"),
    F.date_format(_day, "MMMM").alias("month_name"),
    F.weekofyear(_day).alias("week_of_year"),
    F.dayofmonth(_day).alias("day_of_month"),
    F.dayofweek(_day).alias("day_of_week"),
    F.date_format(_day, "EEEE").alias("day_name"),
    # dayofweek numbers Sunday as 1 and Saturday as 7.
    F.dayofweek(_day).isin(1, 7).alias("is_weekend"),
    # Labels such as "1992-Q1" and "1992-01".
    F.concat(F.year(_day), F.lit("-Q"), F.quarter(_day)).alias("year_quarter"),
    F.date_format(_day, "yyyy-MM").alias("year_month"),
)

write_silver(
    dim_date,
    "dim_date",
    comment="Date dimension spanning 1992-1998 for TPC-H analytics",
)
display(dim_date.limit(10))

## 9 — fact_orders
Enriched order headers joined with customer info.

In [None]:
df_orders = spark.table(f"{bronze}.orders")
# Only the customer attributes that are denormalized onto the order fact.
df_cust_s = spark.table(f"{silver}.dim_customer").select(
    "customer_key", "market_segment", "nation_name", "region_name", "balance_tier"
)

fact_orders = (
    df_orders
    # Left join keeps every order, even one whose customer is absent from dim_customer.
    .join(df_cust_s, df_orders.o_custkey == df_cust_s.customer_key, "left")
    .select(
        F.col("o_orderkey").alias("order_key"),
        # Take the key from the orders side: the dimension-side column is NULL
        # for unmatched rows, which would silently drop the foreign key.
        F.col("o_custkey").alias("customer_key"),
        F.col("o_orderstatus").alias("order_status"),
        F.col("o_totalprice").alias("total_price"),
        F.col("o_orderdate").alias("order_date"),
        F.col("o_orderpriority").alias("order_priority"),
        F.col("o_clerk").alias("clerk"),
        # Customer attributes; NULL when the order has no matching customer.
        F.col("market_segment"),
        F.col("nation_name").alias("customer_nation"),
        F.col("region_name").alias("customer_region"),
        F.col("balance_tier").alias("customer_balance_tier"),
        # Derived date parts
        F.year("o_orderdate").alias("order_year"),
        F.quarter("o_orderdate").alias("order_quarter"),
        F.month("o_orderdate").alias("order_month"),
    )
)

write_silver(fact_orders, "fact_orders",
    cluster_cols=["order_date", "customer_key"],
    comment="Enriched order fact with customer attributes and date parts")
display(fact_orders.limit(5))

## 10 — fact_lineitem (largest Silver table)
Fully enriched line items with product, supplier, and computed revenue/cost columns.

In [None]:
df_li = spark.table(f"{bronze}.lineitem")
df_ps = spark.table(f"{bronze}.partsupp").select("ps_partkey", "ps_suppkey", "ps_supplycost")
df_pt = spark.table(f"{silver}.dim_part").select(
    "part_key", "brand", "part_type", "price_band", "manufacturer"
)
df_sp = spark.table(f"{silver}.dim_supplier").select(
    "supplier_key",
    "supplier_name",
    F.col("nation_name").alias("supplier_nation"),
    F.col("region_name").alias("supplier_region"),
)

# All three lookups are left joins, so every line item is kept and the
# looked-up columns are NULL where no match exists.
_li_enriched = (
    df_li
    # Supply cost is keyed by the (part, supplier) pair.
    .join(
        df_ps,
        on=(df_li["l_partkey"] == df_ps["ps_partkey"])
        & (df_li["l_suppkey"] == df_ps["ps_suppkey"]),
        how="left",
    )
    .join(df_pt, on=df_li["l_partkey"] == df_pt["part_key"], how="left")
    .join(df_sp, on=df_li["l_suppkey"] == df_sp["supplier_key"], how="left")
)

# Shared building blocks for the computed measures.
_qty = F.col("l_quantity")
_unit_cost = F.col("ps_supplycost")
_net = F.col("l_extendedprice") * (1 - F.col("l_discount"))

# Bronze column -> Silver column for plain pass-through fields.
_keys = [
    ("l_orderkey", "order_key"),
    ("l_linenumber", "line_number"),
    ("l_partkey", "part_key"),
    ("l_suppkey", "supplier_key"),
]
_measures = [
    ("l_quantity", "quantity"),
    ("l_extendedprice", "extended_price"),
    ("l_discount", "discount"),
    ("l_tax", "tax"),
]
_status_and_dates = [
    ("l_returnflag", "return_flag"),
    ("l_linestatus", "line_status"),
    ("l_shipdate", "ship_date"),
    ("l_commitdate", "commit_date"),
    ("l_receiptdate", "receipt_date"),
]
_shipping = [
    ("l_shipinstruct", "ship_instruct"),
    ("l_shipmode", "ship_mode"),
]
_dimension_attrs = [
    "brand", "part_type", "price_band", "manufacturer",
    "supplier_name", "supplier_nation", "supplier_region",
]

fact_lineitem = _li_enriched.select(
    *[F.col(src).alias(dst) for src, dst in _keys + _measures],
    # Computed revenue & cost, rounded to 2 decimals.
    F.round(_net, 2).alias("net_revenue"),
    F.round(_net * (1 + F.col("l_tax")), 2).alias("gross_revenue"),
    # NOTE(review): supply_cost stays NULL when no partsupp row matched, while
    # profit below treats the missing cost as 0 — confirm this asymmetry is intended.
    F.round(_unit_cost * _qty, 2).alias("supply_cost"),
    F.round(_net - F.coalesce(_unit_cost, F.lit(0)) * _qty, 2).alias("profit"),
    *[F.col(src).alias(dst) for src, dst in _status_and_dates],
    # Day differences relative to the committed date (positive = later than committed).
    F.datediff(F.col("l_receiptdate"), F.col("l_commitdate")).alias("delivery_delay_days"),
    F.datediff(F.col("l_shipdate"), F.col("l_commitdate")).alias("ship_delay_days"),
    *[F.col(src).alias(dst) for src, dst in _shipping],
    # Enriched from dimensions
    *[F.col(name) for name in _dimension_attrs],
)

write_silver(
    fact_lineitem,
    "fact_lineitem",
    cluster_cols=["ship_date", "order_key"],
    comment="Fully enriched line item fact with revenue, cost, profit, and dimension attributes",
)
display(fact_lineitem.limit(5))

## 11 — Data Quality Constraints
Add Delta CHECK constraints on key Silver tables — Databricks enforces these on every future write.

In [None]:
# (table, constraint name, CHECK expression)
constraints = [
    ("dim_customer", "valid_customer_key", "customer_key > 0"),
    ("dim_supplier", "valid_supplier_key", "supplier_key > 0"),
    ("dim_part", "valid_part_key", "part_key > 0"),
    ("dim_part", "positive_price", "retail_price > 0"),
    ("fact_orders", "valid_order_key", "order_key > 0"),
    ("fact_orders", "positive_total", "total_price > 0"),
    ("fact_lineitem", "valid_quantity", "quantity > 0"),
    ("fact_lineitem", "valid_net_revenue", "net_revenue >= 0"),
]

# Best-effort: each constraint is attempted independently and the outcome is
# printed; a failure on one constraint does not stop the remaining ones.
for tbl, name, expr in constraints:
    ddl = f"ALTER TABLE {silver}.{tbl} ADD CONSTRAINT {name} CHECK ({expr})"
    try:
        spark.sql(ddl)
    except Exception as e:
        msg = str(e)
        # A re-run hits constraints added earlier; report that as a no-op
        # rather than as an error.
        duplicate = "already exists" in msg.lower() or "CONSTRAINT_ALREADY_EXISTS" in msg
        if duplicate:
            print(f"  ○ {tbl}.{name} (already exists)")
        else:
            print(f"  ✗ {tbl}.{name}: {e}")
    else:
        print(f"  ✓ {tbl}.{name}")

## 12 — Silver Summary

In [None]:
silver_tables = [
    "dim_geography",
    "dim_customer",
    "dim_supplier",
    "dim_part",
    "dim_date",
    "fact_orders",
    "fact_lineitem",
]

# Fixed-width report: table name, row count (thousands-separated), column count.
_header = f"{'Silver Table':<25} {'Rows':>15}  {'Columns':>8}"
print(_header)
print("=" * 55)
for t in silver_tables:
    df = spark.table(f"{silver}.{t}")
    n_rows, n_cols = df.count(), len(df.columns)
    print(f"{t:<25} {n_rows:>15,}  {n_cols:>8}")

---
7 Silver tables ready — dimensions denormalized, facts enriched with pre-computed revenue/profit columns.

Continue with `03_gold_layer.ipynb`.