# 03 Silver to Gold

Creates Gold layer aggregated tables from Silver Delta tables.

**Prerequisites:**
- Run `02-onelake-to-silver.ipynb` first to populate Silver tables

**Gold Tables Created:**
- Sales aggregates (minute, daily)
- Inventory position (current snapshot)
- Fulfillment metrics
- Marketing/campaign metrics
- Zone dwell and BLE presence

**Note:** Some Gold tables depend on Silver fact tables that are not yet implemented (stockouts, reorders, promotions, store_ops). These are stubbed and will remain empty until datagen is updated.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from datetime import datetime
import os
import warnings

# =============================================================================
# PARAMETERS - Configure these for your environment
# =============================================================================
# REQUIRED ENVIRONMENT VARIABLES:
#   - SILVER_DB: Database name for Silver layer tables (source)
#   - GOLD_DB: Database name for Gold layer tables (target)
#
# These can be set via:
#   1. Fabric pipeline parameters (when run from a pipeline)
#   2. Environment variables in the Fabric workspace
#   3. Notebook %run magic or widget parameters
#
# When a variable is not set, the code below falls back to these defaults
# (kept for backward compatibility and local testing):
#   SILVER_DB = "ag"
#   GOLD_DB = "au"
# =============================================================================

def get_required_env(var_name, description, default=None):
    """Return the value of a required environment variable.

    A value that is missing, empty, or whitespace-only counts as "not set".
    A blank database name would otherwise pass validation here and only fail
    later as a malformed table identifier such as ``.fact_receipts``.

    Args:
        var_name: Name of the environment variable to read.
        description: Human-readable purpose, included in the error message.
        default: Fallback used when the variable is not set or is blank.

    Returns:
        The environment value when it is non-blank, otherwise ``default``.

    Raises:
        EnvironmentError: If neither the environment nor ``default`` supplies
            a non-blank value.
    """
    value = os.environ.get(var_name)
    if value is None or not value.strip():
        # Unset or blank in the environment: fall back to the caller's default.
        value = default
    if value is None or not str(value).strip():
        raise EnvironmentError(
            f"Required environment variable '{var_name}' is not set.\n"
            f"Description: {description}\n"
            f"Set it via Fabric pipeline parameters or workspace environment variables."
        )
    return value

# Resolve the source (Silver) and target (Gold) database names.
# The default= fallbacks keep existing runs working when the variables are
# unset; remove them in production to make both variables strictly required.
SILVER_DB = get_required_env(
    var_name="SILVER_DB",
    description="Source database containing Silver layer Delta tables",
    default="ag",
)
GOLD_DB = get_required_env(
    var_name="GOLD_DB",
    description="Target database for Gold layer aggregated tables",
    default="au",
)

print(f"Configuration: SILVER_DB={SILVER_DB}, GOLD_DB={GOLD_DB}")

def ensure_database(name):
    """Create the database if it does not exist and confirm it is reachable.

    Args:
        name: Database name, interpolated as-is into the SQL statements.

    Raises:
        RuntimeError: If ``DESCRIBE DATABASE`` fails with an
            ``AnalysisException``; the original exception is chained as the
            cause so the Spark error details stay in the traceback.
    """
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {name}")
    # Validate we can access the database; only the probe itself is guarded.
    try:
        spark.sql(f"DESCRIBE DATABASE {name}")
    except AnalysisException as e:
        raise RuntimeError(f"Cannot access database '{name}': {e}") from e
    print(f"Database '{name}' is ready.")

def save_gold_table(df, table_name, mode="overwrite"):
    """Write ``df`` as a Delta table in the Gold database and report its size.

    Args:
        df: DataFrame to persist.
        table_name: Unqualified table name; ``GOLD_DB`` is prepended.
        mode: Save mode passed to the DataFrame writer (default "overwrite").
    """
    full_name = f"{GOLD_DB}.{table_name}"
    df.write.format("delta").mode(mode).saveAsTable(full_name)

    if str(mode).lower() == "overwrite":
        # After an overwrite the table holds exactly the rows just written.
        # Counting the table avoids calling df.count(), which would re-run
        # the whole upstream transformation a second time.
        row_count = spark.table(full_name).count()
    else:
        # For other modes the table may hold earlier rows as well, so the
        # DataFrame itself is the only source for "rows written".
        row_count = df.count()
    print(f"  Written to {full_name}: {row_count} rows")

def read_silver(table_name):
    """Return the table ``table_name`` from the Silver database as a DataFrame."""
    qualified_name = f"{SILVER_DB}.{table_name}"
    return spark.table(qualified_name)

def table_exists(db, table):
    """Report whether ``db.table`` can be resolved by Spark.

    Returns:
        True when ``spark.table`` resolves the name, False when it raises
        ``AnalysisException``.
    """
    qualified_name = f"{db}.{table}"
    try:
        spark.table(qualified_name)
    except AnalysisException:
        return False
    else:
        return True

def process_gold_table(table_name, transform_fn):
    """Build one Gold table and persist it, skipping it on recoverable errors.

    The transform and the write run inside a single guarded section, so a
    failure in either step is reported and the table is skipped instead of
    aborting the notebook.

    Args:
        table_name: Name of the gold table to create
        transform_fn: Zero-argument callable that returns the DataFrame to save

    Returns:
        True if the table was written, False if it was skipped

    Raises:
        PermissionError: Propagated unchanged, as it signals an
            infrastructure problem rather than missing data
    """
    try:
        print(f"Creating {table_name}...")
        result_df = transform_fn()
        save_gold_table(result_df, table_name)
    except AnalysisException as analysis_err:
        # Typically a missing Silver table or a schema mismatch.
        print(f"  Skipping {table_name}: Source table not available - {analysis_err}")
        return False
    except PermissionError:
        # Must be listed before the generic handler so it is not swallowed.
        raise
    except Exception as unexpected_err:
        # Anything else: report the exception type to aid debugging.
        print(f"  Skipping {table_name}: {type(unexpected_err).__name__}: {unexpected_err}")
        return False
    else:
        return True

# Create the Gold database if needed and confirm it is reachable before the
# cells below start writing tables into it.
ensure_database(GOLD_DB)
print(f"Gold database ready: {GOLD_DB}")

## Sales Aggregates

In [None]:
# gold_sales_minute_store - per-store sales totals at one-minute granularity
def create_sales_minute_store():
    """Aggregate Silver receipts into per-store, per-minute sales figures.

    Returns:
        DataFrame with columns store_id, ts (event_ts truncated to the
        minute), total_sales, receipts and avg_basket, sorted by store_id, ts.
    """
    minute_bucket = F.date_trunc("minute", F.col("event_ts"))
    receipts_by_minute = (
        read_silver("fact_receipts")
        .withColumn("ts", minute_bucket)
        .groupBy("store_id", "ts")
    )
    sales = receipts_by_minute.agg(
        F.sum("total").alias("total_sales"),
        F.count("*").alias("receipts"),
        F.avg("total").alias("avg_basket"),
    )
    return sales.orderBy("store_id", "ts")

process_gold_table("sales_minute_store", create_sales_minute_store)

In [None]:
# gold_top_products_15m - Top products by revenue (tumbling 15-minute windows)
def create_top_products_15m():
    """Aggregate receipt lines into per-product revenue per 15-minute window.

    Returns:
        DataFrame with columns product_id, revenue (sum of ext_price), units
        (sum of quantity) and computed_at (the end of the 15-minute window),
        sorted by revenue descending.
    """
    df_lines = read_silver("fact_receipt_lines")
    return (
        df_lines
        # No slide duration is passed, so the windows do not overlap: each
        # event falls into exactly one 15-minute bucket.
        .withColumn("window_15m", F.window(F.col("event_ts"), "15 minutes"))
        .groupBy("product_id", "window_15m")
        .agg(
            F.sum("ext_price").alias("revenue"),
            F.sum("quantity").alias("units")
        )
        # Keep only the window end as the row's timestamp.
        .withColumn("computed_at", F.col("window_15m.end"))
        .drop("window_15m")
        .orderBy(F.desc("revenue"))
    )

process_gold_table("top_products_15m", create_top_products_15m)

## Inventory Position

In [None]:
# gold_inventory_position_current - latest on-hand balance per store/product
def create_inventory_position_current():
    """Pick the most recent inventory transaction per store and product.

    Returns:
        DataFrame with columns store_id, product_id, on_hand (the balance of
        the newest transaction) and as_of (that transaction's event_ts).
    """
    # NOTE(review): rows sharing the same event_ts within a partition have no
    # tie-breaker here, so which one gets rn == 1 is not fixed by this code.
    newest_first = (
        Window.partitionBy("store_id", "product_id")
        .orderBy(F.col("event_ts").desc())
    )
    ranked = read_silver("fact_store_inventory_txn").withColumn(
        "rn", F.row_number().over(newest_first)
    )
    latest = ranked.where(F.col("rn") == 1)
    return latest.select(
        "store_id",
        "product_id",
        F.col("balance").alias("on_hand"),
        F.col("event_ts").alias("as_of"),
    )

process_gold_table("inventory_position_current", create_inventory_position_current)

In [None]:
# gold_dc_inventory_position_current - latest on-hand balance per DC/product
def create_dc_inventory_position_current():
    """Pick the most recent inventory transaction per DC and product.

    Returns:
        DataFrame with columns dc_id, product_id, on_hand (the balance of the
        newest transaction) and as_of (that transaction's event_ts).
    """
    partition_keys = ["dc_id", "product_id"]
    # Rank transactions newest-first inside each (dc_id, product_id) group.
    recency = Window.partitionBy(*partition_keys).orderBy(F.desc("event_ts"))
    txns = read_silver("fact_dc_inventory_txn")
    latest_txn = (
        txns
        .withColumn("rn", F.row_number().over(recency))
        .filter(F.col("rn") == 1)
    )
    return latest_txn.select(
        *partition_keys,
        F.col("balance").alias("on_hand"),
        F.col("event_ts").alias("as_of"),
    )

process_gold_table("dc_inventory_position_current", create_dc_inventory_position_current)

## Logistics & Fulfillment

In [None]:
# gold_truck_dwell_daily - average truck dwell time per site and day
def create_truck_dwell_daily():
    """Compute daily average dwell minutes and distinct truck counts per site.

    Dwell is the difference between etd and eta in minutes; rows whose dwell
    is null or not strictly positive are excluded.

    Returns:
        DataFrame with columns site, day, avg_dwell_min and trucks.
    """
    # Site label: "STORE_<store_id>" when store_id is present, else "DC_<dc_id>".
    site_label = F.when(
        F.col("store_id").isNotNull(),
        F.concat(F.lit("STORE_"), F.col("store_id")),
    ).otherwise(F.concat(F.lit("DC_"), F.col("dc_id")))
    dwell_minutes = (F.unix_timestamp("etd") - F.unix_timestamp("eta")) / 60
    has_positive_dwell = F.col("dwell_min").isNotNull() & (F.col("dwell_min") > 0)

    moves = (
        read_silver("fact_truck_moves")
        .withColumn("day", F.to_date("event_ts"))
        .withColumn("site", site_label)
        .withColumn("dwell_min", dwell_minutes)
        .filter(has_positive_dwell)
    )
    return moves.groupBy("site", "day").agg(
        F.avg("dwell_min").alias("avg_dwell_min"),
        F.countDistinct("truck_id").alias("trucks"),
    )

process_gold_table("truck_dwell_daily", create_truck_dwell_daily)

In [None]:
# gold_online_sales_daily - one row per day of online order totals
def create_online_sales_daily():
    """Aggregate online order headers by calendar day.

    Returns:
        DataFrame with columns day, orders, subtotal, tax, total and
        avg_order_value, sorted by day.
    """
    orders_by_day = (
        read_silver("fact_online_order_headers")
        .withColumn("day", F.to_date("event_ts"))
        .groupBy("day")
    )
    daily_metrics = [
        F.count("*").alias("orders"),
        F.sum("subtotal").alias("subtotal"),
        F.sum("tax").alias("tax"),
        F.sum("total").alias("total"),
        F.avg("total").alias("avg_order_value"),
    ]
    return orders_by_day.agg(*daily_metrics).orderBy("day")

process_gold_table("online_sales_daily", create_online_sales_daily)

In [None]:
# gold_fulfillment_daily - daily order and unit counts by fulfillment mode/status
def create_fulfillment_daily():
    """Aggregate online order lines by day, fulfillment mode and status.

    Lines with none of the three timestamps set are dropped.

    Returns:
        DataFrame with columns day, fulfillment_mode, fulfillment_status,
        orders (distinct order_id) and units (sum of quantity), sorted by day.
    """
    # First non-null timestamp, checked in this order: shipped, picked, delivered.
    activity_ts = F.coalesce("shipped_ts", "picked_ts", "delivered_ts")
    dated_lines = (
        read_silver("fact_online_order_lines")
        .withColumn("day", F.to_date(activity_ts))
        .where(F.col("day").isNotNull())
    )
    grouped = dated_lines.groupBy("day", "fulfillment_mode", "fulfillment_status")
    summary = grouped.agg(
        F.countDistinct("order_id").alias("orders"),
        F.sum("quantity").alias("units"),
    )
    return summary.orderBy("day")

process_gold_table("fulfillment_daily", create_fulfillment_daily)

## Customer & Zone Analytics

In [None]:
# gold_zone_dwell_minute - foot-traffic dwell per store zone and minute
def create_zone_dwell_minute():
    """Aggregate foot-traffic events per store, zone and minute.

    Returns:
        DataFrame with columns store_id, zone, ts (event_ts truncated to the
        minute), avg_dwell (mean of dwell_seconds) and customers (sum of the
        count column), sorted by store_id, zone, ts.
    """
    group_keys = ["store_id", "zone", "ts"]
    traffic = read_silver("fact_foot_traffic").withColumn(
        "ts", F.date_trunc("minute", F.col("event_ts"))
    )
    per_zone_minute = traffic.groupBy(*group_keys).agg(
        F.avg("dwell_seconds").alias("avg_dwell"),
        F.sum("count").alias("customers"),
    )
    return per_zone_minute.orderBy(*group_keys)

process_gold_table("zone_dwell_minute", create_zone_dwell_minute)

In [None]:
# gold_ble_presence_minute - distinct BLE devices seen per store and minute
def create_ble_presence_minute():
    """Count distinct BLE device ids per store and minute.

    Returns:
        DataFrame with columns store_id, ts (event_ts truncated to the
        minute) and devices (distinct customer_ble_id), sorted by
        store_id, ts.
    """
    group_keys = ["store_id", "ts"]
    pings = read_silver("fact_ble_pings").withColumn(
        "ts", F.date_trunc("minute", F.col("event_ts"))
    )
    device_counts = pings.groupBy(*group_keys).agg(
        F.countDistinct("customer_ble_id").alias("devices")
    )
    return device_counts.orderBy(*group_keys)

process_gold_table("ble_presence_minute", create_ble_presence_minute)

## Marketing & Campaigns

In [None]:
# gold_marketing_cost_daily - impressions and spend per campaign and day
def create_marketing_cost_daily():
    """Aggregate marketing events per campaign and calendar day.

    Returns:
        DataFrame with columns campaign_id, day, impressions (row count) and
        cost (sum of cost), sorted by campaign_id, day.
    """
    group_keys = ["campaign_id", "day"]
    events = read_silver("fact_marketing").withColumn("day", F.to_date("event_ts"))
    daily = events.groupBy(*group_keys).agg(
        F.count("*").alias("impressions"),
        F.sum("cost").alias("cost"),
    )
    return daily.orderBy(*group_keys)

process_gold_table("marketing_cost_daily", create_marketing_cost_daily)

In [None]:
# gold_campaign_revenue_daily - Campaign revenue with conversion tracking
# Note: This requires joining marketing impressions with receipts via customer_id
# which is a complex attribution model. Simplified version here.
def create_campaign_revenue_daily():
    """Attribute same-day receipts to campaigns, per campaign and day.

    Attribution rule: a receipt counts towards a campaign when the same
    customer_id has a marketing event for that campaign on the same calendar
    day. Marketing events are first reduced to distinct
    (campaign_id, day, customer_id) rows so a receipt is attributed at most
    once per campaign and day.

    Returns:
        DataFrame with columns campaign_id, day, impressions (number of
        distinct (campaign_id, day, customer_id) rows), conversions (matched
        receipts with a non-null total), and revenue (sum of matched receipt
        totals, null when nothing matched), sorted by campaign_id, day.
    """
    df_marketing = read_silver("fact_marketing")
    df_receipts = read_silver("fact_receipts")

    # Simple attribution: count conversions where customer had impression same day
    df_mkt_day = df_marketing.withColumn("day", F.to_date("event_ts")).select(
        "campaign_id", "day", "customer_id"
    ).distinct()

    df_receipts_day = df_receipts.withColumn("day", F.to_date("event_ts")).select(
        "customer_id", "day", "total"
    )

    # The left join repeats an impression row once per matching receipt, so
    # counting joined rows would inflate impressions for customers with
    # several receipts on the same day. Count impression rows instead:
    # distinct non-null customers, plus one for the single null-customer row
    # a group can contain after the distinct() above (countDistinct skips
    # nulls).
    has_null_customer = F.max(F.col("customer_id").isNull().cast("int"))
    impression_count = F.countDistinct("customer_id") + has_null_customer

    return (
        df_mkt_day
        .join(df_receipts_day, ["customer_id", "day"], "left")
        .groupBy("campaign_id", "day")
        .agg(
            impression_count.alias("impressions"),
            F.count("total").alias("conversions"),
            F.sum("total").alias("revenue")
        )
        .orderBy("campaign_id", "day")
    )

process_gold_table("campaign_revenue_daily", create_campaign_revenue_daily)

## Tender Mix (Payments)

**Note:** This uses `payment_method` from `fact_receipts`. 
A dedicated `fact_payments` table is planned (see GitHub issue #7).

In [None]:
# gold_tender_mix_daily - transaction count and amount per day and payment method
def create_tender_mix_daily():
    """Aggregate receipts per calendar day and payment method.

    Returns:
        DataFrame with columns day, payment_method, transactions (row count)
        and total_amount (sum of total), sorted by day, payment_method.
    """
    group_keys = ["day", "payment_method"]
    receipts = read_silver("fact_receipts").withColumn("day", F.to_date("event_ts"))
    tender_mix = receipts.groupBy(*group_keys).agg(
        F.count("*").alias("transactions"),
        F.sum("total").alias("total_amount"),
    )
    return tender_mix.orderBy(*group_keys)

process_gold_table("tender_mix_daily", create_tender_mix_daily)