# 03 Silver to Gold

Creates Gold layer aggregated tables from Silver Delta tables.

**Prerequisites:**
- Run `02-onelake-to-silver.ipynb` first to populate Silver tables

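**Gold Tables Created:**
- Sales aggregates (per-minute store sales, 15-minute product revenue, daily online sales)
- Inventory position (current snapshot for stores and DCs)
- Logistics and fulfillment metrics (daily truck dwell, daily fulfillment)
- Marketing/campaign metrics
- Zone dwell and BLE presence
- Tender mix (payment methods by day)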

**Note:** Some tables depend on fact tables not yet implemented (stockouts, reorders, promotions, store_ops). These are stubbed but will be empty until datagen is updated.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from datetime import datetime

# Database names
# SILVER_DB is the source database that read_silver() reads from;
# GOLD_DB is the target database that save_gold_table() writes into.
SILVER_DB = "ag"
GOLD_DB = "ag_gold"

def ensure_database(name):
    """Create the database `name` unless it already exists."""
    create_stmt = "CREATE DATABASE IF NOT EXISTS {}".format(name)
    spark.sql(create_stmt)

def save_gold_table(df, table_name, mode="overwrite"):
    """Write `df` to the Gold database as a Delta table and print its row count.

    Args:
        df: DataFrame to persist.
        table_name: Unqualified table name; it is prefixed with GOLD_DB.
        mode: Save mode handed to the DataFrame writer (default "overwrite").
    """
    full_name = f"{GOLD_DB}.{table_name}"
    df.write.format("delta").mode(mode).saveAsTable(full_name)

    if str(mode).lower() == "overwrite":
        # After an overwrite the table holds exactly the rows just written, so
        # count the stored table. Calling df.count() here would be a second
        # action that re-runs the entire upstream read/aggregate plan.
        row_count = spark.table(full_name).count()
    else:
        # Other modes can leave earlier rows in the table, so the rows from
        # this write can only be reported by counting the DataFrame itself.
        row_count = df.count()

    print(f"  Written to {full_name}: {row_count} rows")

def read_silver(table_name):
    """Return the table `table_name` from the Silver database as a DataFrame."""
    qualified_name = "{}.{}".format(SILVER_DB, table_name)
    return spark.table(qualified_name)

def table_exists(db, table):
    """Return True if `db.table` can be resolved by Spark, otherwise False.

    Args:
        db: Database name.
        table: Unqualified table name.

    Returns:
        True when spark.table() resolves the qualified name; False when the
        lookup raises any ordinary exception.
    """
    try:
        spark.table(f"{db}.{table}")
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as "missing".
        return False
    return True

# Make sure the Gold database exists before any later cell writes into it.
ensure_database(GOLD_DB)
print(f"Gold database ready: {GOLD_DB}")

## Sales Aggregates

In [None]:
# gold_sales_minute_store - per-store sales rolled up into one-minute buckets
print("Creating gold_sales_minute_store...")
try:
    receipts = read_silver("fact_receipts")

    # Truncate every receipt timestamp to the start of its minute.
    minute_bucket = F.date_trunc("minute", F.col("event_ts"))
    grouped_by_minute = receipts.withColumn("ts", minute_bucket).groupBy("store_id", "ts")

    # One row per (store, minute): revenue, receipt count and mean basket value.
    sales_by_minute = grouped_by_minute.agg(
        F.sum("total").alias("total_sales"),
        F.count("*").alias("receipts"),
        F.avg("total").alias("avg_basket"),
    ).orderBy("store_id", "ts")

    save_gold_table(sales_by_minute, "gold_sales_minute_store")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

In [None]:
# gold_top_products_15m - Product revenue and units per 15-minute window
# F.window() is called with a window duration only, which produces tumbling
# (non-overlapping) windows: each receipt line falls into exactly one window.
print("Creating gold_top_products_15m...")
try:
    df_lines = read_silver("fact_receipt_lines")

    df_top_products = (
        df_lines
        # No separate per-minute "ts" column is derived here: the groupBy
        # below keeps only product_id and the window, so it would be discarded.
        .withColumn("window_15m", F.window(F.col("event_ts"), "15 minutes"))
        .groupBy("product_id", "window_15m")
        .agg(
            F.sum("ext_price").alias("revenue"),
            F.sum("quantity").alias("units")
        )
        # Stamp each row with the end of its window, then drop the struct.
        .withColumn("computed_at", F.col("window_15m.end"))
        .drop("window_15m")
        .orderBy(F.desc("revenue"))
    )

    save_gold_table(df_top_products, "gold_top_products_15m")
except Exception as e:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {e}")

## Inventory Position

In [None]:
# gold_inventory_position_current - latest on-hand balance per store/product
print("Creating gold_inventory_position_current...")
try:
    store_txns = read_silver("fact_store_inventory_txn")

    # Number each (store, product) transaction newest-first; row 1 is the latest.
    # NOTE(review): rows sharing an identical event_ts are ordered arbitrarily
    # by row_number() - confirm timestamps are unique per store/product.
    newest_first = Window.partitionBy("store_id", "product_id").orderBy(
        F.col("event_ts").desc()
    )
    ranked_txns = store_txns.withColumn("rn", F.row_number().over(newest_first))
    latest_txns = ranked_txns.where(F.col("rn") == 1)

    current_position = latest_txns.select(
        "store_id",
        "product_id",
        F.col("balance").alias("on_hand"),
        F.col("event_ts").alias("as_of"),
    )

    save_gold_table(current_position, "gold_inventory_position_current")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

In [None]:
# gold_dc_inventory_position_current - latest on-hand balance per DC/product
print("Creating gold_dc_inventory_position_current...")
try:
    dc_txns = read_silver("fact_dc_inventory_txn")

    # Number each (DC, product) transaction newest-first; row 1 is the latest.
    # NOTE(review): rows sharing an identical event_ts are ordered arbitrarily
    # by row_number() - confirm timestamps are unique per DC/product.
    newest_first = Window.partitionBy("dc_id", "product_id").orderBy(
        F.col("event_ts").desc()
    )
    ranked_txns = dc_txns.withColumn("rn", F.row_number().over(newest_first))
    latest_txns = ranked_txns.where(F.col("rn") == 1)

    dc_position = latest_txns.select(
        "dc_id",
        "product_id",
        F.col("balance").alias("on_hand"),
        F.col("event_ts").alias("as_of"),
    )

    save_gold_table(dc_position, "gold_dc_inventory_position_current")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

## Logistics & Fulfillment

In [None]:
# gold_truck_dwell_daily - average truck dwell per site per day
print("Creating gold_truck_dwell_daily...")
try:
    truck_moves = read_silver("fact_truck_moves")

    # Site label: rows with a store_id become "STORE_<id>", all others "DC_<id>".
    site_label = (
        F.when(F.col("store_id").isNotNull(), F.concat(F.lit("STORE_"), F.col("store_id")))
        .otherwise(F.concat(F.lit("DC_"), F.col("dc_id")))
    )
    # Dwell in minutes: seconds between eta and etd, divided by 60.
    dwell_minutes = (F.unix_timestamp("etd") - F.unix_timestamp("eta")) / 60

    moves_with_dwell = (
        truck_moves
        .withColumn("day", F.to_date("event_ts"))
        .withColumn("site", site_label)
        .withColumn("dwell_min", dwell_minutes)
    )

    # Keep only rows whose dwell is known and strictly positive.
    has_positive_dwell = F.col("dwell_min").isNotNull() & (F.col("dwell_min") > 0)
    valid_moves = moves_with_dwell.where(has_positive_dwell)

    dwell_by_site_day = valid_moves.groupBy("site", "day").agg(
        F.avg("dwell_min").alias("avg_dwell_min"),
        F.countDistinct("truck_id").alias("trucks"),
    )

    save_gold_table(dwell_by_site_day, "gold_truck_dwell_daily")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

In [None]:
# gold_online_sales_daily - daily roll-up of online order headers
print("Creating gold_online_sales_daily...")
try:
    online_orders = read_silver("fact_online_order_headers")

    # Per-day metrics: order count, money totals and mean order value.
    daily_metrics = [
        F.count("*").alias("orders"),
        F.sum("subtotal").alias("subtotal"),
        F.sum("tax").alias("tax"),
        F.sum("total").alias("total"),
        F.avg("total").alias("avg_order_value"),
    ]

    orders_with_day = online_orders.withColumn("day", F.to_date("event_ts"))
    online_by_day = orders_with_day.groupBy("day").agg(*daily_metrics).orderBy("day")

    save_gold_table(online_by_day, "gold_online_sales_daily")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

In [None]:
# gold_fulfillment_daily - daily fulfillment volume by mode and status
print("Creating gold_fulfillment_daily...")
try:
    order_lines = read_silver("fact_online_order_lines")

    # The reporting day is the date of the first non-null timestamp among
    # shipped_ts, picked_ts and delivered_ts, checked in that order.
    fulfillment_day = F.to_date(F.coalesce("shipped_ts", "picked_ts", "delivered_ts"))

    lines_with_day = order_lines.withColumn("day", fulfillment_day)
    # Lines with none of the three timestamps have no day and are excluded.
    dated_lines = lines_with_day.where(F.col("day").isNotNull())

    fulfillment_by_day = (
        dated_lines
        .groupBy("day", "fulfillment_mode", "fulfillment_status")
        .agg(
            F.countDistinct("order_id").alias("orders"),
            F.sum("quantity").alias("units"),
        )
        .orderBy("day")
    )

    save_gold_table(fulfillment_by_day, "gold_fulfillment_daily")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

## Customer & Zone Analytics

In [None]:
# gold_zone_dwell_minute - average dwell and customer count per zone per minute
print("Creating gold_zone_dwell_minute...")
try:
    foot_traffic = read_silver("fact_foot_traffic")

    # Truncate every event timestamp to the start of its minute.
    minute_bucket = F.date_trunc("minute", F.col("event_ts"))
    zone_keys = ["store_id", "zone", "ts"]

    zone_dwell_by_minute = (
        foot_traffic
        .withColumn("ts", minute_bucket)
        .groupBy(*zone_keys)
        .agg(
            F.avg("dwell_seconds").alias("avg_dwell"),
            F.sum("count").alias("customers"),
        )
        .orderBy(*zone_keys)
    )

    save_gold_table(zone_dwell_by_minute, "gold_zone_dwell_minute")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

In [None]:
# gold_ble_presence_minute - distinct BLE devices seen per store per minute
print("Creating gold_ble_presence_minute...")
try:
    ble_pings = read_silver("fact_ble_pings")

    # Truncate every ping timestamp to the start of its minute.
    pings_by_minute = ble_pings.withColumn(
        "ts", F.date_trunc("minute", F.col("event_ts"))
    )

    # Count each customer_ble_id once per (store, minute).
    device_presence = (
        pings_by_minute
        .groupBy("store_id", "ts")
        .agg(F.countDistinct("customer_ble_id").alias("devices"))
        .orderBy("store_id", "ts")
    )

    save_gold_table(device_presence, "gold_ble_presence_minute")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

## Marketing & Campaigns

In [None]:
# gold_marketing_cost_daily - impressions and cost per campaign per day
print("Creating gold_marketing_cost_daily...")
try:
    marketing_events = read_silver("fact_marketing")

    campaign_day_keys = ["campaign_id", "day"]
    events_with_day = marketing_events.withColumn("day", F.to_date("event_ts"))

    # Every marketing row counts as one impression; cost is summed per group.
    marketing_by_day = (
        events_with_day
        .groupBy(*campaign_day_keys)
        .agg(
            F.count("*").alias("impressions"),
            F.sum("cost").alias("cost"),
        )
        .orderBy(*campaign_day_keys)
    )

    save_gold_table(marketing_by_day, "gold_marketing_cost_daily")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

In [None]:
# gold_campaign_revenue_daily - Campaign revenue with conversion tracking
# Note: Proper attribution requires joining marketing impressions with receipts
# via customer_id using a real attribution model. Simplified version here: a
# receipt is credited to a campaign when the same customer has a marketing row
# for that campaign on the same calendar day.
print("Creating gold_campaign_revenue_daily...")
try:
    df_marketing = read_silver("fact_marketing")
    df_receipts = read_silver("fact_receipts")

    # One row per (campaign, day, customer) exposed to the campaign that day.
    df_mkt_day = df_marketing.withColumn("day", F.to_date("event_ts")).select(
        "campaign_id", "day", "customer_id"
    ).distinct()

    # Collapse receipts to one row per (customer, day) BEFORE joining. Joining
    # the raw receipts duplicated each exposure row once per receipt, so
    # count("*") over-reported "impressions" for customers who bought more
    # than once on the same day.
    df_receipts_day = (
        df_receipts
        .withColumn("day", F.to_date("event_ts"))
        .groupBy("customer_id", "day")
        .agg(
            F.count("total").alias("receipt_count"),
            F.sum("total").alias("receipt_total")
        )
    )

    df_attributed = (
        df_mkt_day
        .join(df_receipts_day, ["customer_id", "day"], "left")
        .groupBy("campaign_id", "day")
        .agg(
            # The join now yields exactly one row per exposed customer.
            F.count("*").alias("impressions"),
            # sum() is NULL when no exposed customer purchased; report 0, as
            # count("total") did before.
            F.coalesce(F.sum("receipt_count"), F.lit(0)).alias("conversions"),
            F.sum("receipt_total").alias("revenue")
        )
        .orderBy("campaign_id", "day")
    )

    save_gold_table(df_attributed, "gold_campaign_revenue_daily")
except Exception as e:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {e}")

## Tender Mix (Payments)

**Note:** This uses `payment_method` from `fact_receipts`. 
A dedicated `fact_payments` table is planned (see GitHub issue #7).

In [None]:
# gold_tender_mix_daily - transaction count and amount per payment method per day
print("Creating gold_tender_mix_daily...")
try:
    receipts = read_silver("fact_receipts")

    tender_keys = ["day", "payment_method"]
    receipts_with_day = receipts.withColumn("day", F.to_date("event_ts"))

    tender_mix = (
        receipts_with_day
        .groupBy(*tender_keys)
        .agg(
            F.count("*").alias("transactions"),
            F.sum("total").alias("total_amount"),
        )
        .orderBy(*tender_keys)
    )

    save_gold_table(tender_mix, "gold_tender_mix_daily")
except Exception as exc:
    # Best-effort cell: report the failure and let the remaining cells run.
    print(f"  Skipping: {exc}")

## Future Tables (Pending Datagen Updates)

The following Gold tables depend on fact tables not yet implemented:

| Gold Table | Depends On | GitHub Issue |
|------------|------------|---------------|
| `gold_stockouts_daily` | `fact_stockouts` | #8 |
| `gold_reorders_daily` | `fact_reorders` | #9 |
| `gold_promo_performance_daily` | `fact_promotions` | #10 |
| `gold_store_ops_daily` | `fact_store_ops` | #11 |

These are stubbed below with their intended schemas and will remain empty until the source tables are implemented.

In [None]:
# Stubbed future tables - these will be empty until source facts exist

from pyspark.sql.types import StructType, StructField, DateType, LongType, StringType, DoubleType, TimestampType

# gold_stockouts_daily (pending fact_stockouts - issue #8)
schema_stockouts = StructType([
    StructField("day", DateType(), True),
    StructField("store_id", LongType(), True),
    StructField("dc_id", LongType(), True),
    StructField("stockout_count", LongType(), True),
    StructField("products_affected", LongType(), True),
])

# gold_reorders_daily (pending fact_reorders - issue #9)
schema_reorders = StructType([
    StructField("day", DateType(), True),
    StructField("store_id", LongType(), True),
    StructField("dc_id", LongType(), True),
    StructField("priority", StringType(), True),
    StructField("reorder_count", LongType(), True),
    StructField("total_units_ordered", LongType(), True),
])

# gold_promo_performance_daily (pending fact_promotions - issue #10)
schema_promo = StructType([
    StructField("day", DateType(), True),
    StructField("promo_code", StringType(), True),
    StructField("discount_type", StringType(), True),
    StructField("times_applied", LongType(), True),
    StructField("total_discount", DoubleType(), True),
    StructField("products_discounted", LongType(), True),
])

# gold_store_ops_daily (pending fact_store_ops - issue #11)
schema_store_ops = StructType([
    StructField("day", DateType(), True),
    StructField("store_id", LongType(), True),
    StructField("operation_type", StringType(), True),
    StructField("operation_count", LongType(), True),
    StructField("first_operation", TimestampType(), True),
    StructField("last_operation", TimestampType(), True),
])

# Write one empty Delta table per stub, in the order the schemas are declared.
stub_tables = (
    ("gold_stockouts_daily", schema_stockouts),
    ("gold_reorders_daily", schema_reorders),
    ("gold_promo_performance_daily", schema_promo),
    ("gold_store_ops_daily", schema_store_ops),
)
for stub_name, stub_schema in stub_tables:
    print(f"Creating {stub_name} (stub)...")
    df_empty = spark.createDataFrame([], stub_schema)
    save_gold_table(df_empty, stub_name)

In [None]:
# Final banner: confirm completion and tell the user where the tables landed.
print("\n" + "="*60)
print("Silver to Gold transformation complete!")
print("="*60)
print(f"\nGold tables created in: {GOLD_DB}")
# Build the hint from GOLD_DB instead of a hard-coded name so it stays correct
# if the Gold database is ever renamed.
print(f"\nRun: spark.sql('SHOW TABLES IN {GOLD_DB}').show() to list all tables")