# Historical Data Load (Bronze -> Silver -> Gold)

Loads historical batch data from the parquet shortcuts under `Files/` (the Bronze layer) into Silver Delta tables, then builds the aggregated Gold Delta tables from Silver.

## Data Flow
```
Files/ (parquet) --> Silver (Delta) --> Gold (Delta)
```

## Usage
Run this notebook **once** to load historical batch data. Re-running it drops and recreates the Silver and Gold tables it loads.

For streaming data, use:
- `03-streaming-to-silver.ipynb` - Process Eventhouse events to Silver
- `04-streaming-to-gold.ipynb` - Aggregate streaming data to Gold

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.utils import AnalysisException
from notebookutils import mssparkutils
import os

In [None]:
# =============================================================================
# PARAMETERS
# =============================================================================

def get_env(var_name, default=None):
    """Look up an environment variable.

    Args:
        var_name: Name of the environment variable to read.
        default: Value returned when the variable is not set.

    Returns:
        The variable's value, or ``default`` when it is unset.
    """
    try:
        return os.environ[var_name]
    except KeyError:
        return default

# Target database for each layer; each can be overridden through an
# environment variable of the same name.
# NOTE(review): the defaults look like the chemical symbols for silver (Ag)
# and gold (Au) - confirm they match the intended database names.
SILVER_DB = get_env("SILVER_DB", "ag")
GOLD_DB = get_env("GOLD_DB", "au")

# Echo the resolved configuration into the notebook output.
print(f"Configuration: SILVER_DB={SILVER_DB}, GOLD_DB={GOLD_DB}")

In [None]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def ensure_database(name):
    """Create the database ``name`` unless it already exists.

    Args:
        name: Database name; it is interpolated directly into the SQL text.
    """
    create_stmt = f"CREATE DATABASE IF NOT EXISTS {name}"
    spark.sql(create_stmt)
    print(f"Database '{name}' ready.")

def parquet_exists(table_name):
    """Report whether ``Files/<table_name>`` can be listed.

    Args:
        table_name: Folder name under ``Files/`` to probe.

    Returns:
        True when listing the path succeeds; False when the listing raises
        any exception.
    """
    source_path = f"Files/{table_name}"
    try:
        mssparkutils.fs.ls(source_path)
    except Exception:
        # Every listing failure is treated as "source not available".
        return False
    else:
        return True

def drop_table_if_exists(db, table_name):
    """Drop ``db.table_name`` if it exists (best effort).

    A failed drop is reported on stdout but never raised, so callers can
    still attempt their own write afterwards.

    Args:
        db: Database that holds the table.
        table_name: Name of the table to drop.
    """
    full_name = f"{db}.{table_name}"
    try:
        spark.sql(f"DROP TABLE IF EXISTS {full_name}")
    except Exception as exc:
        # Stay best-effort, but surface the failure instead of hiding it.
        print(f"  Warning: could not drop {full_name}: {exc}")

def load_to_silver(table_name):
    """Load parquet from Files/ to a Silver Delta table (drop and recreate).

    Args:
        table_name: Folder name under ``Files/``; also used as the name of
            the Silver table.

    Returns:
        True when the table was written; False when the source folder is
        missing or any step of the load raised.
    """
    if not parquet_exists(table_name):
        print(f"  Skipping: Files/{table_name} does not exist")
        return False

    full_name = f"{SILVER_DB}.{table_name}"
    try:
        # Read the source BEFORE dropping anything: if the read fails, the
        # previously loaded Silver table is left in place instead of lost.
        # Schema merging is disabled to avoid type conflicts between files.
        df = spark.read.option("mergeSchema", "false").parquet(f"Files/{table_name}")

        # Cast Source column to string if present (known schema inconsistency)
        if "Source" in df.columns:
            df = df.withColumn("Source", F.col("Source").cast("string"))

        # Drop and recreate so the overwrite is not tied to the old table.
        drop_table_if_exists(SILVER_DB, table_name)
        df.write.format("delta").mode("overwrite").saveAsTable(full_name)

        # Count the persisted table rather than df: df.count() would scan
        # the source parquet a second time.
        row_count = spark.table(full_name).count()
        print(f"  {full_name}: {row_count} rows")
        return True
    except Exception as e:
        # Best-effort load: report the failure and let the caller move on
        # to the next table.
        print(f"  Error loading {table_name}: {e}")
        return False

def read_silver(table_name):
    """Return the table ``table_name`` from the Silver database as a DataFrame."""
    qualified_name = f"{SILVER_DB}.{table_name}"
    return spark.table(qualified_name)

def save_gold(df, table_name):
    """Write ``df`` to the Gold database as a Delta table (drop and recreate).

    Args:
        df: DataFrame to persist.
        table_name: Name of the Gold table to (re)create.
    """
    drop_table_if_exists(GOLD_DB, table_name)
    full_name = f"{GOLD_DB}.{table_name}"
    df.write.format("delta").mode("overwrite").saveAsTable(full_name)

    # Count the persisted table rather than df: df.count() would re-run the
    # whole aggregation over Silver a second time just to print a number.
    row_count = spark.table(full_name).count()
    print(f"  {full_name}: {row_count} rows")

def silver_exists(table_name):
    """Report whether ``table_name`` can be resolved in the Silver database.

    Args:
        table_name: Unqualified name of the Silver table.

    Returns:
        False when resolving the table raises ``AnalysisException``;
        True otherwise.
    """
    qualified_name = f"{SILVER_DB}.{table_name}"
    try:
        spark.table(qualified_name)
    except AnalysisException:
        return False
    return True

# Make sure both target databases exist before any table is written.
for db_name in (SILVER_DB, GOLD_DB):
    ensure_database(db_name)

---
## Part 1: Files/ -> Silver

Load dimension and fact tables from parquet to Delta.

In [None]:
print("=" * 60)
print("LOADING DIMENSIONS TO SILVER")
print("=" * 60)

# Dimension folders under Files/ to load into Silver, in load order.
dimensions = [
    "dim_geographies",
    "dim_stores",
    "dim_distribution_centers",
    "dim_trucks",
    "dim_customers",
    "dim_products",
]

# load_to_silver prints its own outcome; its return value is not used here.
for dim_name in dimensions:
    load_to_silver(dim_name)

print()

In [None]:
print("=" * 60)
print("LOADING FACTS TO SILVER")
print("=" * 60)

# Fact folders under Files/ to load into Silver, in load order.
facts = [
    "fact_receipts",
    "fact_receipt_lines",
    "fact_payments",
    "fact_store_inventory_txn",
    "fact_dc_inventory_txn",
    "fact_truck_moves",
    "fact_truck_inventory",
    "fact_foot_traffic",
    "fact_ble_pings",
    "fact_customer_zone_changes",
    "fact_marketing",
    "fact_online_order_headers",
    "fact_online_order_lines",
    "fact_store_ops",
    "fact_stockouts",
    "fact_promotions",
    "fact_promo_lines",
    "fact_reorders",
]

# load_to_silver prints its own outcome; its return value is not used here.
for fact_name in facts:
    load_to_silver(fact_name)

print()

---
## Part 2: Silver -> Gold

Create aggregated Gold tables for dashboards.

In [None]:
print("=" * 60)
print("CREATING GOLD AGGREGATES")
print("=" * 60)

# Sales by minute per store
if silver_exists("fact_receipts"):
    print("Creating sales_minute_store...")
    receipt_rows = read_silver("fact_receipts")
    # Truncate each receipt's event timestamp to its minute.
    receipts_by_minute = receipt_rows.withColumn(
        "ts", F.date_trunc("minute", F.col("event_ts"))
    )
    # One output row per (store, minute).
    df = receipts_by_minute.groupBy("store_id", "ts").agg(
        F.sum("total").alias("total_sales"),
        F.count("*").alias("receipts"),
        F.avg("total").alias("avg_basket"),
    )
    save_gold(df, "sales_minute_store")

In [None]:
# Product revenue and units per 15-minute window
# (every product is kept; no top-N cut is applied in this cell).
if silver_exists("fact_receipt_lines"):
    print("Creating top_products_15m...")
    line_rows = read_silver("fact_receipt_lines")
    # Assign each receipt line to its 15-minute window.
    windowed_lines = line_rows.withColumn(
        "window_15m", F.window(F.col("event_ts"), "15 minutes")
    )
    per_product_window = windowed_lines.groupBy("product_id", "window_15m").agg(
        F.sum("ext_price").alias("revenue"),
        F.sum("quantity").alias("units"),
    )
    # Keep only the window end as the row timestamp, then drop the struct.
    df = per_product_window.withColumn(
        "computed_at", F.col("window_15m.end")
    ).drop("window_15m")
    save_gold(df, "top_products_15m")

In [None]:
# Current store inventory position
if silver_exists("fact_store_inventory_txn"):
    print("Creating inventory_position_current...")
    # Number each store/product's transactions newest-first.
    # NOTE(review): rows sharing the same event_ts have no tie-breaker, so
    # which of them gets rn == 1 is not pinned down here.
    window_spec = Window.partitionBy("store_id", "product_id").orderBy(
        F.col("event_ts").desc()
    )
    ranked_txns = read_silver("fact_store_inventory_txn").withColumn(
        "rn", F.row_number().over(window_spec)
    )
    # rn == 1 is the latest transaction per store/product.
    df = ranked_txns.where(F.col("rn") == 1).select(
        "store_id",
        "product_id",
        F.col("balance").alias("on_hand"),
        F.col("event_ts").alias("as_of"),
    )
    save_gold(df, "inventory_position_current")

In [None]:
# DC inventory position
if silver_exists("fact_dc_inventory_txn"):
    print("Creating dc_inventory_position_current...")
    # Number each DC/product's transactions newest-first.
    # NOTE(review): rows sharing the same event_ts have no tie-breaker, so
    # which of them gets rn == 1 is not pinned down here.
    window_spec = Window.partitionBy("dc_id", "product_id").orderBy(
        F.col("event_ts").desc()
    )
    ranked_txns = read_silver("fact_dc_inventory_txn").withColumn(
        "rn", F.row_number().over(window_spec)
    )
    # rn == 1 is the latest transaction per DC/product.
    df = ranked_txns.where(F.col("rn") == 1).select(
        "dc_id",
        "product_id",
        F.col("balance").alias("on_hand"),
        F.col("event_ts").alias("as_of"),
    )
    save_gold(df, "dc_inventory_position_current")

In [None]:
# Truck dwell time daily
if silver_exists("fact_truck_moves"):
    print("Creating truck_dwell_daily...")
    # Site label: "STORE_<store_id>" when a store is set, else "DC_<dc_id>".
    site_label = F.when(
        F.col("store_id").isNotNull(),
        F.concat(F.lit("STORE_"), F.col("store_id")),
    ).otherwise(F.concat(F.lit("DC_"), F.col("dc_id")))
    # Minutes between eta and etd.
    dwell_minutes = (F.unix_timestamp("etd") - F.unix_timestamp("eta")) / 60
    # Keep only rows with a known, strictly positive dwell.
    has_positive_dwell = F.col("dwell_min").isNotNull() & (F.col("dwell_min") > 0)

    truck_moves = (
        read_silver("fact_truck_moves")
        .withColumn("day", F.to_date("event_ts"))
        .withColumn("site", site_label)
        .withColumn("dwell_min", dwell_minutes)
    )
    df = truck_moves.where(has_positive_dwell).groupBy("site", "day").agg(
        F.avg("dwell_min").alias("avg_dwell_min"),
        F.countDistinct("truck_id").alias("trucks"),
    )
    save_gold(df, "truck_dwell_daily")

In [None]:
# Online sales daily
if silver_exists("fact_online_order_headers"):
    print("Creating online_sales_daily...")
    order_headers = read_silver("fact_online_order_headers")
    # Reduce each order's event timestamp to its calendar date.
    orders_with_day = order_headers.withColumn("day", F.to_date("event_ts"))
    # One output row per day.
    df = orders_with_day.groupBy("day").agg(
        F.count("*").alias("orders"),
        F.sum("subtotal").alias("subtotal"),
        F.sum("tax").alias("tax"),
        F.sum("total").alias("total"),
        F.avg("total").alias("avg_order_value"),
    )
    save_gold(df, "online_sales_daily")

In [None]:
# Zone dwell per minute
if silver_exists("fact_foot_traffic"):
    print("Creating zone_dwell_minute...")
    traffic_rows = read_silver("fact_foot_traffic")
    # Truncate each event timestamp to its minute.
    traffic_by_minute = traffic_rows.withColumn(
        "ts", F.date_trunc("minute", F.col("event_ts"))
    )
    # One output row per (store, zone, minute).
    df = traffic_by_minute.groupBy("store_id", "zone", "ts").agg(
        F.avg("dwell_seconds").alias("avg_dwell"),
        F.sum("count").alias("customers"),
    )
    save_gold(df, "zone_dwell_minute")

In [None]:
# Marketing cost daily
if silver_exists("fact_marketing"):
    print("Creating marketing_cost_daily...")
    marketing_rows = read_silver("fact_marketing")
    # Reduce each event timestamp to its calendar date.
    marketing_with_day = marketing_rows.withColumn("day", F.to_date("event_ts"))
    # One output row per (campaign, day); "impressions" is the row count.
    df = marketing_with_day.groupBy("campaign_id", "day").agg(
        F.count("*").alias("impressions"),
        F.sum("cost").alias("cost"),
    )
    save_gold(df, "marketing_cost_daily")

In [None]:
# Tender mix daily
if silver_exists("fact_receipts"):
    print("Creating tender_mix_daily...")
    receipt_rows = read_silver("fact_receipts")
    # Reduce each receipt's event timestamp to its calendar date.
    receipts_with_day = receipt_rows.withColumn("day", F.to_date("event_ts"))
    # One output row per (day, payment method).
    df = receipts_with_day.groupBy("day", "payment_method").agg(
        F.count("*").alias("transactions"),
        F.sum("total").alias("total_amount"),
    )
    save_gold(df, "tender_mix_daily")

---
## Summary

In [None]:
print("\n" + "=" * 60)
print("HISTORICAL DATA LOAD COMPLETE")
print("=" * 60)

# List the tables in each database so the totals can be reported.
silver_tables = spark.sql(f"SHOW TABLES IN {SILVER_DB}").collect()
gold_tables = spark.sql(f"SHOW TABLES IN {GOLD_DB}").collect()

silver_count = len(silver_tables)
gold_count = len(gold_tables)
print(f"\nSilver ({SILVER_DB}): {silver_count} tables")
print(f"Gold ({GOLD_DB}): {gold_count} tables")

print("\nHistorical data pipeline complete!")