# Delta Table Maintenance

This notebook optimizes Delta tables in Silver and Gold layers for improved query performance and storage efficiency.

## Operations
1. **OPTIMIZE**: Compacts small files into larger ones for better query performance
2. **ZORDER**: Co-locates related data for faster filtering on common columns
3. **VACUUM**: Removes old file versions to reclaim storage (7-day retention)

## Schedule
- **OPTIMIZE + ZORDER**: Weekly or when query performance degrades
- **VACUUM**: Monthly or when storage costs increase significantly

## Prerequisites
- Silver and Gold layers must be populated
- Sufficient Fabric capacity for compaction operations

⚠️ **Warning**: VACUUM permanently deletes old file versions. Ensure time travel queries beyond the retention period (7 days by default, set via `VACUUM_RETENTION_HOURS`) are not required.

In [None]:
from pyspark.sql.utils import AnalysisException
import os
from datetime import datetime

# Constants
SEPARATOR = "=" * 80  # Banner line printed around each section heading

# Environment values that switch a flag on (compared case-insensitively,
# with surrounding whitespace ignored).
TRUTHY_ENV_VALUES = frozenset({"true", "1", "yes", "y", "on"})


def env_flag(name, default=False):
    """
    Read a boolean flag from an environment variable.

    Args:
        name: Environment variable name
        default: Value returned when the variable is not set

    Returns:
        True if the variable holds one of TRUTHY_ENV_VALUES, False for any
        other value, and `default` when the variable is unset.
    """
    raw = os.environ.get(name)
    if raw is None:
        return default
    return raw.strip().lower() in TRUTHY_ENV_VALUES


# Configuration (all values can be overridden through environment variables)
SILVER_DB = os.environ.get("SILVER_DB", "ag")
GOLD_DB = os.environ.get("GOLD_DB", "au")
# int() raises ValueError on a non-numeric override, which stops the run before
# any table is touched.
VACUUM_RETENTION_HOURS = int(os.environ.get("VACUUM_RETENTION_HOURS", "168"))  # 7 days default
# Parsed leniently: DRY_RUN=1 / yes / " True " must request a dry run instead of
# silently falling through to the destructive path.
DRY_RUN = env_flag("DRY_RUN")

print("Configuration:")
print(f"  Silver DB: {SILVER_DB}")
print(f"  Gold DB: {GOLD_DB}")
print(f"  Vacuum Retention: {VACUUM_RETENTION_HOURS} hours ({VACUUM_RETENTION_HOURS / 24:.1f} days)")
print(f"  Dry Run: {DRY_RUN}")
print(f"  Start Time: {datetime.now().isoformat()}")
print()

## Helper Functions

In [None]:
def optimize_table(db, table, zorder_cols=None):
    """
    Optimize a Delta table with optional ZORDER.

    When the module-level DRY_RUN flag is set, the planned action is printed
    and no statement is executed.

    Args:
        db: Database name
        table: Table name
        zorder_cols: List of columns to ZORDER by (optional)

    Returns:
        True if the OPTIMIZE statement completed (or in dry-run mode),
        False if building or running it raised an exception.
    """
    full_name = f"{db}.{table}"

    try:
        if DRY_RUN:
            print(f"  [DRY RUN] Would optimize {full_name}")
            if zorder_cols:
                print(f"  [DRY RUN] Would ZORDER by: {', '.join(zorder_cols)}")
            return True

        if zorder_cols:
            zorder_clause = ", ".join(zorder_cols)
            sql = f"OPTIMIZE {full_name} ZORDER BY ({zorder_clause})"
            print(f"  Optimizing {full_name} with ZORDER by ({zorder_clause})...")
        else:
            sql = f"OPTIMIZE {full_name}"
            print(f"  Optimizing {full_name}...")

        rows = spark.sql(sql).collect()

    except AnalysisException as e:
        # AnalysisException covers more than a missing table (for example an
        # unknown ZORDER column), so report the actual message.
        print(f"    ✗ Analysis error for {full_name} (table or ZORDER column may not exist): {e}")
        return False
    except Exception as e:
        print(f"    ✗ Error optimizing {full_name}: {e}")
        return False

    # Reporting is best-effort: the OPTIMIZE has already succeeded at this
    # point, so a missing or unexpected metrics row must not turn it into a
    # failure.
    try:
        # The result row carries a nested `metrics` struct; its fields are
        # reached through that struct, not through a dotted column name.
        stats = rows[0]["metrics"]
        files_added = stats["numFilesAdded"]
        files_removed = stats["numFilesRemoved"]
    except (IndexError, KeyError, ValueError, TypeError):
        print("    ✓ Optimized (file metrics unavailable)")
    else:
        print(f"    ✓ Files compacted: {files_added} added, {files_removed} removed")
    return True

def vacuum_table(db, table, retention_hours):
    """
    Vacuum a Delta table to remove old file versions.

    When the module-level DRY_RUN flag is set, the planned action is printed
    and no statement is executed.

    Args:
        db: Database name
        table: Table name
        retention_hours: Hours to retain old versions

    Returns:
        True if the VACUUM statement completed (or in dry-run mode),
        False if it raised an exception.
    """
    full_name = f"{db}.{table}"

    try:
        if DRY_RUN:
            print(f"  [DRY RUN] Would vacuum {full_name} (retain {retention_hours} hours)")
            return True

        print(f"  Vacuuming {full_name} (retain {retention_hours} hours)...")
        # NOTE(review): Delta's retention safety check may reject retention
        # values below its configured minimum; such a failure surfaces through
        # the handlers below - confirm against the Delta VACUUM documentation.
        spark.sql(f"VACUUM {full_name} RETAIN {retention_hours} HOURS")
        print("    ✓ Vacuumed successfully")
        return True

    except AnalysisException as e:
        # Not every AnalysisException means the table is missing, so include
        # the underlying message instead of guessing the cause.
        print(f"    ✗ Analysis error for {full_name} (table may not exist): {e}")
        return False
    except Exception as e:
        print(f"    ✗ Error vacuuming {full_name}: {e}")
        return False

def get_table_stats(db, table):
    """
    Get table statistics.

    Args:
        db: Database name
        table: Table name

    Returns:
        The table's row count, or None if the table could not be read.
    """
    try:
        return spark.table(f"{db}.{table}").count()
    except Exception:
        # Best-effort lookup: any ordinary failure yields None. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long count can still be cancelled.
        return None

## Silver Layer Optimization

Compact Silver dimension and fact tables. Fact tables are additionally ZORDERed on frequently filtered columns; dimension tables are small and are compacted without ZORDER.

In [None]:
# Section banner followed by a blank line
for banner_line in (SEPARATOR, "OPTIMIZING SILVER LAYER", SEPARATOR, ""):
    print(banner_line)

# Silver tables paired with their ZORDER columns (None = compaction only)
silver_tables = [
    # Dimensions are small, so they are compacted without ZORDER
    *(
        (dimension_name, None)
        for dimension_name in (
            "dim_geographies",
            "dim_stores",
            "dim_distribution_centers",
            "dim_trucks",
            "dim_customers",
            "dim_products",
        )
    ),
    # Facts are ZORDERed by event time plus their primary dimension key(s)
    ("fact_receipts", ["event_ts", "store_id"]),
    ("fact_receipt_lines", ["event_ts", "product_id"]),
    ("fact_store_inventory_txn", ["event_ts", "store_id", "product_id"]),
    ("fact_dc_inventory_txn", ["event_ts", "dc_id", "product_id"]),
    ("fact_truck_moves", ["event_ts", "truck_id"]),
    ("fact_truck_inventory", ["event_ts", "truck_id", "product_id"]),
    ("fact_foot_traffic", ["event_ts", "store_id"]),
    ("fact_ble_pings", ["event_ts", "store_id"]),
    ("fact_customer_zone_changes", ["event_ts", "customer_id"]),
    ("fact_marketing", ["event_ts", "campaign_id"]),
    ("fact_online_order_headers", ["event_ts", "customer_id"]),
    ("fact_online_order_lines", ["event_ts", "order_id"]),
    ("fact_payments", ["event_ts", "customer_id"]),
    ("fact_store_ops", ["event_ts", "store_id"]),
    ("fact_stockouts", ["event_ts", "store_id", "product_id"]),
    ("fact_promotions", ["event_ts", "store_id"]),
    ("fact_promo_lines", ["event_ts", "promo_id"]),
    ("fact_reorders", ["event_ts", "dc_id", "product_id"]),
]

# Run every optimization in list order, then tally the outcomes
silver_outcomes = [
    optimize_table(SILVER_DB, table_name, zorder_by)
    for table_name, zorder_by in silver_tables
]
silver_success = sum(1 for succeeded in silver_outcomes if succeeded)
silver_failed = len(silver_outcomes) - silver_success

print()
print(f"Silver optimization complete: {silver_success} succeeded, {silver_failed} failed")
print()

## Gold Layer Optimization

Optimize Gold aggregated tables with ZORDER on their time column (where one exists) and their key dimension columns.

In [None]:
# Section banner followed by a blank line
for banner_line in (SEPARATOR, "OPTIMIZING GOLD LAYER", SEPARATOR, ""):
    print(banner_line)

# Gold tables paired with their ZORDER columns, kept as (table, columns) tuples
# in declaration order
gold_tables = list(
    {
        "sales_minute_store": ["ts", "store_id"],
        "top_products_15m": ["computed_at", "product_id"],
        "inventory_position_current": ["store_id", "product_id"],
        "dc_inventory_position_current": ["dc_id", "product_id"],
        "truck_dwell_daily": ["day", "site"],
        "online_sales_daily": ["day"],
        "fulfillment_daily": ["day", "fulfillment_mode"],
        "zone_dwell_minute": ["ts", "store_id"],
        "ble_presence_minute": ["ts", "store_id"],
        "marketing_cost_daily": ["day", "campaign_id"],
        "campaign_revenue_daily": ["day", "campaign_id"],
        "tender_mix_daily": ["day"],
        "stockout_duration_daily": ["day", "store_id"],
        "reorder_cycle_daily": ["day", "dc_id"],
        "promotion_effectiveness_daily": ["day"],
        "store_performance_daily": ["day", "store_id"],
    }.items()
)

# Run every optimization in list order, then tally the outcomes
gold_outcomes = [
    optimize_table(GOLD_DB, table_name, zorder_by)
    for table_name, zorder_by in gold_tables
]
gold_success = sum(1 for succeeded in gold_outcomes if succeeded)
gold_failed = len(gold_outcomes) - gold_success

print()
print(f"Gold optimization complete: {gold_success} succeeded, {gold_failed} failed")
print()

## Vacuum Old Versions

Remove old file versions to reclaim storage. Default retention is 7 days.

⚠️ **Warning**: This operation is irreversible. Time travel queries beyond the retention period will fail.

In [None]:
# Section banner followed by a blank line
for banner_line in (SEPARATOR, "VACUUMING OLD FILE VERSIONS", SEPARATOR, ""):
    print(banner_line)

vacuum_success = 0
vacuum_failed = 0

# One entry per layer: heading to print, database, and its (table, zorder) list
vacuum_plan = (
    ("Vacuuming Silver layer...", SILVER_DB, silver_tables),
    ("Vacuuming Gold layer...", GOLD_DB, gold_tables),
)

for layer_heading, layer_db, layer_tables in vacuum_plan:
    print(layer_heading)
    # The ZORDER columns stored alongside each table name are irrelevant here
    for table_name, _ in layer_tables:
        if vacuum_table(layer_db, table_name, VACUUM_RETENTION_HOURS):
            vacuum_success += 1
        else:
            vacuum_failed += 1
    print()

print(f"Vacuum complete: {vacuum_success} succeeded, {vacuum_failed} failed")
print()

## Summary Report

In [None]:
# Report header
for banner_line in (SEPARATOR, "MAINTENANCE SUMMARY", SEPARATOR):
    print(banner_line)
print(f"End Time: {datetime.now().isoformat()}")
print()

# One (heading, detail) pair per section; each is followed by a blank line
section_results = (
    ("Silver Layer:", f"  Optimized: {silver_success}/{len(silver_tables)} tables"),
    ("Gold Layer:", f"  Optimized: {gold_success}/{len(gold_tables)} tables"),
    ("Vacuum:", f"  Cleaned: {vacuum_success}/{len(silver_tables) + len(gold_tables)} tables"),
)
for section_heading, section_detail in section_results:
    print(section_heading)
    print(section_detail)
    print()

# Closing message depends on whether anything was actually executed
if not DRY_RUN:
    closing_lines = (
        "✓ Maintenance complete",
        "",
        "Next Steps:",
        "  1. Monitor query performance for improvements",
        "  2. Check storage usage reduction",
        "  3. Schedule this notebook to run weekly for OPTIMIZE",
        "  4. Schedule this notebook to run monthly for VACUUM",
    )
else:
    closing_lines = (
        "⚠️  DRY RUN MODE - No changes were made",
        "   Set DRY_RUN=false to execute maintenance operations",
    )
for closing_line in closing_lines:
    print(closing_line)