# Fabric Notebook (PySpark) - Bronze to Silver Transformation

Reads from Bronze layer shortcuts (cusn schema) and transforms to Silver Delta tables (ag schema).

## Data Sources
- **Bronze Batch Data** (`cusn.dim_*`, `cusn.fact_*`): ADLSv2 parquet shortcuts (historical)
- **Bronze Streaming Data** (`cusn.receipt_created`, etc.): Eventhouse shortcuts (real-time)

## Outputs
- **Silver Delta Tables** (`ag.dim_*`, `ag.fact_*`): Combined batch + streaming, validated, transformed

## Processing Logic
1. Dimensions: Load from batch parquet only
2. Facts: UNION batch parquet + streaming events (no overlap in demo environment)
3. Schema alignment: Map streaming event fields to fact table schema
4. Validation: Ensure data quality (no nulls in required FKs)

**Note**: The user confirmed that batch and streaming data never overlap, so the two sources are combined with a simple UNION ALL and no deduplication step.

In [None]:
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
import os
import warnings

In [None]:
# =============================================================================
# PARAMETERS - Configure these for your environment
# =============================================================================
# REQUIRED ENVIRONMENT VARIABLES:
#   - SILVER_DB: Database name for Silver layer Delta tables (default: "ag")
#   - BRONZE_SCHEMA: Bronze schema with shortcuts to ADLSv2 + Eventhouse (default: "cusn")
#
# These can be set via:
#   1. Fabric pipeline parameters (when run from a pipeline)
#   2. Environment variables in the Fabric workspace
#   3. Notebook %run magic or widget parameters
#
# For local testing, you can use the defaults below:
#   SILVER_DB = "ag"
#   BRONZE_SCHEMA = "cusn"
# =============================================================================

def get_required_env(var_name, description, default=None):
    """Resolve a configuration value from the process environment.

    Args:
        var_name: Name of the environment variable to look up.
        description: Human-readable purpose of the variable; only used in
            the error message.
        default: Value returned when the variable is not set. When this is
            ``None`` and the variable is not set, the lookup fails.

    Returns:
        The environment value if present, otherwise ``default``.

    Raises:
        EnvironmentError: If the variable is unset and no default was given.
    """
    resolved = os.environ.get(var_name, default)
    if resolved is not None:
        return resolved

    # Nothing in the environment and no fallback: fail with setup guidance.
    message = (
        f"Required environment variable '{var_name}' is not set.\n"
        f"Description: {description}\n"
        f"Set it via Fabric pipeline parameters or workspace environment variables."
    )
    raise EnvironmentError(message)

# Silver layer target database; the SILVER_DB environment variable overrides
# the "ag" fallback.
DB_NAME = get_required_env(
    "SILVER_DB", "Target database for Silver layer Delta tables", "ag"
)

# Bronze schema to read from; the BRONZE_SCHEMA environment variable overrides
# the "cusn" fallback.
BRONZE_SCHEMA = get_required_env(
    "BRONZE_SCHEMA",
    "Bronze schema with shortcuts to batch and streaming data sources",
    "cusn",
)

# Echo the effective configuration so it shows up in the run output.
print(f"Configuration: SILVER_DB={DB_NAME}, BRONZE_SCHEMA={BRONZE_SCHEMA}")

In [None]:
# Helper Functions

def read_bronze_table(table_name):
    """Return the DataFrame for ``table_name`` inside the Bronze schema.

    The table is addressed as ``<BRONZE_SCHEMA>.<table_name>`` through the
    ``spark`` session object.
    """
    qualified_name = f"{BRONZE_SCHEMA}.{table_name}"
    return spark.table(qualified_name)

def ensure_database(name):
    """Create the database ``name`` when absent, then confirm it can be described.

    Raises:
        RuntimeError: If describing the database fails with an
            ``AnalysisException`` after the create statement has run.
    """
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {name}")

    # DESCRIBE acts as an access probe for the database.
    try:
        spark.sql(f"DESCRIBE DATABASE {name}")
    except AnalysisException as exc:
        raise RuntimeError(f"Cannot access database '{name}': {exc}")
    else:
        print(f"Database '{name}' is ready.")

def save_table(df, table_name, mode="overwrite"):
    """Write ``df`` to a Delta table in the Silver database and report its size.

    Args:
        df: DataFrame to persist.
        table_name: Unqualified target table name; it is written to
            ``<DB_NAME>.<table_name>``.
        mode: Save mode passed to the DataFrame writer (default "overwrite").
    """
    full_name = f"{DB_NAME}.{table_name}"
    df.write.format("delta").mode(mode).saveAsTable(full_name)

    # Calling df.count() after the write re-executes the whole lazy plan
    # (source reads, mappings, unions). After an overwrite the table holds
    # exactly what was written, so count the persisted table instead; this
    # also reports what actually landed rather than a fresh re-read.
    if isinstance(mode, str) and mode.lower() == "overwrite":
        row_count = spark.table(full_name).count()
    else:
        # For other modes the table may contain pre-existing rows, so the
        # DataFrame itself is the only source for "rows written".
        row_count = df.count()
    print(f"  Written to {full_name}: {row_count} rows")

def bronze_table_exists(table_name):
    """Report whether ``<BRONZE_SCHEMA>.<table_name>`` can be resolved.

    Returns:
        True when ``spark.table`` succeeds, False when it raises
        ``AnalysisException``.
    """
    qualified_name = f"{BRONZE_SCHEMA}.{table_name}"
    try:
        spark.table(qualified_name)
    except AnalysisException:
        return False
    return True

def load_table(table_name, transform_fn=None, skip_if_missing=True):
    """
    Load a table from Bronze to Silver with optional transformation.

    Args:
        table_name: Name of the table to load (same name in Bronze and Silver)
        transform_fn: Optional function taking and returning a DataFrame,
            applied between the Bronze read and the Silver write
        skip_if_missing: If True, skip if Bronze table doesn't exist.
            If False, a missing Bronze table raises FileNotFoundError.

    Returns:
        True if loaded successfully, False if skipped

    Raises:
        FileNotFoundError: If the table is missing and skip_if_missing is False
        PermissionError: Always re-raised (treated as an infrastructure problem)
    """
    try:
        print(f"Loading {table_name}...")

        # Check if Bronze table exists
        if not bronze_table_exists(table_name):
            if skip_if_missing:
                print(f"  Skipping {table_name}: Bronze table does not exist")
                return False
            raise FileNotFoundError(
                f"Required Bronze table {BRONZE_SCHEMA}.{table_name} not found"
            )

        # Read from Bronze
        df = read_bronze_table(table_name)

        # Apply transformation if provided
        if transform_fn:
            df = transform_fn(df)

        # Save to Silver
        save_table(df, table_name)
        return True

    except FileNotFoundError as e:
        if not skip_if_missing:
            # The caller marked this table as required. Without this re-raise
            # the error raised above would be swallowed here and the table
            # silently skipped, making skip_if_missing=False a no-op.
            raise
        print(f"  Skipping {table_name}: {e}")
        return False
    except PermissionError:
        # Re-raise permission errors - infrastructure problem
        raise
    except AnalysisException as e:
        # PySpark analysis errors (e.g., schema mismatch)
        warnings.warn(f"Skipping {table_name}: PySpark analysis error - {e}")
        return False
    except Exception as e:
        # Best-effort load: log unexpected errors with type for debugging
        print(f"  Skipping {table_name}: {type(e).__name__}: {e}")
        return False

# Create (if needed) and probe the Silver database named by DB_NAME.
ensure_database(DB_NAME)

## Load Dimension Tables (Batch Only)

Dimensions are loaded from ADLSv2 parquet only (no streaming component).

In [None]:
print("Loading dimension tables...\n")

# Dimension tables, loaded one by one in the listed order with no transform.
for dim_table in (
    "dim_geographies",
    "dim_stores",
    "dim_distribution_centers",
    "dim_trucks",
    "dim_customers",
    "dim_products",
):
    load_table(dim_table)

print("\nDimension tables loaded.\n")

## Load Fact Tables (Batch + Streaming)

Fact tables combine:
- **Batch historical data** from ADLSv2 parquet (cusn.fact_*)
- **Streaming real-time data** from Eventhouse (cusn.receipt_created, etc.)

### Mapping: Batch Tables → Streaming Events

| Batch Parquet Table | Streaming Event Table | Combined in Silver |
|---------------------|----------------------|-------------------|
| `fact_receipts` | `receipt_created` | `ag.fact_receipts` |
| `fact_receipt_lines` | `receipt_line_added` | `ag.fact_receipt_lines` |
| `fact_payments` | `payment_processed` | `ag.fact_payments` |
| `fact_store_inventory_txn`, `fact_dc_inventory_txn` | `inventory_updated` | `ag.fact_store_inventory_txn`, `ag.fact_dc_inventory_txn` |
| `fact_stockouts` | `stockout_detected` | `ag.fact_stockouts` |
| `fact_reorders` | `reorder_triggered` | `ag.fact_reorders` |
| `fact_foot_traffic` | `customer_entered` | `ag.fact_foot_traffic` |
| `fact_customer_zone_changes` | `customer_zone_changed` | `ag.fact_customer_zone_changes` |
| `fact_ble_pings` | `ble_ping_detected` | `ag.fact_ble_pings` |
| `fact_truck_moves` | `truck_arrived`, `truck_departed` | `ag.fact_truck_moves` |
| `fact_store_ops` | `store_opened`, `store_closed` | `ag.fact_store_ops` |
| `fact_marketing` | `ad_impression` | `ag.fact_marketing` |
| `fact_promotions` | `promotion_applied` | `ag.fact_promotions` |
| `fact_online_order_headers` | `online_order_created` | `ag.fact_online_order_headers` |
| `fact_online_order_lines` | `online_order_picked`, `online_order_shipped` | `ag.fact_online_order_lines` |

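**Note:** No deduplication is needed - the user confirmed that batch and streaming data never overlap. In this notebook only `fact_receipts` and `fact_receipt_lines` are currently combined with their streaming events; the remaining fact tables are loaded from batch data only.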

In [None]:
# Start of the fact-table load; the message announces 18 fact tables in total.
print("Loading fact tables (all 18)...\n")

# Fact tables without streaming equivalent (batch only).
# Each call copies the Bronze table to Silver unchanged (no transform_fn).
load_table("fact_truck_inventory")  # No streaming event for truck inventory
load_table("fact_promo_lines")  # Promotion line details (no separate streaming event)

# Fact tables with a streaming equivalent are loaded by passing a transform
# function to load_table that appends the mapped streaming rows to the batch
# rows (plain union, no deduplication).

### 1. fact_receipts (Batch + Streaming)

In [None]:
def transform_receipts(df_batch):
    """Combine batch fact_receipts with streaming receipt_created.

    The streaming table is projected onto the batch column names, the two
    column-name sets are compared, and the frames are unioned by column name.

    Args:
        df_batch: DataFrame read from the Bronze ``fact_receipts`` table.

    Returns:
        Batch rows plus mapped streaming rows. If the streaming table cannot
        be read or the column names do not match, a warning is printed and
        ``df_batch`` is returned unchanged (best-effort behaviour).
    """
    try:
        # Try to read streaming data
        df_stream = read_bronze_table("receipt_created")

        # Map streaming fields to batch column names
        df_stream_mapped = df_stream.select(
            F.col("ingest_timestamp").alias("event_ts"),
            F.col("receipt_id").alias("receipt_id_ext"),
            F.col("tender_type").alias("payment_method"),
            F.lit(0.0).alias("discount_amount"),
            F.col("tax"),
            (F.col("tax") * 100).cast("bigint").alias("tax_cents"),
            F.col("subtotal"),
            F.col("total"),
            (F.col("total") * 100).cast("bigint").alias("total_cents"),
            F.lit("SALE").alias("receipt_type"),
            (F.col("subtotal") * 100).cast("bigint").alias("subtotal_cents"),
            F.col("customer_id"),
            F.col("store_id"),
            F.lit(None).cast("string").alias("return_for_receipt_id_ext")
        )

        # Validate that both sides expose the same column names before UNION
        batch_cols = set(df_batch.columns)
        stream_cols = set(df_stream_mapped.columns)
        if batch_cols != stream_cols:
            missing_in_stream = batch_cols - stream_cols
            extra_in_stream = stream_cols - batch_cols
            error_msg = "Schema mismatch for fact_receipts: "
            if missing_in_stream:
                error_msg += f"Missing in stream: {missing_in_stream}. "
            if extra_in_stream:
                error_msg += f"Extra in stream: {extra_in_stream}."
            raise ValueError(error_msg)

        # UNION batch + streaming (no dedup needed).
        # The check above compares names only, not order. A positional
        # union (union/unionAll) would silently put values into the wrong
        # columns whenever the batch column order differs from the select
        # order above, so align by name instead.
        return df_batch.unionByName(df_stream_mapped)
    except Exception as e:
        # If streaming table doesn't exist or schema mismatch, return batch only
        print(f"  Warning: Could not combine streaming data for fact_receipts: {e}")
        return df_batch

# Load fact_receipts, passing transform_receipts as the transform step.
load_table("fact_receipts", transform_receipts)

### 2. fact_receipt_lines (Batch + Streaming)

In [None]:
def transform_receipt_lines(df_batch):
    """Combine batch fact_receipt_lines with streaming receipt_line_added.

    The streaming table is projected onto the batch column names, the two
    column-name sets are compared, and the frames are unioned by column name.

    Args:
        df_batch: DataFrame read from the Bronze ``fact_receipt_lines`` table.

    Returns:
        Batch rows plus mapped streaming rows. If the streaming table cannot
        be read or the column names do not match, a warning is printed and
        ``df_batch`` is returned unchanged (best-effort behaviour).
    """
    try:
        df_stream = read_bronze_table("receipt_line_added")

        # Map streaming fields to batch column names. The select follows the
        # documented batch order:
        # receipt_id_ext, event_ts, product_id, line_num, quantity,
        # unit_price, ext_price, unit_cents, ext_cents, promo_code
        # Correctness no longer depends on that order because the union
        # below aligns columns by name.
        df_stream_mapped = df_stream.select(
            F.col("receipt_id").alias("receipt_id_ext"),
            F.col("ingest_timestamp").alias("event_ts"),
            F.col("product_id"),
            F.col("line_number").alias("line_num"),
            F.col("quantity"),
            F.col("unit_price"),
            F.col("extended_price").alias("ext_price"),
            (F.col("unit_price") * 100).cast("bigint").alias("unit_cents"),
            (F.col("extended_price") * 100).cast("bigint").alias("ext_cents"),
            F.col("promo_code")
        )

        # Validate that both sides expose the same column names before UNION
        batch_cols = set(df_batch.columns)
        stream_cols = set(df_stream_mapped.columns)
        if batch_cols != stream_cols:
            missing_in_stream = batch_cols - stream_cols
            extra_in_stream = stream_cols - batch_cols
            error_msg = "Schema mismatch for fact_receipt_lines: "
            if missing_in_stream:
                error_msg += f"Missing in stream: {missing_in_stream}. "
            if extra_in_stream:
                error_msg += f"Extra in stream: {extra_in_stream}."
            raise ValueError(error_msg)

        # The check above is order-insensitive, so a positional union
        # (union/unionAll) could misalign columns if the batch order ever
        # differs from the select above. Align by name instead.
        return df_batch.unionByName(df_stream_mapped)
    except Exception as e:
        # Best-effort: fall back to batch-only data and report why
        print(f"  Warning: Could not combine streaming data for fact_receipt_lines: {e}")
        return df_batch

# Load fact_receipt_lines, passing transform_receipt_lines as the transform step.
load_table("fact_receipt_lines", transform_receipt_lines)

### 3-16. Remaining Fact Tables (Batch Only)

These tables currently only have batch data (streaming events planned but not yet implemented).

In [None]:
# Remaining fact tables, copied Bronze -> Silver without a transform.
# They are processed strictly in the order listed below.
remaining_fact_tables = (
    # Store and DC inventory transactions
    "fact_store_inventory_txn",
    "fact_dc_inventory_txn",
    # Logistics
    "fact_truck_moves",
    # Customer tracking
    "fact_foot_traffic",
    "fact_ble_pings",
    "fact_customer_zone_changes",
    # Marketing
    "fact_marketing",
    # Omnichannel
    "fact_online_order_headers",
    "fact_online_order_lines",
    # Payments
    "fact_payments",
    # Store operations
    "fact_store_ops",
    # Inventory management
    "fact_stockouts",
    "fact_promotions",
    "fact_reorders",
)

for fact_table in remaining_fact_tables:
    load_table(fact_table)

print("\nFact tables loaded.")
print("Bronze → Silver transformation complete!")

## Verification

Verify Silver tables were created successfully.

In [None]:
# List all tables in Silver database
silver_tables = spark.sql(f"SHOW TABLES IN {DB_NAME}").collect()

print(f"\nSilver Database ({DB_NAME}) Tables:")
print(f"  Total: {len(silver_tables)} tables\n")

dim_count = sum(1 for t in silver_tables if t.tableName.startswith('dim_'))
fact_count = sum(1 for t in silver_tables if t.tableName.startswith('fact_'))

print(f"  Dimensions: {dim_count} (expected: 6)")
print(f"  Facts: {fact_count} (expected: 18)")
print(f"\n  Target: 24 tables (6 dims + 18 facts)")

if len(silver_tables) == 24:
    print(f"\n✓ Silver layer complete!")
elif len(silver_tables) < 24:
    print(f"\n⚠ Silver layer has {len(silver_tables)}/24 tables")
    print(f"  Some Bronze tables may be missing - check Bronze layer creation")

In [None]:
# Smoke test: read two Silver fact tables back and report their row counts.
print("\nTesting Silver layer access...\n")

# fact_receipts: row count only
try:
    df_receipts = spark.table(f"{DB_NAME}.fact_receipts")
    receipts_rows = df_receipts.count()
    print(f"✓ fact_receipts: {receipts_rows} rows")
except Exception as err:
    print(f"✗ fact_receipts: {err}")

# fact_receipt_lines: row count followed by the table schema
try:
    df_receipt_lines = spark.table(f"{DB_NAME}.fact_receipt_lines")
    receipt_lines_rows = df_receipt_lines.count()
    print(f"✓ fact_receipt_lines: {receipt_lines_rows} rows")
    print(f"\nfact_receipt_lines schema:")
    df_receipt_lines.printSchema()
except Exception as err:
    print(f"✗ fact_receipt_lines: {err}")

print("\nSilver layer ready for Gold aggregations!")