In [None]:
# Fabric Notebook (PySpark) - Load exported Parquet files directly to Silver Delta
# Use this for historical batch data from datagen Parquet exports
# (Bronze layer is for streaming JSON events only)

from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.utils import AnalysisException
import os
import warnings

# =============================================================================
# PARAMETERS - Configure these for your environment
# =============================================================================
# ENVIRONMENT VARIABLES (read from os.environ):
#   - SILVER_DB: Database name for Silver layer tables (e.g., "silver_retail")
#   - PARQUET_SOURCE: Path to Parquet files (e.g., "Files/adls-parquet-copy")
#
# These can be set via:
#   1. Fabric pipeline parameters (when run from a pipeline)
#   2. Environment variables in the Fabric workspace
#   3. Notebook %run magic or widget parameters
#
# When a variable is not set, the code below currently falls back to the
# backward-compatible defaults SILVER_DB="ag" and PARQUET_SOURCE="Files".
# Remove those defaults in production so that a missing variable fails fast.
# =============================================================================

def get_required_env(var_name, description, default=None):
    """Return a required configuration value read from the environment.

    A variable that is exported but blank (empty or whitespace-only) is
    treated the same as an unset one, so a blank environment value is never
    returned to the caller.

    Args:
        var_name: Name of the environment variable to read.
        description: Human-readable purpose, included in the error message.
        default: Value used when the variable is unset or blank. ``None``
            means there is no fallback.

    Returns:
        The variable's value exactly as stored in the environment, or
        ``default`` when the variable is unset or blank.

    Raises:
        EnvironmentError: If the variable is unset or blank and no default
            was supplied.
    """
    raw = os.environ.get(var_name)
    # Only a non-blank environment value wins over the default.
    value = raw if raw is not None and raw.strip() else default
    if value is None:
        raise EnvironmentError(
            f"Required environment variable '{var_name}' is not set.\n"
            f"Description: {description}\n"
            "Set it via Fabric pipeline parameters or workspace environment variables."
        )
    return value

# Silver-layer target database, taken from SILVER_DB.
# NOTE: the "ag" fallback is active for backward compatibility; drop the
# `default` argument in production so a missing variable raises instead.
DB_NAME = get_required_env(
    var_name="SILVER_DB",
    description="Target database for Silver layer Delta tables",
    default="ag",
)

# Root folder of the Parquet export, taken from PARQUET_SOURCE.
# NOTE: the "Files" fallback is active for backward compatibility; drop the
# `default` argument in production so a missing variable raises instead.
PARQUET_SOURCE = get_required_env(
    var_name="PARQUET_SOURCE",
    description="Path to Parquet source files (e.g., 'Files/adls-parquet-copy')",
    default="Files",
)

# Echo the effective settings so they show up in the run output.
print(f"Configuration: DB_NAME={DB_NAME}, PARQUET_SOURCE={PARQUET_SOURCE}")


def get_fs():
    """Return the file-system utility object offered by the notebook runtime.

    Prefers ``mssparkutils.fs`` from the ``notebookutils`` package. When that
    package cannot be imported, falls back to ``dbutils.fs``; ``dbutils`` is
    not defined in this file and is presumably injected as a global by the
    hosting runtime (a NameError results if it is absent).
    """
    try:
        from notebookutils import mssparkutils
    except ImportError:
        return dbutils.fs
    return mssparkutils.fs


# Resolve the file-system helper once at load time; path_exists() reuses this handle.
FS = get_fs()


def path_exists(path):
    """Report whether `path` can be listed through the shared ``FS`` handle.

    Args:
        path: Location to probe.

    Returns:
        True when ``FS.ls(path)`` completes; False when it raises any
        ``Exception`` (every failure is interpreted as "does not exist").
    """
    try:
        FS.ls(path)
    except Exception:
        return False
    else:
        return True


def resolve_path(*candidates):
    """Return the first candidate path that exists.

    Candidates are probed lazily in the order given, so probing stops at the
    first hit.

    Args:
        *candidates: Paths to try, highest priority first.

    Returns:
        The first candidate for which ``path_exists`` is true.

    Raises:
        FileNotFoundError: If no candidate exists (or none were given).
    """
    nothing_found = object()
    hit = next(
        (option for option in candidates if path_exists(option)),
        nothing_found,
    )
    if hit is nothing_found:
        raise FileNotFoundError(f"No matching path for: {candidates}")
    return hit


def read_parquet_recursive(path):
    """Read Parquet data under `path` with the ``recursiveFileLookup`` option on.

    Args:
        path: Location handed to the Spark Parquet reader.

    Returns:
        The DataFrame produced by ``spark.read``.
    """
    reader = spark.read.option("recursiveFileLookup", "true")
    return reader.parquet(path)


def read_dim(table_name):
    """Read a dimension table from the Parquet export.

    Looks under ``<PARQUET_SOURCE>/master/<table_name>`` first and then
    directly under ``<PARQUET_SOURCE>/<table_name>``; the first location that
    exists is read.

    Args:
        table_name: Folder name of the dimension table.

    Returns:
        DataFrame with the table's Parquet contents.

    Raises:
        FileNotFoundError: Propagated from ``resolve_path`` when neither
            location exists.
    """
    search_order = (
        f"{PARQUET_SOURCE}/master/{table_name}",
        f"{PARQUET_SOURCE}/{table_name}",
    )
    location = resolve_path(*search_order)
    return read_parquet_recursive(location)


def read_fact(table_name):
    """Read a fact table from the Parquet export.

    Looks under ``<PARQUET_SOURCE>/facts/<table_name>`` first and then
    directly under ``<PARQUET_SOURCE>/<table_name>``; the first location that
    exists is read.

    Args:
        table_name: Folder name of the fact table.

    Returns:
        DataFrame with the table's Parquet contents.

    Raises:
        FileNotFoundError: Propagated from ``resolve_path`` when neither
            location exists.
    """
    search_order = (
        f"{PARQUET_SOURCE}/facts/{table_name}",
        f"{PARQUET_SOURCE}/{table_name}",
    )
    location = resolve_path(*search_order)
    return read_parquet_recursive(location)


def normalize_columns(df):
    """Return `df` with every column name converted to lower case.

    Column order is preserved; only the names change.
    """
    lowered_names = map(str.lower, df.columns)
    return df.toDF(*lowered_names)


def col_with_fallback(df, candidates, alias, required=True):
    """Build a column expression named `alias` from the first matching candidate.

    Candidate names are compared with the DataFrame's column names
    case-insensitively and tried in the order given.

    Args:
        df: DataFrame whose ``columns`` are searched.
        candidates: Accepted source column names, highest priority first.
        alias: Name given to the resulting column.
        required: Whether a missing column is an error.

    Returns:
        The matched column aliased to `alias`, or - when nothing matches and
        `required` is false - a null literal cast to string with that alias.

    Raises:
        ValueError: If nothing matches and `required` is true.
    """
    # Map lower-cased name -> actual name so the lookup ignores case.
    by_lower = {}
    for actual in df.columns:
        by_lower[actual.lower()] = actual

    source = next(
        (by_lower[c.lower()] for c in candidates if c.lower() in by_lower),
        None,
    )
    if source is not None:
        return F.col(source).alias(alias)
    if not required:
        # Optional column absent from the source: emit a typed null placeholder.
        return F.lit(None).cast("string").alias(alias)
    raise ValueError(f"Missing column for '{alias}', tried {candidates}")


def select_columns(df, mapping):
    """Project `df` onto the columns described by `mapping`.

    Column names are lower-cased first. The output columns appear in the
    iteration order of `mapping`.

    Args:
        df: Source DataFrame.
        mapping: ``{alias: spec}`` where `spec` is either a list of candidate
            source names (the column is then required) or a dict with the
            keys ``"candidates"`` (default ``[]``) and ``"required"``
            (default ``True``).

    Returns:
        DataFrame containing exactly the mapped columns.

    Raises:
        ValueError: Propagated from ``col_with_fallback`` when a required
            column has no matching candidate.
    """

    def _unpack(spec):
        # Normalize both spec forms to a (candidates, required) pair.
        if isinstance(spec, dict):
            return spec.get("candidates", []), spec.get("required", True)
        return spec, True

    lowered = normalize_columns(df)
    projected = []
    for alias, spec in mapping.items():
        names, must_exist = _unpack(spec)
        projected.append(
            col_with_fallback(lowered, names, alias, required=must_exist)
        )
    return lowered.select(*projected)


def ensure_database(name):
    """Create the database when absent, then confirm it can be described.

    Args:
        name: Database name; it is interpolated into the SQL text as-is.

    Raises:
        RuntimeError: If ``DESCRIBE DATABASE`` fails with an
            AnalysisException.
    """
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {name}")
    # A successful DESCRIBE is used as the access check.
    try:
        spark.sql(f"DESCRIBE DATABASE {name}")
    except AnalysisException as err:
        raise RuntimeError(f"Cannot access database '{name}': {err}")
    else:
        print(f"Database '{name}' is ready.")


def save_table(df, table_name, mode="overwrite"):
    """Write `df` as a Delta table named ``<DB_NAME>.<table_name>``.

    Args:
        df: DataFrame to persist.
        table_name: Unqualified target table name.
        mode: Save mode passed to the writer; defaults to ``"overwrite"``.
    """
    target = f"{DB_NAME}.{table_name}"
    writer = df.write.format("delta").mode(mode)
    writer.saveAsTable(target)
    print(f"  Written to {target}")


def load_table(table_name, read_fn, columns_mapping=None):
    """Read one source table, optionally project it, and save it to Silver.

    This is a best-effort loader: most failures are reported and turned into
    a ``False`` return so the remaining tables still get processed.

    Args:
        table_name: Name of the table to load; also the target table name.
        read_fn: Callable taking `table_name` and returning the source
            DataFrame (``read_dim`` or ``read_fact``).
        columns_mapping: Optional mapping handed to ``select_columns``; when
            falsy the source is saved unchanged.

    Returns:
        True when the table was written. False when it was skipped because
        the source was not found, Spark raised an AnalysisException, or any
        other unexpected exception occurred.

    Raises:
        PermissionError: Re-raised untouched (infrastructure problem).
        ValueError: Re-raised untouched (schema / mapping problem).
    """
    try:
        frame = read_fn(table_name)
        if columns_mapping:
            frame = select_columns(frame, columns_mapping)
        save_table(frame, table_name)
    except FileNotFoundError:
        print(f"  Skipping {table_name}: source file not found")
        return False
    except (PermissionError, ValueError):
        # Neither is recoverable here: access issues need an operator and
        # mapping errors need a code/config fix, so let them stop the run.
        raise
    except AnalysisException as e:
        # PySpark analysis errors (e.g., schema mismatch)
        warnings.warn(f"Skipping {table_name}: PySpark analysis error - {e}")
        return False
    except Exception as e:
        # Anything else: report the exception type to aid debugging, then skip.
        print(f"  Skipping {table_name}: {type(e).__name__}: {e}")
        return False
    else:
        return True


# The target database has to exist before any table is written.
ensure_database(DB_NAME)

# =============================================================================
# DIMENSION TABLES (Master Data)
# =============================================================================

print("Loading dimension tables...")

# Dimensions are copied as-is (no column mapping), in this order.
for dim_table in (
    "dim_geographies",
    "dim_stores",
    "dim_distribution_centers",
    "dim_trucks",
    "dim_customers",
    "dim_products",
):
    load_table(dim_table, read_dim)

print("Dimension tables loaded.\n")

# =============================================================================
# FACT TABLES
# =============================================================================

print("Loading fact tables...")

# Every call below reads the fact table through read_fact, projects it through
# the given mapping, and writes it to the Silver table of the same name.
#
# Mapping format: output column -> accepted source column names, tried in
# order and matched case-insensitively.
#   - A plain list marks the column as required; if no candidate is present a
#     ValueError is raised, which load_table re-raises.
#   - {"candidates": [...], "required": False} marks the column as optional;
#     when absent it is filled with a null string column.
# The key order of each mapping is the column order of the written table, so
# do not reorder entries casually.

# 1) Receipts
load_table("fact_receipts", read_fact, {
    "event_ts": ["event_ts"],
    "receipt_id_ext": ["receipt_id_ext", "receipt_id"],
    "payment_method": ["payment_method"],
    "discount_amount": ["discount_amount"],
    "tax_cents": ["tax_cents"],
    "subtotal": ["subtotal", "subtotal_amount"],
    "total": ["total", "total_amount"],
    "total_cents": ["total_cents"],
    "receipt_type": ["receipt_type"],
    "subtotal_cents": ["subtotal_cents"],
    "tax": ["tax", "tax_amount"],
    "customer_id": {"candidates": ["customer_id"], "required": False},
    "store_id": ["store_id"],
    "return_for_receipt_id_ext": ["return_for_receipt_id_ext"],
})

# 2) Receipt Lines
load_table("fact_receipt_lines", read_fact, {
    "unit_cents": ["unit_cents"],
    "unit_price": ["unit_price"],
    "event_ts": ["event_ts"],
    "product_id": ["product_id"],
    "quantity": ["quantity"],
    "ext_price": ["ext_price"],
    "line_num": ["line_num"],
    "promo_code": ["promo_code"],
    "ext_cents": ["ext_cents"],
    "receipt_id_ext": ["receipt_id_ext", "receipt_id"],
})

# 3) Store Inventory Transactions
load_table("fact_store_inventory_txn", read_fact, {
    "event_ts": ["event_ts"],
    "product_id": ["product_id"],
    "txn_type": ["txn_type"],
    "quantity": ["quantity"],
    "source": ["source"],
    "store_id": ["store_id"],
    "balance": ["balance"],
})

# 4) DC Inventory Transactions
load_table("fact_dc_inventory_txn", read_fact, {
    "event_ts": ["event_ts"],
    "product_id": ["product_id"],
    "txn_type": ["txn_type"],
    "quantity": ["quantity"],
    "dc_id": ["dc_id"],
    "balance": ["balance"],
    "source": ["source"],
})

# 5) Foot Traffic
load_table("fact_foot_traffic", read_fact, {
    "count": ["count"],
    "zone": ["zone"],
    "event_ts": ["event_ts"],
    "sensor_id": ["sensor_id"],
    "dwell_seconds": ["dwell_seconds"],
    "store_id": ["store_id"],
})

# 6) BLE Pings
load_table("fact_ble_pings", read_fact, {
    "zone": ["zone"],
    "event_ts": ["event_ts"],
    "rssi": ["rssi"],
    "customer_ble_id": ["customer_ble_id"],
    "customer_id": {"candidates": ["customer_id"], "required": False},
    "store_id": ["store_id"],
    "beacon_id": ["beacon_id"],
})

# 7) Marketing
load_table("fact_marketing", read_fact, {
    "event_ts": ["event_ts"],
    "campaign_id": ["campaign_id"],
    "device": ["device"],
    "creative_id": ["creative_id"],
    "customer_ad_id": ["customer_ad_id"],
    "impression_id_ext": ["impression_id_ext", "impression_id"],
    "cost": ["cost"],
    "cost_cents": {"candidates": ["cost_cents"], "required": False},
    "customer_id": {"candidates": ["customer_id"], "required": False},
    "channel": ["channel"],
})

# 8) Online Order Headers
load_table("fact_online_order_headers", read_fact, {
    "completed_ts": ["completed_ts"],
    "event_ts": ["event_ts"],
    "order_id_ext": ["order_id_ext", "order_id"],
    "tax_cents": ["tax_cents"],
    "subtotal": ["subtotal", "subtotal_amount"],
    "total": ["total", "total_amount"],
    "total_cents": ["total_cents"],
    "subtotal_cents": ["subtotal_cents"],
    "tax": ["tax", "tax_amount"],
    "customer_id": {"candidates": ["customer_id"], "required": False},
    "payment_method": ["payment_method"],
})

# 9) Online Order Lines
# NOTE(review): this table keys on "order_id" while the headers table above
# exposes "order_id_ext" - confirm the two are meant to differ.
load_table("fact_online_order_lines", read_fact, {
    "unit_cents": ["unit_cents"],
    "shipped_ts": ["shipped_ts"],
    "unit_price": ["unit_price"],
    "fulfillment_status": ["fulfillment_status"],
    "order_id": ["order_id"],
    "delivered_ts": ["delivered_ts"],
    "product_id": ["product_id"],
    "quantity": ["quantity"],
    "ext_price": ["ext_price"],
    "node_type": ["node_type"],
    "fulfillment_mode": ["fulfillment_mode"],
    "picked_ts": ["picked_ts"],
    "node_id": ["node_id"],
    "line_num": ["line_num"],
    "promo_code": ["promo_code"],
    "ext_cents": ["ext_cents"],
})

# 10) Truck Moves
load_table("fact_truck_moves", read_fact, {
    "event_ts": ["event_ts"],
    "truck_id": ["truck_id"],
    "dc_id": ["dc_id"],
    "store_id": ["store_id"],
    "shipment_id": ["shipment_id"],
    "status": ["status"],
    "eta": ["eta"],
    "etd": ["etd"],
})

# load_table's True/False result is not checked above, so these messages are
# printed even when individual tables were skipped.
print("\nFact tables loaded.")
print("Parquet -> Silver load complete!")