In [None]:
# Fabric Notebook (PySpark) - Load exported Parquet files directly to Silver Delta
# Use this for historical batch data from datagen Parquet exports
# (Bronze layer is for streaming JSON events only)

from pyspark.sql import functions as F
from pyspark.sql.types import *

# Configure source path - update this to your Parquet export location
# If using Azure Storage: "abfss://container@account.dfs.core.windows.net/path"
# If using Lakehouse Files: "Files/adls-parquet-copy"
# Tables are looked up under <PARQUET_SOURCE>/master/<table> (dimensions) and
# <PARQUET_SOURCE>/facts/<table> (facts), falling back to <PARQUET_SOURCE>/<table>.
PARQUET_SOURCE = "Files"
# Database that is created if missing and that every output table is saved into.
DB_NAME = "ag"


def get_fs():
    """Return the filesystem utility object exposed by the notebook runtime.

    ``notebookutils.mssparkutils.fs`` is preferred. If importing it or reading
    its ``fs`` attribute raises any ``Exception``, the ``fs`` attribute of the
    ambient ``dbutils`` global is returned instead (that global must be
    supplied by the runtime; it is not defined in this file).
    """
    try:
        from notebookutils import mssparkutils as runtime_utils

        filesystem = runtime_utils.fs
    except Exception:
        # Any failure above routes to the dbutils global.
        filesystem = dbutils.fs
    return filesystem


# Filesystem helper resolved once when the cell runs; path_exists() lists paths through it.
FS = get_fs()


def path_exists(path):
    """Report whether ``path`` can be listed through the module-level ``FS``.

    Returns True when ``FS.ls(path)`` completes and False when it raises any
    ``Exception`` -- so failures other than a missing path also read as
    "does not exist".
    """
    try:
        FS.ls(path)
    except Exception:
        return False
    else:
        return True


def resolve_path(*candidates):
    """Return the first candidate path for which ``path_exists`` is true.

    Candidates are probed lazily, in the order given.

    Raises:
        FileNotFoundError: if no candidate exists (including when no
            candidates are passed at all).
    """
    # A private sentinel keeps "nothing matched" distinct from any real value.
    nothing_matched = object()
    chosen = next(
        (location for location in candidates if path_exists(location)),
        nothing_matched,
    )
    if chosen is nothing_matched:
        raise FileNotFoundError(f"No matching path for: {candidates}")
    return chosen


def read_parquet_recursive(path):
    """Read Parquet data at ``path`` with ``recursiveFileLookup`` set to "true".

    Uses the ambient ``spark`` session and returns the result of
    ``DataFrameReader.parquet(path)``.
    """
    recursive_reader = spark.read.option("recursiveFileLookup", "true")
    return recursive_reader.parquet(path)


def read_dim(table_name):
    """Read a dimension table from the Parquet export.

    Looks for ``<PARQUET_SOURCE>/master/<table_name>`` first and then
    ``<PARQUET_SOURCE>/<table_name>``; the first location that exists is read
    recursively. ``resolve_path`` raises ``FileNotFoundError`` when neither
    location exists.
    """
    search_order = (
        f"{PARQUET_SOURCE}/master/{table_name}",
        f"{PARQUET_SOURCE}/{table_name}",
    )
    location = resolve_path(*search_order)
    return read_parquet_recursive(location)


def read_fact(table_name):
    """Read a fact table from the Parquet export.

    Looks for ``<PARQUET_SOURCE>/facts/<table_name>`` first and then
    ``<PARQUET_SOURCE>/<table_name>``; the first location that exists is read
    recursively. ``resolve_path`` raises ``FileNotFoundError`` when neither
    location exists.
    """
    search_order = (
        f"{PARQUET_SOURCE}/facts/{table_name}",
        f"{PARQUET_SOURCE}/{table_name}",
    )
    location = resolve_path(*search_order)
    return read_parquet_recursive(location)


def normalize_columns(df):
    """Return ``df`` renamed so that every column name is lower-case.

    Column order is preserved; the renaming is done with a single
    ``df.toDF(...)`` call.
    """
    lowered_names = [name.lower() for name in df.columns]
    return df.toDF(*lowered_names)


def col_with_fallback(df, candidates, alias, required=True):
    """Build a column expression for ``alias`` from the first matching candidate.

    Candidate names are compared to ``df.columns`` case-insensitively and tried
    in the order given; the first hit is returned as ``F.col(<actual>).alias(alias)``.

    Args:
        df: object exposing a ``columns`` list of names.
        candidates: column names to try, in priority order.
        alias: output column name.
        required: when False, a missing column yields a NULL literal cast to
            string instead of an error.

    Raises:
        ValueError: if no candidate is present and ``required`` is true.
    """
    # Map lower-cased name -> name as it actually appears in the DataFrame.
    actual_by_lower = {}
    for actual in df.columns:
        actual_by_lower[actual.lower()] = actual

    for candidate in candidates:
        matched = actual_by_lower.get(candidate.lower())
        if matched is not None:
            return F.col(matched).alias(alias)

    if not required:
        return F.lit(None).cast("string").alias(alias)
    raise ValueError(f"Missing column for '{alias}', tried {candidates}")


def select_columns(df, mapping):
    """Project ``df`` onto the aliases described by ``mapping``.

    Column names are lower-cased first. Each ``mapping`` entry maps an output
    alias to either a list of candidate source names (treated as required) or
    a dict with ``"candidates"`` (default ``[]``) and ``"required"`` (default
    True). Output columns appear in ``mapping`` iteration order.

    Raises:
        ValueError: propagated from ``col_with_fallback`` when a required
            column has no matching candidate.
    """
    lowered = normalize_columns(df)
    projected = []
    for alias, spec in mapping.items():
        if not isinstance(spec, dict):
            # Bare list form: the candidates themselves, always required.
            names, must_exist = spec, True
        else:
            names = spec.get("candidates", [])
            must_exist = spec.get("required", True)
        projected.append(
            col_with_fallback(lowered, names, alias, required=must_exist)
        )
    return lowered.select(*projected)


def ensure_database(name):
    """Issue ``CREATE DATABASE IF NOT EXISTS <name>`` through the ambient ``spark`` session.

    ``name`` is interpolated into the statement verbatim.
    """
    statement = f"CREATE DATABASE IF NOT EXISTS {name}"
    spark.sql(statement)


def save_table(df, table_name, mode="overwrite"):
    """Write ``df`` as a Delta table named ``<DB_NAME>.<table_name>``.

    Args:
        df: DataFrame to persist.
        table_name: unqualified table name; ``DB_NAME`` is prepended.
        mode: save mode handed to the DataFrame writer (default "overwrite").

    Prints a confirmation line once the write call returns.
    """
    target = f"{DB_NAME}.{table_name}"
    delta_writer = df.write.format("delta").mode(mode)
    delta_writer.saveAsTable(target)
    print(f"  Written to {target}")


ensure_database(DB_NAME)

# =============================================================================
# DIMENSION TABLES (Master Data)
# =============================================================================


def _load_dimension(table_name):
    """Read one dimension table and save it unchanged under the same name.

    Best-effort: any exception from reading or writing is reported as a
    "Skipping" line so the remaining tables still get their turn.

    Returns:
        The DataFrame that was read (even if the subsequent save failed), or
        None when the read itself failed.
    """
    df = None
    try:
        df = read_dim(table_name)
        save_table(df, table_name)
    except Exception as e:
        print(f"  Skipping {table_name}: {e}")
    return df


print("Loading dimension tables...")

# Each name is always bound after this point; it is None when the table could
# not be read.
df_geo = _load_dimension("dim_geographies")
df_stores = _load_dimension("dim_stores")
df_dcs = _load_dimension("dim_distribution_centers")
df_trucks = _load_dimension("dim_trucks")
df_customers = _load_dimension("dim_customers")
df_products = _load_dimension("dim_products")

print("Dimension tables loaded.\n")

# =============================================================================
# FACT TABLES
# =============================================================================


def _load_fact(table_name, mapping):
    """Read one fact table, project it through ``mapping`` and save it.

    Best-effort: any exception from reading, column selection or writing is
    reported as a "Skipping" line so the remaining tables still get their turn.

    Args:
        table_name: source folder name and target table name (``fact_...``).
        mapping: alias -> candidate spec, as accepted by ``select_columns``;
            its key order is the column order of the saved table.

    Returns:
        The projected DataFrame (even if the subsequent save failed), or None
        when reading or column selection failed.
    """
    # Skip messages use the table name without its "fact_" prefix.
    prefix = "fact_"
    label = table_name[len(prefix):] if table_name.startswith(prefix) else table_name
    df = None
    try:
        df = select_columns(read_fact(table_name), mapping)
        save_table(df, table_name)
    except Exception as e:
        print(f"  Skipping {label}: {e}")
    return df


print("Loading fact tables...")

# 1) Receipts
df_receipts = _load_fact(
    "fact_receipts",
    {
        "event_ts": ["event_ts"],
        "receipt_id_ext": ["receipt_id_ext", "receipt_id"],
        "payment_method": ["payment_method"],
        "discount_amount": ["discount_amount"],
        "tax_cents": ["tax_cents"],
        "subtotal": ["subtotal", "subtotal_amount"],
        "total": ["total", "total_amount"],
        "total_cents": ["total_cents"],
        "receipt_type": ["receipt_type"],
        "subtotal_cents": ["subtotal_cents"],
        "tax": ["tax", "tax_amount"],
        "customer_id": ["customer_id"],
        "store_id": ["store_id"],
        "return_for_receipt_id_ext": ["return_for_receipt_id_ext"],
    },
)

# 2) Receipt Lines
df_receipt_lines = _load_fact(
    "fact_receipt_lines",
    {
        "unit_cents": ["unit_cents"],
        "unit_price": ["unit_price"],
        "event_ts": ["event_ts"],
        "product_id": ["product_id"],
        "quantity": ["quantity"],
        "ext_price": ["ext_price"],
        "line_num": ["line_num"],
        "promo_code": ["promo_code"],
        "ext_cents": ["ext_cents"],
        "receipt_id_ext": ["receipt_id_ext", "receipt_id"],
    },
)

# 3) Store Inventory Transactions
df_store_inv = _load_fact(
    "fact_store_inventory_txn",
    {
        "event_ts": ["event_ts"],
        "product_id": ["product_id"],
        "txn_type": ["txn_type"],
        "quantity": ["quantity"],
        "source": ["source"],
        "store_id": ["store_id"],
        "balance": ["balance"],
    },
)

# 4) DC Inventory Transactions
df_dc_inv = _load_fact(
    "fact_dc_inventory_txn",
    {
        "event_ts": ["event_ts"],
        "product_id": ["product_id"],
        "txn_type": ["txn_type"],
        "quantity": ["quantity"],
        "dc_id": ["dc_id"],
        "balance": ["balance"],
        "source": ["source"],
    },
)

# 5) Foot Traffic
df_foot = _load_fact(
    "fact_foot_traffic",
    {
        "count": ["count"],
        "zone": ["zone"],
        "event_ts": ["event_ts"],
        "sensor_id": ["sensor_id"],
        "dwell_seconds": ["dwell_seconds"],
        "store_id": ["store_id"],
    },
)

# 6) BLE Pings
df_ble = _load_fact(
    "fact_ble_pings",
    {
        "zone": ["zone"],
        "event_ts": ["event_ts"],
        "rssi": ["rssi"],
        "customer_ble_id": ["customer_ble_id"],
        "customer_id": {"candidates": ["customer_id"], "required": False},
        "store_id": ["store_id"],
        "beacon_id": ["beacon_id"],
    },
)

# 7) Marketing
df_mkt = _load_fact(
    "fact_marketing",
    {
        "event_ts": ["event_ts"],
        "campaign_id": ["campaign_id"],
        "device": ["device"],
        "creative_id": ["creative_id"],
        "customer_ad_id": ["customer_ad_id"],
        "impression_id_ext": ["impression_id_ext", "impression_id"],
        "cost": ["cost"],
        "cost_cents": {"candidates": ["cost_cents"], "required": False},
        "customer_id": ["customer_id"],
        "channel": ["channel"],
    },
)

# 8) Online Order Headers
df_oo_hdr = _load_fact(
    "fact_online_order_headers",
    {
        "completed_ts": ["completed_ts"],
        "event_ts": ["event_ts"],
        "order_id_ext": ["order_id_ext", "order_id"],
        "tax_cents": ["tax_cents"],
        "subtotal": ["subtotal", "subtotal_amount"],
        "total": ["total", "total_amount"],
        "total_cents": ["total_cents"],
        "subtotal_cents": ["subtotal_cents"],
        "tax": ["tax", "tax_amount"],
        "customer_id": ["customer_id"],
        "payment_method": ["payment_method"],
    },
)

# 9) Online Order Lines
df_oo_lines = _load_fact(
    "fact_online_order_lines",
    {
        "unit_cents": ["unit_cents"],
        "shipped_ts": ["shipped_ts"],
        "unit_price": ["unit_price"],
        "fulfillment_status": ["fulfillment_status"],
        "order_id": ["order_id"],
        "delivered_ts": ["delivered_ts"],
        "product_id": ["product_id"],
        "quantity": ["quantity"],
        "ext_price": ["ext_price"],
        "node_type": ["node_type"],
        "fulfillment_mode": ["fulfillment_mode"],
        "picked_ts": ["picked_ts"],
        "node_id": ["node_id"],
        "line_num": ["line_num"],
        "promo_code": ["promo_code"],
        "ext_cents": ["ext_cents"],
    },
)

# 10) Truck Moves
# Bound to df_truck_moves rather than df_trucks: this notebook already uses
# df_trucks for the dim_trucks DataFrame, and reusing it here would overwrite
# that reference.
df_truck_moves = _load_fact(
    "fact_truck_moves",
    {
        "event_ts": ["event_ts"],
        "truck_id": ["truck_id"],
        "dc_id": ["dc_id"],
        "store_id": ["store_id"],
        "shipment_id": ["shipment_id"],
        "status": ["status"],
        "eta": ["eta"],
        "etd": ["etd"],
    },
)

print("\nFact tables loaded.")
print("Parquet -> Silver load complete!")
