In [None]:
# dlt_pipelines/ecom_dlt_pipeline.py
# This script is intended to be run as a Delta Live Tables pipeline in Databricks.
# It defines the Bronze (cleaned) and Silver (transformed) layers.

import dlt
from pyspark.sql.functions import col, to_date, lit, coalesce, sum, count, current_timestamp, sha2, concat_ws, expr
from pyspark.sql.types import IntegerType, DoubleType, StringType, DateType, BooleanType

# Root GCS location of the raw CSV data that the bronze tables read through
# Auto Loader (the "cloudFiles" streaming source).
# The value is an absolute gs:// URI. Sub-folders such as "orders/" and the
# "_schemas/<table>" schema locations are appended to it directly, so the
# trailing slash must be kept.
RAW_DATA_GCS_ROOT = "gs://batch-processing-de_raw_data/" # Update this to point at your own GCS bucket

# --- Bronze Layer: Ingest and Cleanse Raw Data ---
# Top-level statement: printed when this module is evaluated, before any of the
# table functions below is called.
print("Starting to create the Bronze layer...")

@dlt.table(
    table_properties={"quality": "bronze"},
    comment="Raw orders data, incrementally loaded from GCS using Auto Loader. Bronze layer.",
)
def bronze_orders():
    """Stream raw order CSV files from GCS into typed, non-null bronze rows.

    Returns:
        A streaming DataFrame with the columns order_id, customer_id,
        order_date, total_amount, order_status and
        bronze_ingestion_timestamp. Rows where any of the first four columns
        is NULL after casting are dropped.
    """
    source_path = f"{RAW_DATA_GCS_ROOT}orders/"
    # Location where Auto Loader stores the schema it tracks for this source.
    schema_path = f"{RAW_DATA_GCS_ROOT}_schemas/bronze_orders"

    raw_orders = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.schemaLocation", schema_path)
        .option("header", "true")
        .load(source_path)
    )

    typed_orders = raw_orders.select(
        col("order_id").cast(IntegerType()).alias("order_id"),
        col("customer_id").cast(IntegerType()).alias("customer_id"),
        # NOTE(review): values that do not match yyyy-MM-dd are presumably
        # turned into NULL here and removed by the filter below — confirm
        # against the cluster's ANSI-mode setting.
        to_date(col("order_date"), "yyyy-MM-dd").alias("order_date"),
        col("total_amount").cast(DoubleType()).alias("total_amount"),
        col("status").cast(StringType()).alias("order_status"),
        current_timestamp().alias("bronze_ingestion_timestamp"),
    )

    # Build "c1 IS NOT NULL AND c2 IS NOT NULL ..." over the mandatory columns.
    mandatory = ("order_id", "customer_id", "order_date", "total_amount")
    all_present = col(mandatory[0]).isNotNull()
    for name in mandatory[1:]:
        all_present = all_present & col(name).isNotNull()

    complete_orders = typed_orders.filter(all_present)
    return complete_orders.withWatermark("bronze_ingestion_timestamp", "10 minutes")

# Debug trace: a top-level statement that runs right after bronze_orders is defined.
# NOTE(review): presumably DLT has only registered the table definition at this
# point rather than materialized it, so the wording may be misleading — confirm.
print("created---table----")

@dlt.table(
    table_properties={"quality": "bronze"},
    comment="Raw order items data, incrementally loaded from GCS using Auto Loader. Bronze layer.",
)
def bronze_order_items():
    """Stream raw order-item CSV files from GCS into typed bronze rows.

    Returns:
        A streaming DataFrame with the columns order_item_id, order_id,
        product_id, quantity, unit_price and bronze_ingestion_timestamp.
        Rows whose order_item_id, order_id or product_id is NULL after
        casting are dropped; quantity and unit_price are not checked.
    """
    # (column name, target type) for every column that is cast in place.
    column_types = (
        ("order_item_id", IntegerType()),
        ("order_id", IntegerType()),
        ("product_id", IntegerType()),
        ("quantity", IntegerType()),
        ("unit_price", DoubleType()),
    )
    projection = [col(name).cast(dtype).alias(name) for name, dtype in column_types]
    projection.append(current_timestamp().alias("bronze_ingestion_timestamp"))

    raw_items = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.schemaLocation", f"{RAW_DATA_GCS_ROOT}_schemas/bronze_order_items")
        .option("header", "true")
        .load(f"{RAW_DATA_GCS_ROOT}order_items/")
    )

    has_item_id = col("order_item_id").isNotNull()
    has_order_id = col("order_id").isNotNull()
    has_product_id = col("product_id").isNotNull()

    keyed_items = raw_items.select(*projection).filter(has_item_id & has_order_id & has_product_id)
    return keyed_items.withWatermark("bronze_ingestion_timestamp", "10 minutes")

@dlt.table(
    comment="Raw products data, incrementally loaded from GCS using Auto Loader. Bronze layer.",
    table_properties={"quality": "bronze"}
)
def bronze_products():
    """Stream raw product CSV files from GCS into typed, non-null bronze rows.

    Returns:
        A streaming DataFrame with the columns product_id, product_name,
        product_category, product_price and bronze_ingestion_timestamp.
        Rows whose product_id, product_name or product_price is NULL after
        casting are dropped.
    """
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.schemaLocation", f"{RAW_DATA_GCS_ROOT}_schemas/bronze_products")
        .option("header", "true")
        .load(f"{RAW_DATA_GCS_ROOT}products/")
        .select(
            col("product_id").cast(IntegerType()).alias("product_id"),
            col("product_name").cast(StringType()).alias("product_name"),
            col("category").cast(StringType()).alias("product_category"),
            col("price").cast(DoubleType()).alias("product_price"),
            current_timestamp().alias("bronze_ingestion_timestamp")
        )
        # The projection above renames "price" to "product_price", so the null
        # check must use the new name. Checking the cast column also rejects
        # prices that were present in the file but could not be cast to a
        # double, in line with how bronze_orders validates total_amount.
        .filter(
            col("product_id").isNotNull()
            & col("product_name").isNotNull()
            & col("product_price").isNotNull()
        )
    )

@dlt.table(
    table_properties={"quality": "bronze"},
    comment="Raw customers data, incrementally loaded from GCS using Auto Loader. Bronze layer.",
)
def bronze_customers():
    """Stream raw customer CSV files from GCS into typed bronze rows.

    Returns:
        A streaming DataFrame with the columns customer_id, first_name,
        last_name, email, registration_date, country and
        bronze_ingestion_timestamp. Rows whose customer_id or email is NULL
        are dropped.
    """
    customers_path = f"{RAW_DATA_GCS_ROOT}customers/"
    schema_path = f"{RAW_DATA_GCS_ROOT}_schemas/bronze_customers"

    raw_customers = (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "csv")
        .option("cloudFiles.schemaLocation", schema_path)
        .option("header", "true")
        .load(customers_path)
    )

    customer_id = col("customer_id").cast(IntegerType()).alias("customer_id")
    # Plain text attributes are cast to string under their original names.
    text_columns = [
        col(name).cast(StringType()).alias(name)
        for name in ("first_name", "last_name", "email")
    ]
    registration_date = to_date(col("registration_date"), "yyyy-MM-dd").alias("registration_date")
    country = col("country").cast(StringType()).alias("country")
    ingested_at = current_timestamp().alias("bronze_ingestion_timestamp")

    typed_customers = raw_customers.select(
        customer_id, *text_columns, registration_date, country, ingested_at
    )

    has_id = col("customer_id").isNotNull()
    has_email = col("email").isNotNull()
    return typed_customers.filter(has_id & has_email)


# --- Silver Layer: Transform and Enrich Data ---

@dlt.table(
    comment="Silver sales fact table, joined with product and customer details. Partitioned by order_date.",
    table_properties={"quality": "silver"},
    partition_cols=["order_date"] # Optimize for date-based queries
)
@dlt.expect("order_total_accuracy", "abs(original_total_amount - calculated_order_total) < 0.01")
@dlt.expect_or_drop("valid_sales_record", "order_id IS NOT NULL AND customer_id IS NOT NULL AND order_date IS NOT NULL")
def silver_sales():
    """Build one enriched sales row per order from the bronze tables.

    Order items are joined to their order, restricted to items whose product
    exists in bronze_products, summed per order, and then joined to the
    customer record.

    Returns:
        A DataFrame with the columns order_id, customer_id, first_name,
        last_name, email, country, customer_registration_date, order_date,
        order_status, original_total_amount, calculated_order_total,
        total_products_in_order, total_quantity_in_order and
        silver_processed_timestamp.
    """
    # All four inputs are read with dlt.read, i.e. as batch reads of the bronze tables.
    orders = dlt.read("bronze_orders")
    order_items = dlt.read("bronze_order_items")
    products = dlt.read("bronze_products")
    customers = dlt.read("bronze_customers")

    # Join order items with their order, keeping only items whose product is known.
    sales_details = (
        order_items.alias("oi")
        .join(orders.alias("o"), col("oi.order_id") == col("o.order_id"), "inner")
        # No product column is used downstream, so the product table acts
        # purely as an existence filter. A left_semi join keeps each order
        # line at most once even if bronze_products holds several rows for
        # the same product_id; an inner join would repeat the line and
        # inflate the totals below.
        .join(products.alias("p"), col("oi.product_id") == col("p.product_id"), "left_semi")
        .withColumn("line_item_total", col("oi.quantity") * col("oi.unit_price"))
    )

    # Aggregate the order lines per order. `sum` and `count` are the
    # pyspark.sql.functions aggregates imported at the top of the file.
    orders_enriched = sales_details.groupBy(
        "o.order_id", "o.customer_id", "o.order_date", "o.order_status", "o.total_amount"
    ).agg(
        sum("line_item_total").alias("calculated_order_total"),
        # Counts order lines with a non-null product_id, not distinct products.
        count("oi.product_id").alias("total_products_in_order"),
        sum("oi.quantity").alias("total_quantity_in_order")
    )

    # Enrich each order with its customer attributes.
    # NOTE(review): if bronze_customers can contain several rows per
    # customer_id, this inner join will repeat the order — confirm whether
    # the customer data needs de-duplication first.
    final_silver_sales = (
        orders_enriched.alias("ose")
        .join(customers.alias("c"), col("ose.customer_id") == col("c.customer_id"), "inner")
        .select(
            col("ose.order_id"),
            col("ose.customer_id"),
            col("c.first_name"),
            col("c.last_name"),
            col("c.email"),
            col("c.country"),
            col("c.registration_date").alias("customer_registration_date"),
            col("ose.order_date"),
            col("ose.order_status"),
            col("ose.total_amount").alias("original_total_amount"),
            col("ose.calculated_order_total"),
            col("ose.total_products_in_order"),
            col("ose.total_quantity_in_order"),
            current_timestamp().alias("silver_processed_timestamp")
            # For CDC, a sha2 hash over the non-key columns could be added
            # here as a record_hash column to detect changes.
        )
    )
    return final_silver_sales

@dlt.table(
    table_properties={"quality": "silver"},
    comment="Silver products dimension table. SCD Type 1 (latest state) for simplicity.",
)
@dlt.expect_or_drop("valid_product_record", "product_id IS NOT NULL AND product_name IS NOT NULL")
def silver_products():
    """Project the bronze products table into the silver products dimension.

    Returns:
        A DataFrame with the columns product_id, product_name,
        product_category, product_price and silver_processed_timestamp.
    """
    # Batch read of the bronze table.
    # NOTE(review): the rows are passed through as-is; nothing here reduces
    # multiple rows per product_id to a single "latest" row — confirm whether
    # the SCD Type 1 wording in the table comment is intended.
    bronze_products_df = dlt.read("bronze_products")

    carried_over = [
        col(name)
        for name in ("product_id", "product_name", "product_category", "product_price")
    ]
    processed_at = current_timestamp().alias("silver_processed_timestamp")

    return bronze_products_df.select(*carried_over, processed_at)

@dlt.table(
    table_properties={"quality": "silver"},
    comment="Silver customers dimension table. SCD Type 1 (latest state) for simplicity.",
)
@dlt.expect_or_drop("valid_customer_record", "customer_id IS NOT NULL AND customer_email IS NOT NULL")
def silver_customers():
    """Project the bronze customers table into the silver customers dimension.

    Every attribute except customer_id is renamed with a ``customer_`` prefix.

    Returns:
        A DataFrame with the columns customer_id, customer_first_name,
        customer_last_name, customer_email, customer_registration_date,
        customer_country and silver_processed_timestamp.
    """
    # Bronze column name -> silver column name, in output order.
    renamed = {
        "first_name": "customer_first_name",
        "last_name": "customer_last_name",
        "email": "customer_email",
        "registration_date": "customer_registration_date",
        "country": "customer_country",
    }

    projection = [col("customer_id")]
    projection.extend(col(source).alias(target) for source, target in renamed.items())
    projection.append(current_timestamp().alias("silver_processed_timestamp"))

    # Batch read of the bronze table.
    bronze_customers_df = dlt.read("bronze_customers")
    return bronze_customers_df.select(*projection)
