In [0]:
%python
# This notebook is designed to be run as a Delta Live Tables (DLT) pipeline.
# It implements a Medallion Architecture (Bronze, Silver, Gold) for customer analytics.
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
%python
from pyspark.sql.types import StructType, StructField, StringType
# Root of the `rawanalytics` container in the ADLS storage account (abfss URI).
# NOTE: the value ends with "/", so strip it (or omit the leading slash) when
# appending relative paths, otherwise the resulting URI contains "//".
raw_data_path = "abfss://rawanalytics@adlsexternalfororders.dfs.core.windows.net/"
# Three-part (catalog.schema.table) name of the Bronze customer master table.
# The previous value had a stray space after the catalog name, which made it
# an invalid identifier.
bronze_table_name = "analyticscatalog.analytics_bronze_schema.bronze_customer_master"

# Explicit schema for the customer master file: three nullable string columns.
customer_master_schema = (
    StructType()
    .add("customer_id", StringType(), True)
    .add("customer_name", StringType(), True)
    .add("customer_email", StringType(), True)
)

@dlt.table(
  comment = "Raw customer master data from CSV, ingested into Bronze.",
  table_properties ={"quality": "bronze"}
)
def bronze_customer_master():
  """Bronze table: incrementally ingest the customer master CSV with Auto Loader.

  Reads `customer_master.csv` under `raw_data_path` as a stream using the
  explicit `customer_master_schema`, and appends two audit columns.

  Returns:
    A streaming DataFrame with columns `customer_id`, `customer_name`,
    `customer_email` (all cast to string), `processing_timestamp`
    (timestamp at processing time) and `source_file` (path of the file the
    row was read from).
  """
  # raw_data_path is defined with a trailing "/"; strip it so the joins below
  # do not produce "//" inside the URI.
  base_path = raw_data_path.rstrip("/")

  # NOTE(review): DLT manages Auto Loader schema/checkpoint directories itself;
  # this manually configured location is kept for backward compatibility --
  # confirm whether it is still needed.
  schema_location = f"{base_path}/rawanalytics/checkpoints/customer_master_bronze"

  # NOTE(review): no `header` option is set, so the CSV reader treats the first
  # line of the file as data -- confirm the source file has no header row.
  raw_customers = (
    spark.readStream.format("cloudFiles")          # Auto Loader for incremental data processing
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", schema_location)
    .schema(customer_master_schema)                # Provide the schema explicitly
    .load(f"{base_path}/customer_master.csv")
  )

  return raw_customers.select(
    col("customer_id").cast(StringType()).alias("customer_id"),
    col("customer_name").cast(StringType()).alias("customer_name"),
    col("customer_email").cast(StringType()).alias("customer_email"),
    current_timestamp().alias("processing_timestamp"),  # Add ingestion timestamp
    # `input_file_name()` is deprecated on recent Databricks runtimes and is not
    # available under Unity Catalog shared access; the file metadata column is
    # the supported replacement.
    col("_metadata.file_path").alias("source_file")     # Track source file
  )