# Parameterized DLT Pipeline

This pipeline reads its configuration parameters from the Databricks Asset Bundle (DAB) that deploys it, so a single pipeline definition can be reused instead of being duplicated.

In [None]:
import dlt
from pyspark.sql import functions as F

## Get Configuration Parameters

These parameters are read from the pipeline configuration defined in the Databricks Asset Bundle (DAB). Each one falls back to a default value when it is not set.

In [None]:
# Resolve the pipeline settings from the Spark conf. Every lookup supplies a
# fallback, so the notebook still runs when a key is absent from the
# pipeline configuration.
# NOTE(review): in the cells visible here only `pipeline_config` is consumed
# by the table definitions; `catalog`, `schema`, `source_table` and
# `target_table` are only echoed below -- confirm whether they are meant to
# drive the table names.
pipeline_config = spark.conf.get("pipeline_config", "dev")
catalog = spark.conf.get("catalog", "main")
schema = spark.conf.get("schema", "default")
source_table = spark.conf.get("source_table", "raw_data")
target_table = spark.conf.get("target_table", "processed_data")

# Echo the resolved settings, one per line, so they appear in the run output.
print(
    f"Pipeline Configuration: {pipeline_config}",
    f"Catalog: {catalog}",
    f"Schema: {schema}",
    f"Source Table: {source_table}",
    f"Target Table: {target_table}",
    sep="\n",
)

## Bronze Layer - Raw Data Ingestion

This table is the raw-data ingestion layer. In this example it does not read from a real source; it generates 100 rows of sample data instead.

In [None]:
@dlt.table(
    name="raw_data",
    comment="Raw data ingestion layer",
    table_properties={"quality": "bronze", "pipeline_config": pipeline_config},
)
def bronze_raw_data():
    """Build the bronze ``raw_data`` table from generated sample rows.

    No external source is read. ``spark.range(0, 100)`` supplies the ``id``
    column, and each row is then given a ``name`` ("user_" followed by the
    id), a random ``value`` scaled by 100, the current ``timestamp`` and the
    active ``pipeline_config`` label in ``config``.

    Returns:
        The DataFrame that DLT materializes as ``raw_data``.
    """
    sample_ids = spark.range(0, 100)

    # Name each derived column expression before attaching it.
    user_name = F.concat(F.lit("user_"), F.col("id"))
    random_value = F.rand() * 100
    created_at = F.current_timestamp()
    config_label = F.lit(pipeline_config)

    named = sample_ids.withColumn("name", user_name)
    valued = named.withColumn("value", random_value)
    stamped = valued.withColumn("timestamp", created_at)
    return stamped.withColumn("config", config_label)

## Silver Layer - Data Cleansing and Transformation

This table applies data quality checks and transformations.

In [None]:
@dlt.table(
    name="processed_data",
    comment="Processed data with quality checks",
    table_properties={"quality": "silver", "pipeline_config": pipeline_config},
)
@dlt.expect_or_drop("valid_id", "id IS NOT NULL")
@dlt.expect_or_drop("valid_value", "value >= 0")
def silver_processed_data():
    """Build the silver ``processed_data`` table from ``raw_data``.

    Three columns are added to the bronze rows:

    * ``value_rounded`` -- ``value`` rounded to 2 decimal places.
    * ``value_category`` -- "low" when ``value`` < 33, "medium" when
      ``value`` < 66, otherwise "high" (bucketed on the unrounded value).
    * ``processed_timestamp`` -- the current timestamp.

    The ``valid_id`` and ``valid_value`` expectations declared on the table
    use ``expect_or_drop``, so rows that fail them are dropped.

    Returns:
        The DataFrame that DLT materializes as ``processed_data``.
    """
    raw_rows = dlt.read("raw_data")
    value = F.col("value")

    # The first matching branch wins, so "medium" covers 33 <= value < 66.
    category = (
        F.when(value < 33, "low")
        .when(value < 66, "medium")
        .otherwise("high")
    )

    rounded = raw_rows.withColumn("value_rounded", F.round(value, 2))
    categorized = rounded.withColumn("value_category", category)
    return categorized.withColumn("processed_timestamp", F.current_timestamp())

## Gold Layer - Aggregated Data

This table creates business-ready aggregated data.

In [None]:
@dlt.table(
    name="aggregated_data",
    comment="Aggregated business-ready data",
    table_properties={"quality": "gold", "pipeline_config": pipeline_config},
)
def gold_aggregated_data():
    """Build the gold ``aggregated_data`` table from ``processed_data``.

    Rows are grouped by ``value_category`` and ``config``. For each group the
    record count is computed, along with the average, minimum, maximum and
    standard deviation of ``value_rounded``. An ``aggregation_timestamp``
    column holding the current timestamp is appended to the result.

    Returns:
        The DataFrame that DLT materializes as ``aggregated_data``.
    """
    metric_source = "value_rounded"

    # Order here fixes the column order of the aggregated output.
    metrics = [
        F.count("*").alias("record_count"),
        F.avg(metric_source).alias("avg_value"),
        F.min(metric_source).alias("min_value"),
        F.max(metric_source).alias("max_value"),
        F.stddev(metric_source).alias("stddev_value"),
    ]

    grouped = dlt.read("processed_data").groupBy("value_category", "config")
    summary = grouped.agg(*metrics)
    return summary.withColumn("aggregation_timestamp", F.current_timestamp())