# Parameterized DLT Pipeline with External Config File

This pipeline demonstrates how to use an external YAML configuration file in a DLT (Delta Live Tables) pipeline.
The path to the configuration file is passed in as a pipeline parameter from the Databricks Asset Bundle (DAB) definition.

In [None]:
import dlt
from pyspark.sql import functions as F
import sys
import os

# Make the parent of the current working directory importable so that
# `config_loader` can be imported by the cells below.
def workspace_import_path(cwd):
    """Return the ``sys.path`` entry for the parent directory of *cwd*.

    The parent directory is rooted under ``/Workspace``. When *cwd* is already
    reported under ``/Workspace`` the prefix is not added a second time, which
    would otherwise produce a non-existent ``/Workspace/Workspace/...`` path.

    Args:
        cwd: Absolute path of the current working directory.

    Returns:
        The absolute directory path to add to ``sys.path``.
    """
    parent_dir = os.path.dirname(cwd)
    if parent_dir == '/Workspace' or parent_dir.startswith('/Workspace/'):
        return parent_dir
    return '/Workspace' + parent_dir


# os.path.abspath('') resolves to the current working directory.
_import_path = workspace_import_path(os.path.abspath(''))
# Skip the append when the entry is already present (e.g. the cell is re-run).
if _import_path not in sys.path:
    sys.path.append(_import_path)

## Load Configuration from YAML File

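The configuration file path is supplied by the pipeline definition in the Databricks Asset Bundle and read from the Spark configuration; a development default is used when it is not set.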

In [None]:
from config_loader import ConfigLoader

# Pipeline settings are read from the Spark conf; each lookup carries a
# fallback default. Every setting is echoed right after it is read.
pipeline_config = spark.conf.get("pipeline_config", "dev")
print(f"Pipeline Configuration: {pipeline_config}")

config_file_path = spark.conf.get("config_file_path", "../config/dev_config.yml")
print(f"Config File Path: {config_file_path}")

catalog = spark.conf.get("catalog", "main")
print(f"Catalog: {catalog}")

schema = spark.conf.get("schema", "default")
print(f"Schema: {schema}")

# Build the loader from the resolved path and show its contents with
# secrets masked.
config = ConfigLoader(config_file_path)
config.display_config(mask_secrets=True)

## Access Configuration Values

This cell demonstrates how to access individual values from the loaded configuration using dotted keys.

In [None]:
# Read individual settings from the loaded configuration and echo them.
# Keys are dotted strings passed to `config.get` (presumably resolved as
# nested YAML keys -- confirm in config_loader). All names bound here are
# module-level, so later cells can use them.

# Access secrets configuration
# (secret_scope / keyvault_url are only printed in this cell.)
secret_scope = config.get('secrets.databricks_secret_scope')
keyvault_url = config.get('secrets.keyvault_url')

print(f"Secret Scope: {secret_scope}")
print(f"KeyVault URL: {keyvault_url}")

# Access Kafka configuration
# kafka_host is reused below as a table property and as a literal column.
kafka_host = config.get('general.kafka.main.host')
use_oauth = config.get('general.kafka.main.use_oauth')

print(f"\nKafka Host: {kafka_host}")
print(f"Use OAuth: {use_oauth}")

# Access streaming configuration
# checkpoint_path and consumer_group_prefix are reused by the table
# definitions below (table properties and literal columns).
checkpoint_path = config.get('general.streaming.checkpoint_path')
consumer_group_prefix = config.get('general.streaming.consumer_group_prefix')

print(f"\nCheckpoint Path: {checkpoint_path}")
print(f"Consumer Group Prefix: {consumer_group_prefix}")

# Access OAuth configuration
# Only the client id and token URL are read and printed here; the client
# secret is not accessed in this cell.
client_id = config.get('general.kafka_oauth.client_id')
token_url = config.get('general.kafka_oauth.token_url')

print(f"\nOAuth Client ID: {client_id}")
print(f"OAuth Token URL: {token_url}")

## Bronze Layer - Raw Data Ingestion

This table demonstrates using configuration values in a DLT table.

In [None]:
@dlt.table(
    name="raw_data_with_config",
    comment="Raw data ingestion with configuration",
    table_properties={
        "quality": "bronze",
        "pipeline_config": pipeline_config,
        "kafka_host": kafka_host,
        "consumer_group": consumer_group_prefix,
    },
)
def bronze_raw_data_with_config():
    """Build the bronze table from generated rows tagged with config values.

    Starts from ``spark.range(0, 100)`` and adds, in order, the columns
    ``name``, ``value``, ``timestamp``, ``config``, ``kafka_host`` and
    ``consumer_group``. The last three are literals taken from the
    module-level configuration values.

    Returns:
        The resulting Spark DataFrame.
    """
    # Per-row expressions derived from the generated `id` column.
    user_name = F.concat(F.lit("user_"), F.col("id"))
    random_value = F.rand() * 100

    rows = spark.range(0, 100)
    rows = rows.withColumn("name", user_name)
    rows = rows.withColumn("value", random_value)
    rows = rows.withColumn("timestamp", F.current_timestamp())

    # Stamp every row with the configuration the pipeline ran with.
    rows = rows.withColumn("config", F.lit(pipeline_config))
    rows = rows.withColumn("kafka_host", F.lit(kafka_host))
    rows = rows.withColumn("consumer_group", F.lit(consumer_group_prefix))
    return rows

## Silver Layer - Data Processing with Configuration

This table applies transformations using configuration parameters.

In [None]:
@dlt.table(
    name="processed_data_with_config",
    comment="Processed data using configuration parameters",
    table_properties={
        "quality": "silver",
        "pipeline_config": pipeline_config,
        "checkpoint_path": checkpoint_path,
    },
)
@dlt.expect_or_drop("valid_id", "id IS NOT NULL")
@dlt.expect_or_drop("valid_value", "value >= 0")
def silver_processed_data_with_config():
    """Derive the silver table from ``raw_data_with_config``.

    Adds, in order, ``value_rounded`` (``value`` rounded to 2 decimals),
    ``value_category`` ("low" below 33, "medium" below 66, otherwise "high"),
    ``processed_timestamp`` and a literal ``checkpoint_path`` column. The
    ``valid_id`` and ``valid_value`` expectations are declared on the table
    with ``expect_or_drop``.

    Returns:
        The resulting Spark DataFrame.
    """
    value = F.col("value")

    # Bucket the value; the first matching branch wins.
    category = (
        F.when(value < 33, "low")
        .when(value < 66, "medium")
        .otherwise("high")
    )

    raw = dlt.read("raw_data_with_config")
    enriched = raw.withColumn("value_rounded", F.round(value, 2))
    enriched = enriched.withColumn("value_category", category)
    enriched = enriched.withColumn("processed_timestamp", F.current_timestamp())
    return enriched.withColumn("checkpoint_path", F.lit(checkpoint_path))

## Example: Retrieve Secrets from Key Vault

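This cell shows how to retrieve a secret from a Databricks secret scope using a key name taken from the configuration. The example is commented out; uncomment it once secrets are configured.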

In [None]:
# Example: Retrieve a secret from Databricks Secret Scope
# Uncomment and use this when you have actual secrets configured
#
# The example reads the name of the secret key from the configuration and
# passes it to `config.get_secret_from_scope` (presumably looked up in the
# scope named by 'secrets.databricks_secret_scope' -- confirm in
# config_loader). Only the key name is printed, never the secret value.

# secret_key_name = config.get('general.kafka_oauth.client_secret_vault_key')
# if secret_key_name:
#     try:
#         client_secret = config.get_secret_from_scope(secret_key_name)
#         print(f"✓ Successfully retrieved secret: {secret_key_name}")
#         # Use the secret in your pipeline
#     except Exception as e:
#         print(f"✗ Failed to retrieve secret: {e}")

# With the example disabled, this cell only emits the reminder below.
print("Note: Secret retrieval example is commented out. Uncomment when you have secrets configured.")

## Gold Layer - Aggregated Metrics

Create aggregated data with configuration metadata.

In [None]:
@dlt.table(
    name="aggregated_data_with_config",
    comment="Aggregated business metrics with configuration metadata",
    table_properties={
        "quality": "gold",
        "pipeline_config": pipeline_config,
        "kafka_host": kafka_host,
    },
)
def gold_aggregated_data_with_config():
    """Aggregate ``processed_data_with_config`` into per-category metrics.

    Groups by ``value_category``, ``config`` and ``kafka_host`` and computes
    the row count plus the average, minimum, maximum and standard deviation
    of ``value_rounded``. Each result row is then given an
    ``aggregation_timestamp`` and a literal ``environment`` column holding
    the pipeline configuration name.

    Returns:
        The resulting Spark DataFrame.
    """
    # Metrics computed for each group, in output column order.
    metrics = [
        F.count("*").alias("record_count"),
        F.avg("value_rounded").alias("avg_value"),
        F.min("value_rounded").alias("min_value"),
        F.max("value_rounded").alias("max_value"),
        F.stddev("value_rounded").alias("stddev_value"),
    ]

    processed = dlt.read("processed_data_with_config")
    grouped = processed.groupBy("value_category", "config", "kafka_host")
    summary = grouped.agg(*metrics)

    summary = summary.withColumn("aggregation_timestamp", F.current_timestamp())
    return summary.withColumn("environment", F.lit(pipeline_config))