In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Configuration Notebook
# MAGIC Central configuration for the entire pipeline. This notebook is included in all other notebooks using %run.

# COMMAND ----------

# MAGIC %md
# MAGIC ## Catalog & Schema Configuration

# COMMAND ----------

# Unity Catalog names used throughout the pipeline.
# The catalog value already carries its back-quotes.
CATALOG = "`na-dbxtraining`"
SCHEMA_BRONZE = "biju_bronze"
SCHEMA_SILVER = "biju_silver"
SCHEMA_GOLD = "biju_gold"


def get_table_path(schema, table):
    """Return the three-part name ``<CATALOG>.<schema>.<table>``."""
    return "{}.{}.{}".format(CATALOG, schema, table)


# Echo the configured names into the notebook output, one per line
print(
    f"Catalog: {CATALOG}",
    f"Bronze Schema: {SCHEMA_BRONZE}",
    f"Silver Schema: {SCHEMA_SILVER}",
    f"Gold Schema: {SCHEMA_GOLD}",
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Azure SQL Configuration

# COMMAND ----------

# Azure SQL JDBC URL, assembled from its settings (each one terminated by ';')
JDBC_URL = "".join(
    setting + ";"
    for setting in (
        "jdbc:sqlserver://sqldbdbxtraining.database.windows.net:1433",
        "database=sqldb-nadbxtraining",
        "encrypt=true",
        "trustServerCertificate=false",
        "hostNameInCertificate=*.database.windows.net",
    )
)

# Secret scope and key under which the SQL password is stored
SECRET_SCOPE = "dbx-ss-kv-natraining-2"
SECRET_KEY = "sqladministrator-password"

# Fetch the password from the secret store instead of hard-coding it here
DB_PASSWORD = dbutils.secrets.get(scope=SECRET_SCOPE, key=SECRET_KEY)

# Properties handed to the JDBC reader alongside JDBC_URL
CONNECTION_PROPS = dict(
    user="sqladministrator@sqldbdbxtraining",
    password=DB_PASSWORD,
    driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
)

print("Azure SQL connection configured")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Table Configurations

# COMMAND ----------

# Per-table bronze settings: source schema, key column(s), watermark column
# and staging folder. Only the table name and its key differ between entries,
# so the dict is generated from (table name, primary key) pairs.
BRONZE_TABLES = {
    table_name: {
        "source_schema": "bijuadventureworks",
        "primary_key": key_columns,
        "watermark_column": "ModifiedDate",
        # Each table stages into a sub-folder named after the table
        "staging_path": "/Volumes/na-dbxtraining/biju_raw/biju_vol/staging/adventureworks/" + table_name,
    }
    for table_name, key_columns in (
        ("SalesOrderHeader", "SalesOrderID"),
        ("SalesOrderDetail", "SalesOrderDetailID"),
        ("Customer", "CustomerID"),
        ("Person", "BusinessEntityID"),
        ("Address", "AddressID"),
        # Composite key: kept as a list of column names
        ("BusinessEntityAddress", ["BusinessEntityID", "AddressID", "AddressTypeID"]),
    )
}

print("Configured {} bronze tables".format(len(BRONZE_TABLES)))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Checkpoint & Schema Locations

# COMMAND ----------

# Root folders for Auto Loader checkpoints and schema locations
CHECKPOINT_BASE_PATH = "/Volumes/na-dbxtraining/biju_raw/biju_vol/checkpoints/adventureworks"
SCHEMA_BASE_PATH = "/Volumes/na-dbxtraining/biju_raw/biju_vol/schema/adventureworks"


def get_checkpoint_path(layer, table_name):
    """Return ``<CHECKPOINT_BASE_PATH>/<layer>/<table_name>``."""
    return "{}/{}/{}".format(CHECKPOINT_BASE_PATH, layer, table_name)


def get_schema_path(layer, table_name):
    """Return ``<SCHEMA_BASE_PATH>/<layer>/<table_name>``."""
    return "{}/{}/{}".format(SCHEMA_BASE_PATH, layer, table_name)


print("Checkpoint and schema paths configured")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Auto Loader Configuration

# COMMAND ----------

# Settings consumed by the Auto Loader ingestion notebooks
AUTOLOADER_CONFIG = dict(
    format="parquet",
    max_files_per_trigger=1000,
    schema_inference=True,
    schema_evolution="addNewColumns",
)

print("Auto Loader configured")

# COMMAND ----------

# MAGIC %md
# MAGIC ## JDBC Read Configuration

# COMMAND ----------

# Tuning values for JDBC reads; the bounds are kept as date strings
JDBC_CONFIG = dict(
    fetch_size=10000,
    num_partitions=8,
    lower_bound="2011-01-01",
    upper_bound="2015-01-01",
)

print("JDBC configuration set")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Utility Functions

# COMMAND ----------

import logging
from typing import List, Optional, Union

from delta.tables import DeltaTable
from pyspark.sql import DataFrame

# Setup logging
# basicConfig only configures the root logger when it has no handlers yet.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Set the level on this logger as well, so INFO records are still emitted when
# the basicConfig call above was a no-op because handlers were already installed.
logger.setLevel(logging.INFO)

def table_exists(catalog: str, schema: str, table: str) -> bool:
    """Check whether ``catalog.schema.table`` exists.

    Runs ``DESCRIBE TABLE`` on the three-part name and reports whether Spark
    could resolve it.

    Args:
        catalog: Catalog name, already quoted if it needs quoting.
        schema: Schema name.
        table: Table name.

    Returns:
        True if the table could be described, False if Spark raised an
        analysis error for it.

    Raises:
        Any non-analysis error raised by ``spark.sql``. These are deliberately
        not reported as "missing": the upsert helpers in this file overwrite
        the target table when this function returns False.
    """
    # Local import keeps this function self-contained
    from pyspark.sql.utils import AnalysisException

    try:
        spark.sql(f"DESCRIBE TABLE {catalog}.{schema}.{table}")
    except AnalysisException:
        return False
    return True

def create_schema_if_not_exists(catalog: str, schema: str, comment: str = ""):
    """Create ``catalog.schema`` if it does not exist yet.

    Args:
        catalog: Catalog name, already quoted if it needs quoting.
        schema: Schema name.
        comment: Free-text schema comment. It is embedded as a SQL string
            literal, so backslashes and single quotes are escaped first.

    Returns:
        None. Errors from Spark are logged and not re-raised (best effort).
    """
    # Escape backslashes first, then quotes, so the text cannot end the
    # '...' literal early and break (or alter) the statement.
    safe_comment = str(comment).replace("\\", "\\\\").replace("'", "\\'")
    try:
        spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{schema} COMMENT '{safe_comment}'")
        logger.info("Schema %s.%s ready", catalog, schema)
    except Exception as e:
        logger.error("Error creating schema %s.%s: %s", catalog, schema, str(e))

def upsert_delta_table(
    source_df: DataFrame,
    target_table: str,
    merge_keys: Union[str, List[str]],
    update_columns: Optional[List[str]] = None
):
    """Upsert ``source_df`` into a Delta table.

    If the target does not exist yet it is created from ``source_df``.
    Otherwise rows are merged on ``merge_keys``: matching rows get
    ``update_columns`` overwritten from the source, and rows without a match
    are inserted with every source column.

    Args:
        source_df: Rows to upsert.
        target_table: Three-part table name; it is split on ``.`` to check
            for existence, so no part may itself contain a dot.
        merge_keys: Key column name, or list of key column names. A single
            string is accepted because BRONZE_TABLES stores single-column
            primary keys as plain strings.
        update_columns: Columns to overwrite on a match. Defaults to every
            source column that is not a merge key.

    Raises:
        ValueError: If ``merge_keys`` is empty.
    """
    # A bare string would otherwise be iterated character by character and
    # produce a condition such as "target.S = source.S AND target.a = ...".
    if isinstance(merge_keys, str):
        merge_keys = [merge_keys]
    else:
        merge_keys = list(merge_keys)
    if not merge_keys:
        raise ValueError("merge_keys must contain at least one column name")

    if not table_exists(*target_table.split('.')):
        # First load - just write
        source_df.write.format("delta").mode("overwrite").saveAsTable(target_table)
        # Count the rows that were written instead of calling source_df.count(),
        # which would execute the source query a second time.
        written_rows = spark.table(target_table).count()
        logger.info(f"Created table {target_table} with {written_rows} records")
        return

    # Build merge condition
    merge_condition = " AND ".join(f"target.{key} = source.{key}" for key in merge_keys)

    # Determine columns to update
    if update_columns is None:
        update_columns = [column for column in source_df.columns if column not in merge_keys]

    # Perform merge
    target = DeltaTable.forName(spark, target_table)

    target.alias("target").merge(
        source_df.alias("source"),
        merge_condition
    ).whenMatchedUpdate(
        set={column: f"source.{column}" for column in update_columns}
    ).whenNotMatchedInsert(
        values={column: f"source.{column}" for column in source_df.columns}
    ).execute()

    logger.info(f"Upserted data into {target_table}")

def merge_scd_type2(
    source_df: DataFrame,
    target_table: str,
    merge_keys: Union[str, List[str]],
    compare_columns: Union[str, List[str]]
):
    """Apply an SCD Type 2 merge of ``source_df`` into a Delta table.

    If the target does not exist it is created from ``source_df``. Otherwise
    the merge runs in two steps:

    1. Close out: every current target row (``is_current = true``) whose
       ``compare_columns`` differ from the matching source row is marked
       ``is_current = false`` with ``effective_end_date = current_timestamp()``.
    2. Insert: every source row that has no current target row is inserted.
       That covers brand-new keys and the keys closed out in step 1.

    A single merge cannot do both for a changed key, because a source row
    that matches the old version never reaches the not-matched clause; the
    old version would be closed and the new one lost.

    NOTE(review): rows are inserted with the source columns as-is, so this
    assumes ``source_df`` already carries ``is_current`` (set to true) and the
    effective-date columns - confirm against callers.
    NOTE(review): the two steps are separate Delta transactions and
    ``source_df`` is evaluated for each of them.

    Args:
        source_df: Incoming rows.
        target_table: Three-part table name; it is split on ``.`` to check
            for existence, so no part may itself contain a dot.
        merge_keys: Business key column name, or list of names.
        compare_columns: Column name(s) whose change starts a new version.

    Raises:
        ValueError: If ``merge_keys`` or ``compare_columns`` is empty.
    """
    # Accept a single column name as well as a list; a bare string would
    # otherwise be iterated character by character.
    merge_keys = [merge_keys] if isinstance(merge_keys, str) else list(merge_keys)
    compare_columns = (
        [compare_columns] if isinstance(compare_columns, str) else list(compare_columns)
    )
    if not merge_keys:
        raise ValueError("merge_keys must contain at least one column name")
    if not compare_columns:
        raise ValueError("compare_columns must contain at least one column name")

    if not table_exists(*target_table.split('.')):
        # First load
        source_df.write.format("delta").mode("overwrite").saveAsTable(target_table)
        logger.info(f"Created SCD Type 2 table {target_table}")
        return

    # Match each source row against the current version of the same key
    merge_condition = " AND ".join(f"target.{key} = source.{key}" for key in merge_keys)
    merge_condition += " AND target.is_current = true"

    # <=> is null-safe equality: NULL vs NULL counts as equal and NULL vs a
    # value as different, which is what the change check needs.
    compare_condition = " OR ".join(
        f"NOT (target.{column} <=> source.{column})" for column in compare_columns
    )

    # Step 1: close out current rows whose tracked columns changed
    DeltaTable.forName(spark, target_table).alias("target").merge(
        source_df.alias("source"),
        merge_condition
    ).whenMatchedUpdate(
        condition=compare_condition,
        set={
            "is_current": "false",
            "effective_end_date": "current_timestamp()"
        }
    ).execute()

    # Step 2: insert source rows that now have no current version
    DeltaTable.forName(spark, target_table).alias("target").merge(
        source_df.alias("source"),
        merge_condition
    ).whenNotMatchedInsert(
        values={column: f"source.{column}" for column in source_df.columns}
    ).execute()

    logger.info(f"Applied SCD Type 2 merge to {target_table}")

print("Utility functions loaded")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Print Configuration Summary

# COMMAND ----------

# Print the summary banner in one call; each argument is one output line
print(
    "\n" + "=" * 60,
    "CONFIGURATION SUMMARY",
    "=" * 60,
    "Catalog: {}".format(CATALOG),
    "Bronze Schema: {}".format(SCHEMA_BRONZE),
    "Silver Schema: {}".format(SCHEMA_SILVER),
    "Gold Schema: {}".format(SCHEMA_GOLD),
    "Bronze Tables: {}".format(len(BRONZE_TABLES)),
    # Only the part of the URL before the first ';' is shown
    "JDBC URL: {}".format(JDBC_URL.partition(";")[0]),
    "=" * 60,
    sep="\n",
)