In [0]:
%run ./00_config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Bronze Layer - Batch Load
# MAGIC Extract data from Azure SQL and load to Bronze using Auto Loader

# COMMAND ----------

# MAGIC %run ./00_config

# COMMAND ----------

from pyspark.sql.functions import col, current_timestamp, lit, input_file_name
from datetime import datetime

# COMMAND ----------

# MAGIC %md
# MAGIC ## Create Bronze Schema

# COMMAND ----------

create_schema_if_not_exists(CATALOG, SCHEMA_BRONZE, "Bronze layer - raw data from Azure SQL")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Extract from Azure SQL to Staging

# COMMAND ----------

def extract_table_to_staging(table_name: str, table_config: dict):
    """Extract table from Azure SQL and save to staging"""
    
    source_schema = table_config['source_schema']
    staging_path = table_config['staging_path']
    watermark_column = table_config['watermark_column']
    jdbc_table = f"{source_schema}.{table_name}"
    
    logger.info(f"Extracting {jdbc_table}...")
    
    try:
        # Read from Azure SQL with partitioning
        df = (
            spark.read
            .jdbc(
                url=JDBC_URL,
                table=jdbc_table,
                properties=CONNECTION_PROPS,
                column=watermark_column,
                lowerBound=JDBC_CONFIG['lower_bound'],
                upperBound=JDBC_CONFIG['upper_bound'],
                numPartitions=JDBC_CONFIG['num_partitions']
            )
        )
        
        # Add extraction metadata
        df = (# Databricks notebook source
# MAGIC %md
# MAGIC # Bronze Layer - Batch Load
# MAGIC Extract data from Azure SQL and load to Bronze using Auto Loader

# COMMAND ----------

# MAGIC %run ./00_config

# COMMAND ----------

from pyspark.sql.functions import col, current_timestamp, lit, input_file_name
from datetime import datetime

# COMMAND ----------

# MAGIC %md
# MAGIC ## Create Bronze Schema

# COMMAND ----------

create_schema_if_not_exists(CATALOG, SCHEMA_BRONZE, "Bronze layer - raw data from Azure SQL")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Extract from Azure SQL to Staging

# COMMAND ----------

def extract_table_to_staging(table_name: str, table_config: dict):
    """Extract one Azure SQL table over JDBC and stage it as Parquet.

    Performs a single, non-partitioned JDBC read, tags every row with
    extraction metadata columns, and overwrites the table's staging path.

    Args:
        table_name: Source table name, without the schema prefix.
        table_config: Per-table settings; ``source_schema`` and
            ``staging_path`` are read from it.

    Returns:
        ``{"status": "SUCCESS", "records": <int>}`` when the table was
        staged, otherwise ``{"status": "FAILED", "error": <str>}``.
        Exceptions raised by the read or write are caught, logged, and
        reported through the returned dict rather than propagated.
    """
    source_schema = table_config['source_schema']
    staging_path = table_config['staging_path']
    jdbc_table = f"{source_schema}.{table_name}"

    logger.info(f"Extracting {jdbc_table}...")

    try:
        # Simple read without partitioning (good for smaller tables)
        df = (
            spark.read
            .format("jdbc")
            .option("url", JDBC_URL)
            .option("dbtable", jdbc_table)
            .option("user", CONNECTION_PROPS["user"])
            .option("password", CONNECTION_PROPS["password"])
            .option("driver", CONNECTION_PROPS["driver"])
            .option("fetchsize", JDBC_CONFIG['fetch_size'])
            .load()
        )

        # Add extraction metadata
        df = (
            df
            .withColumn("_extracted_at", current_timestamp())
            .withColumn("_source_system", lit("azure_sql"))
            .withColumn("_source_table", lit(jdbc_table))
        )

        # Write first, then count the staged files. Counting the lazy JDBC
        # DataFrame before the write would run the source query a second
        # time, and the two runs could see different data.
        df.write.mode("overwrite").parquet(staging_path)
        record_count = spark.read.parquet(staging_path).count()

        logger.info(f"✓ Extracted {record_count:,} records from {jdbc_table}")

        return {"status": "SUCCESS", "records": record_count}

    except Exception as e:
        # Top-level boundary for one table: report the failure to the caller
        # so the remaining tables can still be processed.
        logger.error(f"✗ Failed to extract {jdbc_table}: {str(e)}")
        return {"status": "FAILED", "error": str(e)}

# COMMAND ----------

# Alternative: Extract with partitioning for large tables
def extract_table_to_staging_partitioned(table_name: str, table_config: dict):
    """Extract a large Azure SQL table with a partitioned JDBC read.

    The table's primary key is used as the JDBC partition column when it is
    a single column (a ``str``). The MIN/MAX of that column are queried first
    and passed to Spark as the partition bounds. The call delegates to
    ``extract_table_to_staging`` when the key is composite, when the bounds
    come back NULL (for example an empty table), or when the partitioned
    extraction raises.

    NOTE(review): the partition column's type is not validated here; confirm
    the configured primary keys are of a type Spark accepts for
    ``partitionColumn``.

    Args:
        table_name: Source table name, without the schema prefix.
        table_config: Per-table settings; ``source_schema``,
            ``staging_path`` and ``primary_key`` are read from it.

    Returns:
        ``{"status": "SUCCESS", "records": <int>}`` on success, or whatever
        ``extract_table_to_staging`` returns when the call falls back to it.
    """
    source_schema = table_config['source_schema']
    staging_path = table_config['staging_path']
    primary_key = table_config['primary_key']
    jdbc_table = f"{source_schema}.{table_name}"

    # Only a single-column key can drive the partitioning
    if not isinstance(primary_key, str):
        logger.warning(f"Composite primary key for {table_name}, reading without partitioning")
        return extract_table_to_staging(table_name, table_config)

    partition_column = primary_key
    logger.info(f"Extracting {jdbc_table} with partitioning on {partition_column}...")

    # Connection options shared by the bounds query and the data read
    connection_options = {
        "url": JDBC_URL,
        "user": CONNECTION_PROPS["user"],
        "password": CONNECTION_PROPS["password"],
        "driver": CONNECTION_PROPS["driver"],
    }

    try:
        # First, get min and max values for the partition column
        bounds_query = f"(SELECT MIN({partition_column}) as min_val, MAX({partition_column}) as max_val FROM {jdbc_table}) as bounds"

        bounds = (
            spark.read
            .format("jdbc")
            .options(**connection_options)
            .option("dbtable", bounds_query)
            .load()
            .first()
        )
        min_val = bounds['min_val']
        max_val = bounds['max_val']

        # MIN/MAX are NULL when there is nothing to partition on; passing
        # str(None) to Spark as a bound would only fail later.
        if min_val is None or max_val is None:
            logger.warning(f"No partition bounds found for {jdbc_table}, reading without partitioning")
            return extract_table_to_staging(table_name, table_config)

        logger.info(f"Partition bounds: {min_val} to {max_val}")

        # Read with partitioning
        df = (
            spark.read
            .format("jdbc")
            .options(**connection_options)
            .option("dbtable", jdbc_table)
            .option("fetchsize", JDBC_CONFIG['fetch_size'])
            .option("partitionColumn", partition_column)
            .option("lowerBound", str(min_val))
            .option("upperBound", str(max_val))
            .option("numPartitions", str(JDBC_CONFIG['num_partitions']))
            .load()
        )

        # Add extraction metadata
        df = (
            df
            .withColumn("_extracted_at", current_timestamp())
            .withColumn("_source_system", lit("azure_sql"))
            .withColumn("_source_table", lit(jdbc_table))
        )

        # Write first, then count the staged files, so the large source table
        # is pulled over JDBC only once.
        df.write.mode("overwrite").parquet(staging_path)
        record_count = spark.read.parquet(staging_path).count()

        logger.info(f"✓ Extracted {record_count:,} records from {jdbc_table}")

        return {"status": "SUCCESS", "records": record_count}

    except Exception as e:
        logger.error(f"✗ Failed to extract {jdbc_table}: {str(e)}")
        # Fallback to non-partitioned read
        logger.info("Retrying without partitioning...")
        return extract_table_to_staging(table_name, table_config)

# COMMAND ----------

# Run the Azure SQL -> staging extraction for every configured table
print(f"\n{'=' * 60}")
print("EXTRACTING DATA FROM AZURE SQL")
print(f"{'=' * 60}\n")

# Tables that benefit from a partitioned JDBC read
large_tables = ['SalesOrderHeader', 'SalesOrderDetail']

extraction_results = {}
start_time = datetime.now()

for table_name, table_config in BRONZE_TABLES.items():
    # Large tables go through the partitioned extractor, the rest use the simple one
    extractor = (
        extract_table_to_staging_partitioned
        if table_name in large_tables
        else extract_table_to_staging
    )
    extraction_results[table_name] = extractor(table_name, table_config)

extraction_duration = (datetime.now() - start_time).total_seconds()

# Per-table outcome followed by overall totals
print(f"\n{'=' * 60}")
print("EXTRACTION SUMMARY")
print("=" * 60)

successful = 0
failed = 0
total_records = 0

for table_name, result in extraction_results.items():
    is_success = result['status'] == 'SUCCESS'
    status_symbol = "✓" if is_success else "✗"
    if not is_success:
        print(f"{status_symbol} {table_name}: {result['error']}")
        failed += 1
        continue
    print(f"{status_symbol} {table_name}: {result['records']:,} records")
    successful += 1
    total_records += result['records']

print(f"\nSuccessful: {successful}/{len(BRONZE_TABLES)}")
print(f"Failed: {failed}/{len(BRONZE_TABLES)}")
print(f"Total Records Extracted: {total_records:,}")
print(f"Total Time: {extraction_duration:.2f} seconds")
print("=" * 60)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Load to Bronze using Auto Loader

# COMMAND ----------

def load_table_to_bronze(table_name: str, table_config: dict):
    """Ingest a table's staged files into its Bronze Delta table via Auto Loader.

    Reads the staging path as a ``cloudFiles`` stream, adds load metadata
    columns, appends to ``bronze_<table_name lowercased>`` and blocks until
    the ``availableNow`` streaming query terminates.

    Args:
        table_name: Source table name; used for the checkpoint and schema
            locations and to derive the Bronze table name.
        table_config: Per-table settings; ``staging_path`` is read from it.

    Returns:
        ``{"status": "SUCCESS", "records": <int>}`` where ``records`` is the
        total row count of the Bronze table after the load (not only the rows
        appended by this run), or ``{"status": "FAILED", "error": <str>}``
        when the load raised.
    """
    staging_path = table_config['staging_path']
    checkpoint_path = get_checkpoint_path('bronze', table_name)
    schema_location = get_schema_path('bronze', table_name)

    bronze_table = get_table_path(SCHEMA_BRONZE, f"bronze_{table_name.lower()}")

    logger.info(f"Loading {table_name} to {bronze_table}...")

    try:
        # Read using Auto Loader
        df = (
            spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", AUTOLOADER_CONFIG['format'])
            .option("cloudFiles.schemaLocation", schema_location)
            .option("cloudFiles.inferColumnTypes", "true")
            .option("cloudFiles.schemaEvolutionMode", AUTOLOADER_CONFIG['schema_evolution'])
            .load(staging_path)
        )

        # Add Bronze metadata. The source file comes from the _metadata
        # column instead of input_file_name(), which Databricks has deprecated
        # and does not support on Unity Catalog shared-access clusters.
        df = (
            df
            .withColumn("_bronze_load_timestamp", current_timestamp())
            .withColumn("_source_file", col("_metadata.file_path"))
        )

        # Write to Bronze using trigger(availableNow=True) for batch processing
        query = (
            df.writeStream
            .format("delta")
            .outputMode("append")
            .option("checkpointLocation", checkpoint_path)
            .trigger(availableNow=True)
            .toTable(bronze_table)
        )

        # Wait for completion
        query.awaitTermination()

        # Count the whole Bronze table after the append
        record_count = spark.table(bronze_table).count()
        logger.info(f"✓ Loaded {record_count:,} records to {bronze_table}")

        return {"status": "SUCCESS", "records": record_count}

    except Exception as e:
        # Top-level boundary for one table: report the failure to the caller
        # so the remaining tables can still be processed.
        logger.error(f"✗ Failed to load {table_name}: {str(e)}")
        return {"status": "FAILED", "error": str(e)}

# COMMAND ----------

# Push every staged table into its Bronze Delta table
print(f"\n{'=' * 60}")
print("LOADING DATA TO BRONZE")
print(f"{'=' * 60}\n")

load_results = {}
start_time = datetime.now()

for table_name, table_config in BRONZE_TABLES.items():
    load_results[table_name] = load_table_to_bronze(table_name, table_config)

load_duration = (datetime.now() - start_time).total_seconds()

# Per-table outcome followed by overall totals
print(f"\n{'=' * 60}")
print("LOAD SUMMARY")
print("=" * 60)

successful = 0
failed = 0
total_records = 0

for table_name, result in load_results.items():
    is_success = result['status'] == 'SUCCESS'
    status_symbol = "✓" if is_success else "✗"
    if not is_success:
        print(f"{status_symbol} {table_name}: {result['error']}")
        failed += 1
        continue
    print(f"{status_symbol} {table_name}: {result['records']:,} records")
    successful += 1
    total_records += result['records']

print(f"\nSuccessful: {successful}/{len(BRONZE_TABLES)}")
print(f"Failed: {failed}/{len(BRONZE_TABLES)}")
print(f"Total Records Loaded: {total_records:,}")
print(f"Total Time: {load_duration:.2f} seconds")
print("=" * 60)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Verify Bronze Tables

# COMMAND ----------

# MAGIC %sql
# MAGIC SHOW TABLES IN `na-dbxtraining`.biju_bronze;

# COMMAND ----------

# Preview the first ten rows of the sales order header Bronze table
print("Sample data from bronze_salesorderheader:")
sample_table = get_table_path(SCHEMA_BRONZE, "bronze_salesorderheader")
display(spark.table(sample_table).limit(10))

# COMMAND ----------

# Report the row count of every Bronze table plus a grand total
print(f"\n{'=' * 60}")
print("BRONZE TABLE COUNTS")
print("=" * 60)

total_records = 0
for table_name in BRONZE_TABLES:
    bronze_table = f"bronze_{table_name.lower()}"
    try:
        count = spark.table(get_table_path(SCHEMA_BRONZE, bronze_table)).count()
    except Exception as e:
        # A table that cannot be read is reported and left out of the total
        print(f"{bronze_table}: ERROR - {str(e)}")
    else:
        print(f"{bronze_table}: {count:,} rows")
        total_records += count

print(f"\nTotal Records: {total_records:,}")
print("=" * 60)

# COMMAND ----------

# Print the schema of every Bronze table
print(f"\n{'=' * 60}")
print("BRONZE TABLE SCHEMAS")
print(f"{'=' * 60}\n")

for table_name in BRONZE_TABLES:
    bronze_table = f"bronze_{table_name.lower()}"
    print(f"\n{bronze_table}:")
    try:
        table_df = spark.table(get_table_path(SCHEMA_BRONZE, bronze_table))
        table_df.printSchema()
    except Exception as e:
        # Keep going so one unreadable table does not hide the others
        print(f"ERROR: {str(e)}")
            df
            .withColumn("_extracted_at", current_timestamp())
            .withColumn("_source_system", lit("azure_sql"))
            .withColumn("_source_table", lit(jdbc_table))
        )
        
        # Write to staging as Parquet
        df.write.mode("overwrite").parquet(staging_path)
        
        record_count = df.count()
        logger.info(f"✓ Extracted {record_count:,} records from {jdbc_table}")
        
        return {"status": "SUCCESS", "records": record_count}
        
    except Exception as e:
        logger.error(f"✗ Failed to extract {jdbc_table}: {str(e)}")
        return {"status": "FAILED", "error": str(e)}

# COMMAND ----------

# Run the Azure SQL -> staging extraction for every configured table
print(f"\n{'=' * 60}")
print("EXTRACTING DATA FROM AZURE SQL")
print(f"{'=' * 60}\n")

start_time = datetime.now()
extraction_results = {}

for table_name, table_config in BRONZE_TABLES.items():
    extraction_results[table_name] = extract_table_to_staging(table_name, table_config)

# Wall-clock time for the whole extraction loop, in seconds
extraction_duration = (datetime.now() - start_time).total_seconds()



In [0]:
# Per-table extraction outcome and total elapsed time
print(f"\n{'=' * 60}")
print("EXTRACTION SUMMARY")
print("=" * 60)
for table_name, result in extraction_results.items():
    is_success = result['status'] == 'SUCCESS'
    status_symbol = "✓" if is_success else "✗"
    if not is_success:
        print(f"{status_symbol} {table_name}: {result['error']}")
        continue
    print(f"{status_symbol} {table_name}: {result['records']:,} records")
print(f"\nTotal Time: {extraction_duration:.2f} seconds")
print("=" * 60)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Load to Bronze using Auto Loader

# COMMAND ----------

def load_table_to_bronze(table_name: str, table_config: dict):
    """Ingest a table's staged files into its Bronze Delta table via Auto Loader.

    Reads the staging path as a ``cloudFiles`` stream, adds load metadata
    columns, appends to ``bronze_<table_name lowercased>`` and blocks until
    the ``availableNow`` streaming query terminates.

    Args:
        table_name: Source table name; used for the checkpoint and schema
            locations and to derive the Bronze table name.
        table_config: Per-table settings; ``staging_path`` is read from it.

    Returns:
        ``{"status": "SUCCESS", "records": <int>}`` where ``records`` is the
        total row count of the Bronze table after the load (not only the rows
        appended by this run), or ``{"status": "FAILED", "error": <str>}``
        when the load raised.
    """
    staging_path = table_config['staging_path']
    checkpoint_path = get_checkpoint_path('bronze', table_name)
    schema_location = get_schema_path('bronze', table_name)

    bronze_table = get_table_path(SCHEMA_BRONZE, f"bronze_{table_name.lower()}")

    logger.info(f"Loading {table_name} to {bronze_table}...")

    try:
        # Read using Auto Loader
        df = (
            spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", AUTOLOADER_CONFIG['format'])
            .option("cloudFiles.schemaLocation", schema_location)
            .option("cloudFiles.inferColumnTypes", "true")
            .option("cloudFiles.schemaEvolutionMode", AUTOLOADER_CONFIG['schema_evolution'])
            .load(staging_path)
        )

        # Add Bronze metadata. The source file comes from the _metadata
        # column instead of input_file_name(), which Databricks has deprecated
        # and does not support on Unity Catalog shared-access clusters.
        df = (
            df
            .withColumn("_bronze_load_timestamp", current_timestamp())
            .withColumn("_source_file", col("_metadata.file_path"))
        )

        # Write to Bronze using trigger(availableNow=True) for batch processing
        query = (
            df.writeStream
            .format("delta")
            .outputMode("append")
            .option("checkpointLocation", checkpoint_path)
            .trigger(availableNow=True)
            .toTable(bronze_table)
        )

        # Wait for completion
        query.awaitTermination()

        # Count the whole Bronze table after the append
        record_count = spark.table(bronze_table).count()
        logger.info(f"✓ Loaded {record_count:,} records to {bronze_table}")

        return {"status": "SUCCESS", "records": record_count}

    except Exception as e:
        # Top-level boundary for one table: report the failure to the caller
        # so the remaining tables can still be processed.
        logger.error(f"✗ Failed to load {table_name}: {str(e)}")
        return {"status": "FAILED", "error": str(e)}

# COMMAND ----------

# Push every staged table into its Bronze Delta table
print(f"\n{'=' * 60}")
print("LOADING DATA TO BRONZE")
print(f"{'=' * 60}\n")

start_time = datetime.now()
load_results = {}

for table_name, table_config in BRONZE_TABLES.items():
    load_results[table_name] = load_table_to_bronze(table_name, table_config)

load_duration = (datetime.now() - start_time).total_seconds()

# Per-table load outcome and total elapsed time
print(f"\n{'=' * 60}")
print("LOAD SUMMARY")
print("=" * 60)
for table_name, result in load_results.items():
    is_success = result['status'] == 'SUCCESS'
    status_symbol = "✓" if is_success else "✗"
    if not is_success:
        print(f"{status_symbol} {table_name}: {result['error']}")
        continue
    print(f"{status_symbol} {table_name}: {result['records']:,} records")
print(f"\nTotal Time: {load_duration:.2f} seconds")
print("=" * 60)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Verify Bronze Tables

# COMMAND ----------

# MAGIC %sql
# MAGIC SHOW TABLES IN `na-dbxtraining`.biju_bronze;

# COMMAND ----------

# Display sample data from the sales order header Bronze table.
# The name is built with get_table_path and the bronze_ prefix, exactly as
# load_table_to_bronze builds it. Back-quoting the whole dotted name would
# make Spark treat "catalog.schema.table" as one identifier, and the table
# written by the loader is bronze_salesorderheader, not salesorderheader.
display(
    spark.table(get_table_path(SCHEMA_BRONZE, "bronze_salesorderheader")).limit(10)
)

# COMMAND ----------

# Report the row count of every Bronze table plus a grand total
print(f"\n{'=' * 60}")
print("BRONZE TABLE COUNTS")
print("=" * 60)

total_records = 0
for table_name in BRONZE_TABLES:
    bronze_table = f"bronze_{table_name.lower()}"
    try:
        count = spark.table(get_table_path(SCHEMA_BRONZE, bronze_table)).count()
    except Exception as e:
        # A table that cannot be read is reported and left out of the total
        print(f"{bronze_table}: ERROR - {str(e)}")
    else:
        print(f"{bronze_table}: {count:,} rows")
        total_records += count

print(f"\nTotal Records: {total_records:,}")
print("=" * 60)