In [0]:
 %run /Workspace/Users/biju.thottathil@3cloudsolutions.com/training/databricksinternaldemo/greentaxiautoloader/00setupconfig

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 01 - Bronze Layer: Autoloader Ingestion
# MAGIC 
# MAGIC This notebook implements the Bronze layer using Databricks Autoloader for incremental data ingestion.
# MAGIC 
# MAGIC **Features:**
# MAGIC - Incremental file processing with Autoloader
# MAGIC - Schema inference and evolution
# MAGIC - Checkpoint management for exactly-once semantics
# MAGIC - Error handling and data quality checks
# MAGIC 
# MAGIC **Author:** Data Engineering Team  
# MAGIC **Last Updated:** December 2024

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1. Load Configuration

# COMMAND ----------



# COMMAND ----------

# Load the pipeline settings from the `pipeline_config` temp view.
# NOTE(review): the view is presumably created by the `%run ...00setupconfig`
# cell at the top of the notebook — confirm against that notebook.
config_df = spark.table("pipeline_config")
config_row = config_df.first()
if config_row is None:
    # first() returns None for an empty view; fail with an actionable message
    # instead of an AttributeError on `.asDict()`.
    raise ValueError(
        "pipeline_config is empty; run the setup/config notebook before this one"
    )
config = config_row.asDict()

# Every setting this notebook reads; validate them together so a missing one
# is reported here rather than as a confusing failure further down.
REQUIRED_CONFIG_KEYS = (
    "catalog",
    "schema",
    "bronze_table",
    "source_data_path",
    "checkpoint_location",
)
missing_config_keys = [key for key in REQUIRED_CONFIG_KEYS if config.get(key) is None]
if missing_config_keys:
    raise KeyError(f"pipeline_config is missing required settings: {missing_config_keys}")

CATALOG = config["catalog"]
SCHEMA = config["schema"]
BRONZE_TABLE = config["bronze_table"]
SOURCE_DATA_PATH = config["source_data_path"]
CHECKPOINT_LOCATION = config["checkpoint_location"]

print("‚úÖ Configuration loaded:")
print(f"   Source: {SOURCE_DATA_PATH}")
print(f"   Target: {CATALOG}.{SCHEMA}.{BRONZE_TABLE}")
print(f"   Checkpoint: {CHECKPOINT_LOCATION}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 2. Define Bronze Table Schema
# MAGIC 
# MAGIC Define the expected schema for Green Taxi data to ensure data quality.

# COMMAND ----------

from pyspark.sql.types import *

# Column layout expected in the Green Taxi source files, listed in file order
# as (column name, Spark type). Every column is declared nullable.
_BRONZE_COLUMNS = [
    ("VendorID", IntegerType()),
    ("lpep_pickup_datetime", TimestampType()),
    ("lpep_dropoff_datetime", TimestampType()),
    ("store_and_fwd_flag", StringType()),
    ("RatecodeID", IntegerType()),
    ("PULocationID", IntegerType()),
    ("DOLocationID", IntegerType()),
    ("passenger_count", IntegerType()),
    ("trip_distance", DoubleType()),
    ("fare_amount", DoubleType()),
    ("extra", DoubleType()),
    ("mta_tax", DoubleType()),
    ("tip_amount", DoubleType()),
    ("tolls_amount", DoubleType()),
    ("ehail_fee", DoubleType()),
    ("improvement_surcharge", DoubleType()),
    ("total_amount", DoubleType()),
    ("payment_type", IntegerType()),
    ("trip_type", IntegerType()),
    ("congestion_surcharge", DoubleType()),
]

# Build the StructType from the spec above; the third argument marks the
# field as nullable.
bronze_schema = StructType(
    [
        StructField(column_name, column_type, True)
        for column_name, column_type in _BRONZE_COLUMNS
    ]
)

field_total = len(bronze_schema.fields)
print("‚úÖ Bronze schema defined")
print(f"   Total fields: {field_total}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 3. Create Autoloader Stream
# MAGIC 
# MAGIC Use Autoloader to incrementally ingest new files from the source location.

# COMMAND ----------

from pyspark.sql import functions as F

# Configure Autoloader.
#
# The explicit `bronze_schema` is the single source of truth for column types.
# The previous `cloudFiles.inferColumnTypes` / `cloudFiles.schemaHints` options
# were dropped: Auto Loader only consults inference and hints when no schema is
# supplied, so they were dead configuration that duplicated `bronze_schema`.
#
# NOTE(review): the schema location is built by plain concatenation, so this
# assumes CHECKPOINT_LOCATION ends with "/" — confirm in the setup notebook.
autoloader_df = (spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", f"{CHECKPOINT_LOCATION}schema")
    .option("header", "true")
    .option("cloudFiles.maxFilesPerTrigger", 100)  # Process up to 100 files per trigger
    .schema(bronze_schema)
    .load(SOURCE_DATA_PATH)
)

# Add audit columns.
# `_metadata.file_path` is the file-source metadata column; it replaces
# `input_file_name()`, which is not supported on Unity Catalog shared compute.
bronze_stream = (autoloader_df
    .withColumn("ingestion_timestamp", F.current_timestamp())
    .withColumn("source_file", F.col("_metadata.file_path"))
    .withColumn("processing_date", F.current_date())
)

print("‚úÖ Autoloader stream configured")
# Show the resulting columns without starting a query: display() on a streaming
# DataFrame launches a separate streaming query that keeps running (and keeps
# re-reading the source) until it is cancelled manually.
bronze_stream.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC ## 4. Write to Bronze Delta Table
# MAGIC 
# MAGIC Write the streaming data to a Bronze Delta table with checkpoint management.

# COMMAND ----------

# Checkpoint directory dedicated to this stream.
# NOTE(review): built by plain concatenation, so this assumes
# CHECKPOINT_LOCATION ends with "/" — confirm in the setup notebook.
bronze_checkpoint = f"{CHECKPOINT_LOCATION}bronze_stream/"

# Start the write to the Bronze Delta table.
# `availableNow=True` replaces the deprecated `once=True`: it still processes
# everything that is available and then stops, but does so in rate-limited
# micro-batches instead of a single unbounded batch.
# For continuous ingestion use `.trigger(processingTime="1 minute")` instead.
bronze_stream_query = (
    bronze_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", bronze_checkpoint)
    .option("mergeSchema", "true")  # Enable schema evolution
    .trigger(availableNow=True)
    # Back-ticks keep catalog/schema/table names with special characters valid.
    .toTable(f"`{CATALOG}`.`{SCHEMA}`.`{BRONZE_TABLE}`")
)

print("‚úÖ Bronze stream started")
print(f"   Stream ID: {bronze_stream_query.id}")
print(f"   Status: {bronze_stream_query.status}")
print(f"   Checkpoint: {bronze_checkpoint}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 5. Monitor Stream Progress

# COMMAND ----------

import time

# Poll the stream's progress for at most
# MAX_PROGRESS_CHECKS * PROGRESS_CHECK_INTERVAL_SECONDS seconds (2 minutes with
# the values below), stopping early as soon as the query terminates.
PROGRESS_CHECK_INTERVAL_SECONDS = 30
MAX_PROGRESS_CHECKS = 4

print("üìä Monitoring stream progress...")
print("=" * 80)

for check_number in range(1, MAX_PROGRESS_CHECKS + 1):
    try:
        # Most recent micro-batch report; empty until the first batch completes
        progress = bronze_stream_query.lastProgress

        if progress:
            print(f"\nüîÑ Progress Update {check_number}:")
            print(f"   Timestamp: {progress['timestamp']}")
            print(f"   Input Rows: {progress.get('numInputRows', 0):,}")
            print(f"   Processed Rows: {progress.get('processedRowsPerSecond', 0):.2f}/sec")
            print(f"   Batch ID: {progress.get('batchId', 0)}")

            # Per-source row counts (this value is rows, not files)
            for source in progress.get('sources', []):
                print(f"   Source Rows: {source.get('numInputRows', 0):,}")
        else:
            print(f"\n‚è≥ Waiting for first batch... ({check_number}/{MAX_PROGRESS_CHECKS})")

        # Wait up to one interval. awaitTermination(timeout) returns True once
        # the query has stopped, so a finished stream is not polled (and slept
        # on) for the remaining checks; it raises if the query failed, which
        # is reported by the handler below.
        if bronze_stream_query.awaitTermination(PROGRESS_CHECK_INTERVAL_SECONDS):
            print("\nStream has terminated; no further progress checks needed")
            break

    except Exception as e:
        print(f"‚ö†Ô∏è  Error checking progress: {e}")
        break

print("\n" + "=" * 80)
print("‚úÖ Initial monitoring complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 6. Verify Bronze Table

# COMMAND ----------

# Read the bronze table.
# Each name part is back-tick quoted (matching the quoting used when the stream
# writes the table) so identifiers containing characters such as "-" resolve
# instead of failing to parse.
bronze_table_name = f"`{CATALOG}`.`{SCHEMA}`.`{BRONZE_TABLE}`"
bronze_df = spark.table(bronze_table_name)

print(f"‚úÖ Bronze Table: {CATALOG}.{SCHEMA}.{BRONZE_TABLE}")
print(f"   Total Records: {bronze_df.count():,}")
print(f"   Schema:")
bronze_df.printSchema()

# Show sample records
print("\nüìä Sample Records:")
display(bronze_df.limit(10))

# COMMAND ----------

# MAGIC %md
# MAGIC ## 7. Data Quality Checks

# COMMAND ----------

# Run basic data quality checks against the bronze table and print the results.
print("üîç Running Data Quality Checks...")
print("=" * 80)


def _count_where(condition, alias):
    """Return an aggregate column counting the rows where `condition` is true.

    `F.when` without `otherwise` yields NULL for rows that do not match and
    `F.count` skips NULLs, so the result is 0 rather than NULL when the table
    is empty. The previous sum-of-flags form returned NULL there, which made
    the `:,` formatting below raise a TypeError.
    """
    return F.count(F.when(condition, 1)).alias(alias)


# Check 1: Null values in critical columns
null_checks = bronze_df.select(
    _count_where(F.col("VendorID").isNull(), "null_vendor"),
    _count_where(F.col("lpep_pickup_datetime").isNull(), "null_pickup"),
    _count_where(F.col("lpep_dropoff_datetime").isNull(), "null_dropoff"),
    _count_where(F.col("total_amount").isNull(), "null_amount"),
).first()

print("1Ô∏è‚É£ Null Value Check:")
for field, count in null_checks.asDict().items():
    print(f"   {field}: {count:,} nulls")

# Check 2: Invalid values (NULL inputs do not satisfy the comparison and are
# therefore not counted as invalid)
invalid_checks = bronze_df.select(
    _count_where(F.col("trip_distance") < 0, "negative_distance"),
    _count_where(F.col("fare_amount") < 0, "negative_fare"),
    _count_where(F.col("passenger_count") <= 0, "invalid_passengers"),
    _count_where(F.col("total_amount") < 0, "negative_total"),
).first()

print("\n2Ô∏è‚É£ Invalid Value Check:")
for field, count in invalid_checks.asDict().items():
    print(f"   {field}: {count:,} invalid records")

# Check 3: Date range (min/max print as None when the table is empty)
date_stats = bronze_df.select(
    F.min("lpep_pickup_datetime").alias("min_pickup"),
    F.max("lpep_pickup_datetime").alias("max_pickup"),
    F.count("*").alias("total_records")
).first()

print("\n3Ô∏è‚É£ Date Range:")
print(f"   Earliest pickup: {date_stats['min_pickup']}")
print(f"   Latest pickup: {date_stats['max_pickup']}")
print(f"   Total records: {date_stats['total_records']:,}")

# Check 4: Vendor distribution
print("\n4Ô∏è‚É£ Vendor Distribution:")
vendor_dist = bronze_df.groupBy("VendorID").count().orderBy("VendorID")
display(vendor_dist)

print("\n" + "=" * 80)
print("‚úÖ Data quality checks complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 8. Stream Management

# COMMAND ----------

# Report whether the bronze stream query is still running and, if it is,
# summarise the progress reports it currently retains.
divider = "=" * 80
print("üìä Stream Status:")
print(divider)

if not bronze_stream_query.isActive:
    print("‚ö†Ô∏è  Stream is NOT ACTIVE")
else:
    print("‚úÖ Stream is ACTIVE")
    print(f"   Stream ID: {bronze_stream_query.id}")
    stream_label = bronze_stream_query.name or 'Unnamed'
    print(f"   Name: {stream_label}")

    # Progress reports retained by the query; skipped when there are none yet
    batch_reports = bronze_stream_query.recentProgress
    if batch_reports:
        print("\nüìà Recent Activity:")
        print(f"   Total Batches: {len(batch_reports)}")
        rows_processed = sum(report.get('numInputRows', 0) for report in batch_reports)
        print(f"   Total Rows Processed: {rows_processed:,}")

print("\n" + divider)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 9. Stop Stream (Optional)
# MAGIC 
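# MAGIC The write in section 4 uses a one-shot trigger, so the stream stops on its own once the available files are processed. Stopping it manually is only needed if you switch to a continuous trigger (e.g. `processingTime`); in that case, uncomment the lines below.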

# COMMAND ----------

# Uncomment to stop the stream
# bronze_stream_query.stop()
# print("üõë Stream stopped successfully")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 10. View Stream Metrics

# COMMAND ----------

# Print a short summary for every streaming query active on this Spark session.
divider = "=" * 80
print("üìä Stream Metrics Dashboard")
print(divider)

active_streams = spark.streams.active
print(f"Active Streams: {len(active_streams)}")

for query in active_streams:
    state = 'Active' if query.isActive else 'Inactive'
    print(f"\nüîÑ Stream: {query.id}")
    print(f"   Name: {query.name or 'Unnamed'}")
    print(f"   Status: {state}")

    # Batch details are only printed once the query has reported progress
    latest = query.lastProgress
    if not latest:
        continue

    trigger_ms = latest.get('durationMs', {}).get('triggerExecution', 0)
    print(f"   Last Batch: {latest.get('batchId', 'N/A')}")
    print(f"   Input Rows: {latest.get('numInputRows', 0):,}")
    print(f"   Processing Time: {trigger_ms:,} ms")

print("\n" + divider)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 11. Query Bronze Table Statistics

# COMMAND ----------

# Compute summary statistics over the bronze table with a single SQL query.
print("üìä Bronze Table Statistics")
print("=" * 80)

# Each name part is back-tick quoted (matching the quoting used when the stream
# writes the table) so identifiers containing characters such as "-" parse.
stats_query = f"""
SELECT 
    COUNT(*) as total_records,
    COUNT(DISTINCT VendorID) as unique_vendors,
    COUNT(DISTINCT DATE(lpep_pickup_datetime)) as unique_days,
    MIN(lpep_pickup_datetime) as earliest_trip,
    MAX(lpep_pickup_datetime) as latest_trip,
    SUM(trip_distance) as total_distance,
    SUM(total_amount) as total_revenue,
    AVG(trip_distance) as avg_distance,
    AVG(total_amount) as avg_fare
FROM `{CATALOG}`.`{SCHEMA}`.`{BRONZE_TABLE}`
"""

stats_df = spark.sql(stats_query)
display(stats_df)

print("‚úÖ Statistics query complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## 12. Checkpoint Information

# COMMAND ----------

# Display checkpoint information: top-level entries plus the number of commit
# and offset log files written so far.
print("üìÇ Checkpoint Information")
print("=" * 80)

try:
    # Top-level entries of the checkpoint directory
    checkpoint_files = dbutils.fs.ls(bronze_checkpoint)
    print(f"Checkpoint Location: {bronze_checkpoint}")
    print(f"Total Files: {len(checkpoint_files)}")

    # Directory entries are listed with a trailing "/" (e.g. "commits/"), so
    # filtering the top-level listing only ever matched the directory itself.
    # List inside each sub-directory to count the actual log files.
    top_level_names = {entry.name.rstrip('/') for entry in checkpoint_files}

    def _list_checkpoint_subdir(subdir):
        """Return the entries under `<bronze_checkpoint><subdir>/`, or [] if it is absent."""
        if subdir not in top_level_names:
            return []
        return dbutils.fs.ls(f"{bronze_checkpoint}{subdir}/")

    # Show commit files
    commits = _list_checkpoint_subdir("commits")
    print(f"\n‚úÖ Commit Logs: {len(commits)}")

    # Show offset files
    offsets = _list_checkpoint_subdir("offsets")
    print(f"‚úÖ Offset Files: {len(offsets)}")

except Exception as e:
    # Best-effort diagnostics: a missing or unreadable checkpoint is reported,
    # not raised, so the rest of the notebook still runs.
    print(f"‚ö†Ô∏è  Could not read checkpoint: {e}")

print("\n" + "=" * 80)

# COMMAND ----------

# MAGIC %md
# MAGIC ## 13. Next Steps
# MAGIC 
# MAGIC 1. ✅ **Bronze Layer Complete** - Data is incrementally ingested on each run
# MAGIC 2. 📊 **Data Quality Checked** - Review the results of the basic checks in section 7
# MAGIC 3. 🔄 **Incremental Runs** - Re-run (or schedule) this notebook so Autoloader picks up new files
# MAGIC 4. ⭐ **Ready for Silver** - Proceed to `02_silver_star_schema.py`
# MAGIC 
# MAGIC **Stream Management:**
# MAGIC - The stream uses a one-shot trigger: it processes the files available at start and then stops on its own
# MAGIC - New files added to the source location are processed on the next run
# MAGIC - The checkpoint records which files were already ingested, so each file is processed exactly once
# MAGIC 
# MAGIC **Monitoring:**
# MAGIC - Check stream status: `spark.streams.active`
# MAGIC - View progress: `bronze_stream_query.lastProgress`
# MAGIC - Stop stream: `bronze_stream_query.stop()`

In [0]:
%sql
-- NOTE(review): the original statement stopped after the schema qualifier
-- (`na-dbxtraining`.biju_bronze.) with no table name, which is a syntax error
-- and breaks "Run All". Listing the schema's tables is a runnable stand-in;
-- replace it with SELECT * FROM `na-dbxtraining`.biju_bronze.<table> LIMIT 100
-- once the intended table name is confirmed.
SHOW TABLES IN `na-dbxtraining`.biju_bronze