In [0]:
%run ./02config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Bronze Layer - Event Hub Streaming
# MAGIC Reads events from Event Hub and writes to Orders & Products tables

# COMMAND ----------

# MAGIC %run ./02config

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *

# COMMAND ----------

# MAGIC %md
# MAGIC ## Event Schema

# COMMAND ----------

# Schema used to parse the JSON payload of each incoming event.
# Every field is declared nullable; order_timestamp stays a raw string here.
event_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("order_id", StringType()),
        ("customer_id", StringType()),
        ("customer_name", StringType()),
        ("location", StringType()),
        ("product_id", StringType()),
        ("product_name", StringType()),
        ("category", StringType()),
        ("brand", StringType()),
        ("order_status", StringType()),
        ("payment_method", StringType()),
        ("quantity", IntegerType()),
        ("discount_pct", DoubleType()),
        ("total_amount", DoubleType()),
        ("base_price", DoubleType()),
        ("unit_price", DoubleType()),
        ("order_timestamp", StringType()),
    )
])

print("‚úì Event schema defined")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Read and Parse Event Hub Stream

# COMMAND ----------

print("=" * 70, "STARTING EVENT HUB STREAMING", "=" * 70, sep="\n")

# Streaming source: the "eventhubs" format configured by event_hubs_conf.
raw_stream = spark.readStream.format("eventhubs").options(**event_hubs_conf).load()

print("‚úì Raw stream created")

# Decode the message body as a string, parse it with event_schema, flatten the
# parsed struct next to the renamed Event Hub metadata columns, then stamp each
# row with the ingestion time and a parsed copy of the order timestamp.
parsed_stream = (
    raw_stream
    .select(
        from_json(col("body").cast("string"), event_schema).alias("parsed_data"),
        col("enqueuedTime").alias("event_time"),
        col("offset").alias("eventhub_offset"),
        col("sequenceNumber").alias("sequence_number"),
        col("partition").alias("partition_id"),
    )
    .select(
        "parsed_data.*",
        "event_time",
        "eventhub_offset",
        "sequence_number",
        "partition_id",
    )
    .withColumn("bronze_timestamp", current_timestamp())
    .withColumn("order_timestamp_parsed", to_timestamp(col("order_timestamp")))
)

print("‚úì Stream parsing configured")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Orders Stream

# COMMAND ----------

print("\n" + "=" * 70, "STARTING ORDERS STREAM", "=" * 70, sep="\n")

# Order-level columns plus the Event Hub metadata and ingestion timestamp.
orders_stream = parsed_stream.select(
    "order_id",
    "customer_id",
    "customer_name",
    "location",
    "product_id",
    "order_status",
    "payment_method",
    "quantity",
    "discount_pct",
    "total_amount",
    "order_timestamp",
    "order_timestamp_parsed",
    "event_time",
    "eventhub_offset",
    "sequence_number",
    "partition_id",
    "bronze_timestamp",
)

# Append-only Delta write into the bronze orders table.
# The trigger is availableNow; a processingTime="10 seconds" trigger was
# previously used here and is left out.
orders_query = (
    orders_stream.writeStream
    .format("delta")
    .outputMode("append")
    .options(checkpointLocation=orders_checkpoint, mergeSchema="true")
    .trigger(availableNow=True)
    .toTable(bronze_orders_table)
)

print(f"‚úì Orders stream started: {orders_query.id}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Products Stream with UPSERT

# COMMAND ----------

print("\n" + "=" * 70, "STARTING PRODUCTS STREAM", "=" * 70, sep="\n")

# Product attributes carried on each event, reduced to one row per product_id.
# NOTE(review): dropDuplicates on a stream without a watermark presumably keeps
# every product_id it has seen in state and drops later rows for the same id,
# which would mean the MERGE "WHEN MATCHED" update path never receives changed
# attributes — confirm whether per-batch deduplication is what is wanted here.
products_stream = parsed_stream.select(
    "product_id",
    "product_name",
    "category",
    "brand",
    "base_price",
    "unit_price",
    "bronze_timestamp",
).dropDuplicates(["product_id"])

def upsert_products(batch_df, batch_id):
    """Upsert one micro-batch of product rows into the bronze products table.

    Intended as a ``foreachBatch`` sink. An empty batch is skipped. If the
    target table does not exist yet it is created from the batch; otherwise
    the batch is merged into it on ``product_id``, updating matched rows and
    inserting new ones.

    Args:
        batch_df: Micro-batch DataFrame providing product_id, product_name,
            category, brand, base_price, unit_price and bronze_timestamp.
        batch_id: Micro-batch identifier; only used in the log output.

    Raises:
        Exception: Any error raised by Spark while checking, creating or
            merging into the target table is propagated unchanged.
    """
    # Count once and reuse the number: each count() is a separate Spark action.
    product_count = batch_df.count()
    if product_count == 0:
        return

    # Run the catalog lookup and the MERGE on the session that owns the
    # micro-batch, so the temp view registered below is visible to the SQL.
    session = batch_df.sparkSession

    # Check for the target explicitly instead of pattern-matching an error
    # message: a "not found" error about the temp view or a column must not be
    # mistaken for a missing target table and turned into a blind append.
    if not session.catalog.tableExists(bronze_products_table):
        batch_df.write.format("delta").mode("append").saveAsTable(bronze_products_table)
        print(f"  Batch {batch_id}: ‚úì Created table with {product_count} products")
        return

    batch_df.createOrReplaceTempView("products_batch")

    merge_query = f"""
    MERGE INTO {bronze_products_table} target
    USING products_batch source
    ON target.product_id = source.product_id
    WHEN MATCHED THEN
        UPDATE SET
            target.product_name = source.product_name,
            target.category = source.category,
            target.brand = source.brand,
            target.base_price = source.base_price,
            target.unit_price = source.unit_price,
            target.bronze_timestamp = source.bronze_timestamp
    WHEN NOT MATCHED THEN
        INSERT (product_id, product_name, category, brand, base_price, unit_price, bronze_timestamp)
        VALUES (source.product_id, source.product_name, source.category, source.brand, 
                source.base_price, source.unit_price, source.bronze_timestamp)
    """

    session.sql(merge_query)
    print(f"  Batch {batch_id}: ‚úì Upserted {product_count} products")

# Start the products query; each micro-batch is handed to upsert_products.
# format() is kept ahead of foreachBatch(), matching the original call order.
# The trigger is availableNow; a processingTime="10 seconds" trigger was
# previously used here and is left out.
products_query = (
    products_stream.writeStream
    .format("delta")
    .outputMode("append")
    .options(checkpointLocation=products_checkpoint, mergeSchema="true")
    .trigger(availableNow=True)
    .foreachBatch(upsert_products)
    .start()
)

print(f"‚úì Products stream started: {products_query.id}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor Streams

# COMMAND ----------

print("\n" + "=" * 70, "ACTIVE STREAMING QUERIES", "=" * 70, sep="\n")

# Report id, status message and liveness of every active query, followed by
# the figures of its most recent progress entry when one is available.
for query in spark.streams.active:
    print(f"\nQuery ID: {query.id}")
    print(f"  Status: {query.status['message']}")
    print(f"  Active: {query.isActive}")

    if not query.recentProgress:
        continue

    last_progress = query.recentProgress[-1]
    print(f"  Batch: {last_progress.get('batchId', 'N/A')}")
    print(f"  Input Rows: {last_progress.get('numInputRows', 0)}")
    print(f"  Rate: {last_progress.get('processedRowsPerSecond', 0):.2f} rows/sec")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Verify Tables

# COMMAND ----------

import time

print("\n" + "="*70)
print("MONITORING (30 seconds)")
print("="*70)

# Poll both bronze tables six times, five seconds apart, printing row counts.
# A table that cannot be read yet is reported as "waiting...".
for i in range(6):
    time.sleep(5)
    elapsed = (i + 1) * 5

    # Catch Exception rather than everything, so that KeyboardInterrupt and
    # SystemExit are not swallowed by the "waiting..." fallback.
    try:
        orders_count = spark.table(bronze_orders_table).count()
        print(f"‚è±Ô∏è  {elapsed}s - Orders: {orders_count:,}", end="")
    except Exception:
        print(f"‚è±Ô∏è  {elapsed}s - Orders: waiting...", end="")

    try:
        products_count = spark.table(bronze_products_table).count()
        print(f" | Products: {products_count:,}")
    except Exception:
        print(" | Products: waiting...")

print("\n‚úì Monitoring complete")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Orders Analysis

# COMMAND ----------

# Show the ten most recently ingested orders and per-location totals.
# count/sum/desc here come from the pyspark.sql.functions wildcard import.
try:
    orders_df = spark.table(bronze_orders_table)
    orders_count = orders_df.count()

    print(f"\nüìä Orders: {orders_count:,} records")

    if orders_count > 0:
        latest_orders = orders_df.orderBy(desc("bronze_timestamp")).limit(10)
        display(latest_orders)

        orders_by_location = (
            orders_df
            .groupBy("location")
            .agg(
                count("*").alias("orders"),
                sum("total_amount").alias("revenue"),
            )
            .orderBy(desc("orders"))
        )
        display(orders_by_location)
except Exception as exc:
    print(f"‚ö†Ô∏è  Orders table not available: {exc}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Products Analysis

# COMMAND ----------

# Show all products ordered by id, then a count per (category, brand) pair.
try:
    products_df = spark.table(bronze_products_table)
    products_count = products_df.count()

    print(f"\nüìä Products: {products_count:,} records")

    if products_count > 0:
        products_by_id = products_df.orderBy("product_id")
        display(products_by_id)

        products_per_group = (
            products_df
            .groupBy("category", "brand")
            .count()
            .orderBy("category", "brand")
        )
        display(products_per_group)
except Exception as exc:
    print(f"‚ö†Ô∏è  Products table not available: {exc}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Stop Streams

# COMMAND ----------

# Disabled by default: the stop loop below is wrapped in a triple-quoted
# string, so it is only a string literal and is never executed.
# To stop every active streaming query, remove the surrounding triple quotes.
"""
for stream in spark.streams.active:
    print(f"Stopping: {stream.id}")
    stream.stop()
print("‚úì All streams stopped")
"""

# COMMAND ----------

# MAGIC %md
# MAGIC ## Summary

# COMMAND ----------

print("\n" + "="*70)
print("BRONZE LAYER COMPLETE")
print("="*70)

# Final report: row counts of both bronze tables and the ids of the streaming
# queries that are still active.
try:
    orders_count = spark.table(bronze_orders_table).count()
    products_count = spark.table(bronze_products_table).count()

    print("\nRecords:")
    print(f"  Orders: {orders_count:,}")
    print(f"  Products: {products_count:,}")

    # Read the active-query list once so the printed total and the listed ids
    # come from the same snapshot.
    active_streams = spark.streams.active
    print(f"\nActive Streams: {len(active_streams)}")

    for stream in active_streams:
        print(f"  ‚Ä¢ {stream.id}")

except Exception as e:
    # Include the underlying error, as the orders/products analysis cells do,
    # so a real failure is not reported as a bare "not ready yet".
    print(f"‚ö†Ô∏è  Tables not ready yet: {e}")