In [0]:
%pip install azure-eventhub
dbutils.library.restartPython()

In [0]:
%run ./config

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Bronze Layer - Direct Table Creation
# MAGIC Reads enriched events from Event Hub and splits them into Orders & Products tables

# COMMAND ----------

from azure.eventhub import EventHubConsumerClient
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.types import (
    StringType, LongType, DoubleType, IntegerType, TimestampType, StructType, StructField
)
import threading
import time
import json

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Read Events from Event Hub

# COMMAND ----------
# Column layout used when the orders DataFrame is created; every field is
# declared nullable.
orders_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("order_id", StringType()),
        ("customer_id", StringType()),
        ("customer_name", StringType()),
        ("location", StringType()),
        ("product_id", StringType()),
        ("order_status", StringType()),
        ("payment_method", StringType()),
        ("quantity", IntegerType()),          # or LongType if needed
        ("discount_pct", DoubleType()),
        ("total_amount", DoubleType()),
        ("order_timestamp", StringType()),    # or TimestampType if parsed
        ("event_time", StringType()),
        ("kafka_offset", StringType()),
        ("partition_id", StringType()),
    )
])

# State shared between the notebook and the collector thread:
# - stop_flag: once set, the receive callback ignores further events.
# - events: buffer the receive callback appends one dict per event to.
stop_flag = threading.Event()
events = []

def collect_events(max_events=1000):
    """Collect events from Event Hub into the module-level ``events`` list.

    Runs the blocking ``EventHubConsumerClient.receive`` loop, reading every
    partition from the beginning. ``receive`` does not return on its own, so a
    small watcher thread closes the client once ``stop_flag`` is set or
    ``max_events`` events are buffered; that ends the loop and releases the
    connection instead of leaving a consumer running in the background.

    Failures are reported with ``print`` rather than raised, so a connection
    problem is visible but does not kill the calling thread.

    Args:
        max_events: Upper bound on the number of events kept in ``events``.
    """
    receive_finished = threading.Event()

    def on_event(partition_context, event):
        # Drop anything that arrives after a stop request or once full.
        if stop_flag.is_set() or len(events) >= max_events:
            return

        # With max_wait_time set, the callback can fire with event=None.
        if event:
            events.append({
                'body': event.body_as_str(),
                'event_time': event.enqueued_time.isoformat() if event.enqueued_time else None,
                'offset': event.offset,
                'sequence_number': event.sequence_number,
                'partition_id': partition_context.partition_id
            })

            if len(events) % 50 == 0:
                print(f"  Collected {len(events)} events...")

    client = EventHubConsumerClient.from_connection_string(
        conn_str=connection_string,
        consumer_group="$Default"
    )

    def close_when_done():
        # Poll until receive() has returned. close() is re-issued on every
        # tick after the stop condition holds, which also covers the case
        # where the stop request lands before receive() has started.
        while not receive_finished.wait(timeout=0.5):
            if stop_flag.is_set() or len(events) >= max_events:
                client.close()

    closer = threading.Thread(target=close_when_done)
    closer.daemon = True
    closer.start()

    try:
        with client:
            client.receive(
                on_event=on_event,
                starting_position="-1",  # Read from beginning
                max_wait_time=3
            )
    except Exception as exc:
        # Best-effort collection: surface the reason instead of hiding it.
        print(f"  Event Hub receive failed: {type(exc).__name__}: {exc}")
    finally:
        receive_finished.set()

# Announce the run, then collect in a background thread so the notebook can
# put a time limit on the (blocking) receive call.
print("=" * 70)
print("READING EVENTS FROM EVENT HUB")
print("=" * 70)
print(f"Event Hub: {eh_name}")
print("Reading all available events...\n")

thread = threading.Thread(
    target=collect_events,
    kwargs={"max_events": 1000},
    daemon=True,
)
thread.start()

# Poll once per second, for at most 15 seconds.
seconds_waited = 0
while seconds_waited < 15:
    time.sleep(1)
    if len(events) >= 300:  # Got all expected events
        print(f"  Reached target: {len(events)} events")
        break
    if seconds_waited >= 5 and len(events) > 0:  # Some events after the grace period
        break
    seconds_waited += 1

# Tell the callback to stop buffering and give the thread a moment to finish.
stop_flag.set()
thread.join(timeout=2)

print("\n" + "=" * 70)
print(f"✓ Collected {len(events)} events from Event Hub")
print("=" * 70)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Parse Events

# COMMAND ----------

# Nothing was collected: explain the likely causes and ask Databricks to end
# the notebook run.
if not events:
    for hint in (
        "\n⚠️  NO EVENTS COLLECTED!",
        "\nPossible reasons:",
        "  1. Data Generator (01_Data_Generator) hasn't run yet",
        "  2. Event Hub is empty",
        "  3. Connection issue",
        "\nAction: Run 01_Data_Generator first, then re-run this notebook",
    ):
        print(hint)
    dbutils.notebook.exit("No events to process")

print(f"\nParsing {len(events)} events...")

# (key in the collected record, key it is stored under in the parsed payload)
metadata_columns = (
    ('event_time', 'event_time'),
    ('offset', 'kafka_offset'),
    ('partition_id', 'partition_id'),
    ('sequence_number', 'sequence_number'),
)

# Decode each JSON body and attach the Event Hub metadata to it. Events that
# fail to decode are counted and reported, not fatal.
parsed_events = []
parse_errors = 0

for raw_event in events:
    try:
        payload = json.loads(raw_event['body'])
        for source_key, column in metadata_columns:
            payload[column] = raw_event[source_key]
    except Exception as e:
        parse_errors += 1
        print(f"  Error parsing event: {e}")
    else:
        parsed_events.append(payload)

print(f"✓ Successfully parsed {len(parsed_events)} events")
if parse_errors:
    print(f"⚠️  Failed to parse {parse_errors} events")

# Show sample event
if parsed_events:
    print("\nSample parsed event:")
    print(json.dumps(parsed_events[0], indent=2))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Create Orders DataFrame and Table

# COMMAND ----------

print("\n" + "="*70)
print("CREATING ORDERS TABLE")
print("="*70)


def _optional_cast(value, caster):
    """Return ``caster(value)``, passing ``None`` through unchanged."""
    return None if value is None else caster(value)


# One tuple per parsed event, in the same field order as orders_schema.
# Payload fields are read with evt[...], so an event that lacks one raises
# KeyError; the collector metadata is read with .get() and becomes NULL when
# absent.
orders_data = [
    (
        evt['order_id'],
        evt['customer_id'],
        evt['customer_name'],
        evt['location'],
        evt['product_id'],  # Foreign key to products
        evt['order_status'],
        evt['payment_method'],
        _optional_cast(evt['quantity'], int),
        _optional_cast(evt['discount_pct'], float),
        _optional_cast(evt['total_amount'], float),
        evt['order_timestamp'],
        evt.get('event_time'),
        evt.get('kafka_offset'),
        evt.get('partition_id'),
    )
    for evt in parsed_events
]

# Create DataFrame and stamp each row with the ingestion time
orders_df = spark.createDataFrame(orders_data, schema=orders_schema)
orders_df = orders_df.withColumn("bronze_timestamp", current_timestamp())

# The frame is built from a local list, so its row count is len(orders_data);
# using that avoids launching Spark jobs only to print a number.
orders_count = len(orders_data)
print(f"✓ Created orders DataFrame with {orders_count} records")

# Show schema
print("\nOrders DataFrame schema:")
orders_df.printSchema()

# Write to Unity Catalog table
# NOTE(review): the collector reads from the start of the stream and this
# write appends, so re-running the notebook looks like it will append the same
# events again - confirm whether that is intended.
print(f"\nWriting to table: {bronze_orders_table}")

orders_df.write \
    .format("delta") \
    .mode("append") \
    .option("mergeSchema", "true") \
    .saveAsTable(bronze_orders_table)

print(f"✓ Saved {orders_count} orders to {bronze_orders_table}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Create Products DataFrame and Table

# COMMAND ----------

print("\n" + "="*70)
print("CREATING PRODUCTS TABLE")
print("="*70)

# Explicit schema, matching how the orders DataFrame is built. Relying on
# inference fails when there are no rows or when a column is None in every
# row. product_id is a string in orders_schema; the remaining text columns are
# assumed to be strings as well - TODO confirm against the generator.
products_schema = StructType([
    StructField("product_id", StringType(), True),
    StructField("product_name", StringType(), True),
    StructField("category", StringType(), True),
    StructField("brand", StringType(), True),
    StructField("base_price", DoubleType(), True),
    StructField("unit_price", DoubleType(), True),
])

# Extract unique products: the first event seen for a product_id wins.
products_dict = {}
for evt in parsed_events:
    product_id = evt['product_id']
    if product_id in products_dict:
        continue

    base_price = evt['base_price']
    # unit_price falls back to base_price only when the key is missing; an
    # explicit null unit_price stays null.
    unit_price = evt.get('unit_price', base_price)

    products_dict[product_id] = (
        product_id,
        evt['product_name'],
        evt['category'],
        evt['brand'],
        None if base_price is None else float(base_price),
        None if unit_price is None else float(unit_price),
    )

products_data = list(products_dict.values())

# Create DataFrame and stamp each row with the ingestion time
products_df = spark.createDataFrame(products_data, schema=products_schema)
products_df = products_df.withColumn("bronze_timestamp", current_timestamp())

# Built from a local list, so the row count is len(products_data); no Spark
# job is needed just to report it.
products_count = len(products_data)
print(f"✓ Created products DataFrame with {products_count} unique products")

# Show schema
print("\nProducts DataFrame schema:")
products_df.printSchema()

# Write to Unity Catalog table
# NOTE(review): de-duplication above is per run only; with mode("append") a
# re-run looks like it will add the same product_ids again - confirm whether
# the table is expected to hold one row per product.
print(f"\nWriting to table: {bronze_products_table}")

products_df.write \
    .format("delta") \
    .mode("append") \
    .option("mergeSchema", "true") \
    .saveAsTable(bronze_products_table)

print(f"✓ Saved {products_count} products to {bronze_products_table}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Verify Bronze Tables

# COMMAND ----------

print(f"\n{'=' * 70}")
print("BRONZE LAYER SUMMARY")
print('=' * 70)

# Re-read the orders table so the totals reflect what is stored, not just
# what this run wrote.
orders_table_df = spark.table(bronze_orders_table)
total_orders = orders_table_df.count()

for summary_line in (
    f"\nOrders Table: {bronze_orders_table}",
    f"  Records: {total_orders}",
    f"  New records this run: {orders_count}",
):
    print(summary_line)

print("\nSample orders (latest 10):")
latest_orders = orders_table_df.orderBy(desc("bronze_timestamp")).limit(10)
display(latest_orders)

# COMMAND ----------

# Re-read the products table and report stored totals next to this run's count.
products_table_df = spark.table(bronze_products_table)
total_products = products_table_df.count()

for summary_line in (
    f"\nProducts Table: {bronze_products_table}",
    f"  Records: {total_products}",
    f"  New records this run: {products_count}",
):
    print(summary_line)

print("\nAll products:")
products_by_id = products_table_df.orderBy("product_id")
display(products_by_id)

# COMMAND ----------

# Quick analysis: order count and revenue per product, busiest product first.
print(f"\n{'=' * 70}")
print("QUICK ANALYSIS")
print('=' * 70)

print("\nOrders by product:")
# count / sum / desc are used here as Spark column functions (the file does a
# wildcard import of pyspark.sql.functions), not as the Python builtins.
per_product_stats = orders_table_df.groupBy("product_id").agg(
    count("*").alias("order_count"),
    sum("total_amount").alias("total_revenue"),
)
display(per_product_stats.orderBy(desc("order_count")))

# COMMAND ----------

# Closing banner: stored totals plus a pointer to the next notebook.
closing_lines = (
    "\n" + "=" * 70,
    "✓ BRONZE LAYER COMPLETE",
    "=" * 70,
    f"Orders: {total_orders} records",
    f"Products: {total_products} records",
    "\nData split successfully:",
    "  - Enriched events → Orders table (with product_id FK)",
    "  - Unique products → Products table (with product_id PK)",
    "=" * 70,
    "\nNext: Run 03_Silver_Layer to join Orders with Products",
)
for closing_line in closing_lines:
    print(closing_line)