In [0]:
%pip install azure-eventhub

In [0]:
# Restart the Python process after the %pip install in the previous cell
# (presumably so the freshly installed package is picked up -- confirm against cluster setup).
dbutils.library.restartPython()

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 🥉 BRONZE Notebook: Event Hub to Raw Delta Table
# MAGIC 
# MAGIC Ingests raw event data from Azure Event Hubs directly into the Bronze Delta table.
# MAGIC 
# MAGIC **Goal:** Minimal transformation, retain raw structure for historical replay.

# COMMAND ----------

# --- Configuration Section ---
# Event Hubs endpoint, plus the Databricks secret scope/key that holds the
# shared access key used to build the connection string below.
eh_namespace = "evhns-natraining.servicebus.windows.net"
eh_name = "evh-natraining-bijunew"
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"
shared_access_key_name = "SharedAccessKeyToSendAndListen"

# Table configuration
catalog = "na-dbxtraining"
bronze_schema = "biju_bronze"
bronze_table_name = "eventhubbronzeorderdata"
# The catalog name contains a hyphen, so it is wrapped in backticks.
full_bronze_table_name = f"`{catalog}`.{bronze_schema}.{bronze_table_name}"

# Get connection string
# The key is read from the secret scope at run time; it is never printed.
secret_value = dbutils.secrets.get(scope=keyvault_scope, key=secret_name)
connection_string = (
    f"Endpoint=sb://{eh_namespace}/;"
    f"SharedAccessKeyName={shared_access_key_name};"
    f"SharedAccessKey={secret_value};"
    f"EntityPath={eh_name}"
)

print(f"Target Bronze table: {full_bronze_table_name}")
print("✓ Configuration Ready")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Data Generation (For Demo/Testing)
# MAGIC *In a production environment, this would be an external process.*

# COMMAND ----------

from azure.eventhub import EventHubProducerClient, EventData
from datetime import datetime, timezone
import json
import time
import threading

def send_test_data(count=50, settle_seconds=3):
    """Send synthetic sensor events to the configured Event Hub.

    Events are JSON-encoded and packed into batches; whenever a batch is
    full it is sent and a new one is started.

    Args:
        count: Number of test events to generate and send.
        settle_seconds: Seconds to sleep after sending, before returning.

    Raises:
        Whatever the Event Hubs client raises on connection or send failure.
        The producer is closed in that case as well.
    """
    print(f"Sending {count} test events...")

    sensors = ["temperature", "humidity", "pressure"]
    units = ["celsius", "percent", "hPa"]

    producer = EventHubProducerClient.from_connection_string(connection_string)
    # Context manager guarantees the producer is closed even if a send fails.
    with producer:
        batch = producer.create_batch()

        for i in range(count):
            # The raw data payload
            event = {
                "id": f"test_{datetime.now().strftime('%H%M%S%f')}_{i}",
                "timestamp": datetime.now(timezone.utc).isoformat(),
                "message": f"Test message {i}",
                "sensor": sensors[i % 3],
                "value": 20.0 + i % 10,
                "unit": units[i % 3]
            }
            payload = json.dumps(event)

            # Add to batch
            try:
                batch.add(EventData(payload))
            except ValueError:  # Batch is full, send and start a new one
                producer.send_batch(batch)
                batch = producer.create_batch()
                batch.add(EventData(payload))

        # Flush whatever is left; skip the call when nothing was queued (count == 0).
        if len(batch) > 0:
            producer.send_batch(batch)

    print(f"✓ Sent {count} events. Waiting for availability...")
    time.sleep(settle_seconds)

# Send test data for the pipeline run
#send_test_data(50)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Read Events from Event Hub (Simulated Micro-batch)

# COMMAND ----------

# Shared state between the notebook thread and the background collector.
stop_flag = threading.Event()  # set by the notebook thread to end collection
events = list()                # raw event records appended by the receive callback

def collect_events(max_events=1000):
    """Receive events from partition '0' into the module-level ``events`` list.

    Blocks inside ``client.receive`` until ``stop_flag`` is set -- either by
    the caller or by this function once ``max_events`` records are stored --
    at which point a helper thread closes the client so ``receive`` returns
    and the connection is released. Intended to run in a background thread.

    Args:
        max_events: Upper bound on the number of records kept in ``events``.

    Errors raised while receiving are printed, not re-raised.
    """
    import threading  # Import locally for thread
    from azure.eventhub import EventHubConsumerClient # Import locally for thread

    # Bind the shared objects once. If the notebook cell is re-run, the
    # globals are rebound to fresh objects; a collector from an earlier run
    # must keep using (and be stopped by) its own list and flag.
    sink = events
    stop = stop_flag

    def on_event(partition_context, event):
        if stop.is_set() or len(sink) >= max_events:
            return

        # With max_wait_time set, the callback is also invoked with event=None when idle.
        if event:
            sink.append({
                # Raw data from Event Hub
                'body': event.body_as_str(),
                'event_time': event.enqueued_time.isoformat() if event.enqueued_time else None,
                'offset': event.offset,
                'sequence_number': event.sequence_number,
                'partition_id': partition_context.partition_id
            })

            if len(sink) % 10 == 0:
                print(f"  Collected {len(sink)} events...")

            if len(sink) >= max_events:
                # Cap reached: trigger shutdown instead of idling in receive().
                stop.set()

    client = EventHubConsumerClient.from_connection_string(
        conn_str=connection_string,
        consumer_group="$Default"
    )

    def close_when_stopped():
        # receive() blocks indefinitely; closing the client from another
        # thread is what makes it return.
        stop.wait()
        try:
            client.close()
        except Exception as e:
            print(f"Error while closing Event Hub client: {str(e)}")

    threading.Thread(target=close_when_stopped, daemon=True).start()

    try:
        with client:
            client.receive(
                on_event=on_event,
                # "-1" starts at the BEGINNING of the partition (all retained
                # events), not at the newest ones; use "@latest" for new events only.
                # NOTE: with append-mode writes downstream, every run re-ingests
                # the retained events.
                starting_position="-1",
                max_wait_time=2,
                # NOTE: Only read from one partition for this simple example
                # If you need to read from ALL partitions, remove the partition_id argument below
                partition_id='0'
            )
    except Exception as e:
        print(f"Error during event collection: {str(e)}")
    finally:
        # Release the helper thread if receive() ended on its own (e.g. on error).
        stop.set()

# Start collection
collect_window_seconds = 5  # how long the collector may gather events
join_grace_seconds = 2      # extra time granted for the collector to wind down

print(f"Reading events for max {collect_window_seconds} seconds...")
thread = threading.Thread(
    target=collect_events,
    kwargs={"max_events": 1000},
    daemon=True,
)
thread.start()

# Wait for the collection window; returns early if the collector has already
# ended (for example after a connection error).
thread.join(timeout=collect_window_seconds)

# Stop collection
stop_flag.set()
thread.join(timeout=join_grace_seconds)
if thread.is_alive():
    # The stop flag keeps the callback from recording more events, but the
    # receive call itself has not returned yet.
    print("Warning: collector thread is still running in the background.")

print(f"\n✓ Collected {len(events)} events")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Write to Bronze Delta Table

# COMMAND ----------

from pyspark.sql.functions import current_timestamp
from pyspark.sql.types import LongType, StringType, StructField, StructType

# Explicit schema for the raw event records. Inferring it from dicts fails
# when a column is None in every row and lets the Bronze schema drift from
# run to run. Types match the values stored by the collector: ISO-8601 text
# for event_time, and the string offset / integer sequence number exposed by
# the Event Hubs SDK. Fields are listed alphabetically, the order that
# dict-based inference produces.
raw_event_schema = StructType([
    StructField("body", StringType(), True),
    StructField("event_time", StringType(), True),
    StructField("offset", StringType(), True),
    StructField("partition_id", StringType(), True),
    StructField("sequence_number", LongType(), True),
])

if events:
    # Snapshot the list so the row count reported below matches what is written,
    # even if the collector thread has not fully stopped.
    events_snapshot = list(events)
    print(f"Creating DataFrame from {len(events_snapshot)} events...")

    # 1. Create DataFrame from raw events
    df = spark.createDataFrame(events_snapshot, schema=raw_event_schema)

    # 2. Add pipeline metadata
    df = df.withColumn("ingestion_time", current_timestamp())

    # 3. Write directly to Delta table (append mode)
    # The row count comes from the snapshot; df.count() would run an extra Spark job.
    print(f"\nWriting {len(events_snapshot)} rows to Bronze table: {full_bronze_table_name}")

    (
        df.write
        .format("delta")
        .mode("append")
        .option("mergeSchema", "true")
        .saveAsTable(full_bronze_table_name)
    )

    print(f"✓ Saved {len(events_snapshot)} events to table {full_bronze_table_name}")

else:
    print("⚠️ No events collected. Skipping write to Bronze.")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Verification

# COMMAND ----------

# Read the Bronze table back, report its row count and show the five rows
# with the most recent ingestion_time. Failures are printed, not raised.
try:
    df_bronze = spark.table(full_bronze_table_name)
    total = df_bronze.count()
    print(f"Total records in Bronze: {total}")
    newest_first = df_bronze.orderBy("ingestion_time", ascending=False)
    display(newest_first.limit(5))
except Exception as err:
    print(f"Error reading table: {err}")

In [0]:
%sql
-- Show the contents of the Bronze table written by this notebook.
SELECT *
FROM `na-dbxtraining`.biju_bronze.eventhubbronzeorderdata