In [0]:
%pip install azure-eventhub

In [0]:
%pip install azure-eventhub pandas

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Azure Event Hub - Python SDK (Fixed Version)
# MAGIC 
# MAGIC **✅ Works on shared clusters - no cluster-level library installation needed (Step 1 installs the SDK notebook-scoped with `%pip`)**
# MAGIC 
# MAGIC This version:
# MAGIC - Handles empty Event Hubs properly
# MAGIC - Sends test messages first
# MAGIC - Better error handling

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Install Python SDK

# COMMAND ----------

%pip install azure-eventhub pandas

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Configuration

# COMMAND ----------

# Event Hub Configuration
eh_namespace = "evhns-gdctraining.servicebus.windows.net"
eh_name = "evh-gdctraining-001"

# SECURITY: shared access keys must never be pasted into notebook source.
# Anyone who can read the notebook (or its version history) can use them.
# They are read from a Databricks secret scope at run time instead.
# Keys that were ever committed in plain text should be regenerated in
# Azure Portal -> Event Hub -> Shared access policies.

# TODO: set this to the (Key Vault-backed) secret scope that holds the two
# secrets named below.
SECRET_SCOPE = "<your-secret-scope>"

# READ ACCESS KEY: secret "evh-gdctraining-001-read"
# (key of the evhaccesspolicylisten policy on the Event Hub)
READ_ACCESS_KEY = dbutils.secrets.get(scope=SECRET_SCOPE, key="evh-gdctraining-001-read")

# WRITE ACCESS KEY: secret "evh-gdctraining-001-write"
# (key of the evhaccesspolicysend policy on the Event Hub)
WRITE_ACCESS_KEY = dbutils.secrets.get(scope=SECRET_SCOPE, key="evh-gdctraining-001-write")

print("‚úì Configuration loaded")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Build Connection Strings

# COMMAND ----------

# Both connection strings share the same layout: semicolon-separated
# "Key=Value" parts that differ only in policy name and access key.

# Listen-only connection string (policy: evhaccesspolicylisten)
read_connection_string = ";".join([
    f"Endpoint=sb://{eh_namespace}/",
    "SharedAccessKeyName=evhaccesspolicylisten",
    f"SharedAccessKey={READ_ACCESS_KEY}",
    f"EntityPath={eh_name}",
])

# Send-only connection string (policy: evhaccesspolicysend)
write_connection_string = ";".join([
    f"Endpoint=sb://{eh_namespace}/",
    "SharedAccessKeyName=evhaccesspolicysend",
    f"SharedAccessKey={WRITE_ACCESS_KEY}",
    f"EntityPath={eh_name}",
])

print("‚úì Connection strings built")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: FIRST - Send Test Messages to Event Hub
# MAGIC 
# MAGIC Let's send some test data first so we have something to read

# COMMAND ----------

from azure.eventhub import EventHubProducerClient, EventData
import json
from datetime import datetime, timezone

import time

# Create test messages: one JSON-serialisable dict per (id, sensor, value, unit)
# reading, each stamped with the current UTC time.
sample_readings = [
    ("test_001", "temperature", 22.5, "celsius"),
    ("test_002", "humidity", 65.3, "percent"),
    ("test_003", "pressure", 1013.2, "hPa"),
    ("test_004", "temperature", 23.1, "celsius"),
    ("test_005", "humidity", 67.8, "percent"),
]
test_messages = [
    {
        "id": reading_id,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "sensor": sensor,
        "value": value,
        "unit": unit,
    }
    for reading_id, sensor, value, unit in sample_readings
]

print("Sending test messages to Event Hub...")
print(f"Messages to send: {len(test_messages)}")

try:
    # Create producer
    producer = EventHubProducerClient.from_connection_string(
        conn_str=write_connection_string
    )

    # The context manager closes the producer even when batching or sending
    # raises, so a failed send no longer leaks the connection.
    with producer:
        # Create and send batch
        event_batch = producer.create_batch()

        for msg in test_messages:
            event_batch.add(EventData(json.dumps(msg)))
            print(f"  Added: {msg['id']} - {msg['sensor']}: {msg['value']}")

        # Send the batch
        producer.send_batch(event_batch)

    print(f"\n‚úì Successfully sent {len(test_messages)} messages!")
    print("‚è≥ Wait 5 seconds for messages to be available...")

    # Short pause before the read step runs
    time.sleep(5)

except Exception as e:
    print(f"‚úó Error sending messages: {str(e)}")
    print("\nPlease check:")
    print("  - WRITE_ACCESS_KEY is correct")
    print("  - You have 'Azure Event Hubs Data Sender' permission")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Read from Event Hub (Fixed Version)

# COMMAND ----------

from azure.eventhub import EventHubConsumerClient

# Events gathered by the on_event callback; later cells read this list
collected_events = []

def on_event(partition_context, event):
    """Callback for the consumer client: record one received event.

    Args:
        partition_context: Supplied by the consumer client; not used here.
        event: The received event, or None (None is reported and skipped).

    Side effects:
        Appends a dict with the keys 'body', 'enqueued_time', 'offset',
        'sequence_number' and 'partition_key' to ``collected_events`` and
        prints a progress line for the first 10 events and every 10th after.
        Any exception raised while processing is printed, not propagated.
    """
    try:
        # Nothing to record for a None event
        if event is None:
            print("Received None event (skipping)")
            return

        # Prefer the decoded body; otherwise fall back to the raw body's str()
        try:
            payload = event.body_as_str()
        except Exception:
            payload = str(getattr(event, 'body', "No body"))

        # Metadata attributes that are missing on the event are stored as None
        record = {'body': payload}
        for field in ('enqueued_time', 'offset', 'sequence_number', 'partition_key'):
            record[field] = getattr(event, field, None)

        collected_events.append(record)

        # Progress: every event up to 10, then every 10th
        total = len(collected_events)
        if total <= 10 or total % 10 == 0:
            print(f"‚úì Collected {total} events")

    except Exception as e:
        print(f"Error processing event: {str(e)}")

# Create consumer
import threading
import time

# How long to collect events before the consumer is closed
READ_DURATION_SECONDS = 20

print("Connecting to Event Hub...")
print(f"Reading events (will read for {READ_DURATION_SECONDS} seconds)...\n")

client = EventHubConsumerClient.from_connection_string(
    conn_str=read_connection_string,
    consumer_group="$Default"
)

# Exceptions raised inside the receiver thread; re-raised in the main thread
# below so the existing error report still fires.
receive_errors = []

def _receive_from_start():
    """Run the blocking receive loop and record any exception it raises."""
    try:
        # receive() blocks until the client is closed. max_wait_time is not
        # passed: it only controls how long the SDK waits before calling
        # on_event with None, and does not end the receive loop.
        client.receive(
            on_event=on_event,
            starting_position="-1",  # Start from beginning
        )
    except Exception as exc:
        receive_errors.append(exc)

try:
    receiver_thread = threading.Thread(target=_receive_from_start, daemon=True)
    receiver_thread.start()
    try:
        time.sleep(READ_DURATION_SECONDS)
    finally:
        # Closing the client makes the blocked receive() call return
        client.close()
        receiver_thread.join(timeout=30)

    if receive_errors:
        raise receive_errors[0]

    print(f"\n{'='*60}")
    print(f"‚úì Finished reading from Event Hub")
    print(f"‚úì Total events collected: {len(collected_events)}")
    print(f"{'='*60}\n")

except Exception as e:
    print(f"\n‚úó Error reading from Event Hub: {str(e)}")
    print("\nPlease check:")
    print("  - READ_ACCESS_KEY is correct")
    print("  - You have 'Azure Event Hubs Data Receiver' permission")
    print("  - Event Hub namespace and name are correct")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 6: Display Collected Events

# COMMAND ----------

# Summarise what the read step collected: a preview of up to five events,
# or a short checklist when nothing was received.
if not collected_events:
    for hint in (
        "‚ö†Ô∏è No events collected from Event Hub",
        "\nPossible reasons:",
        "  1. Event Hub is empty - Try running Step 4 to send test messages",
        "  2. Connection issue - Check your access keys",
        "  3. Permissions - Verify you have 'Data Receiver' role",
    ):
        print(hint)
else:
    separator = "=" * 60
    print(f"Found {len(collected_events)} events!\n")

    # Show first few events
    print("First 5 events:")
    print(separator)
    for position, record in enumerate(collected_events[:5], start=1):
        print(f"\nEvent {position}:")
        print(f"  Body: {record['body']}")
        print(f"  Enqueued Time: {record['enqueued_time']}")
        print(f"  Offset: {record['offset']}")
        print(f"  Sequence Number: {record['sequence_number']}")
    print(separator)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 7: Convert to Pandas DataFrame

# COMMAND ----------

import pandas as pd

if collected_events:
    # Convert to Pandas: one row per collected event dict
    df_pandas = pd.DataFrame(collected_events)

    print(f"Created Pandas DataFrame with {len(df_pandas)} rows")
    print("\nDataFrame Info:")
    # info() writes its summary to stdout itself and returns None, so it is
    # called directly; wrapping it in print() added a stray "None" line.
    df_pandas.info()

    print("\nFirst 10 rows:")
    display(df_pandas.head(10))
else:
    print("No events to convert")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 8: Convert to Spark DataFrame

# COMMAND ----------

if collected_events:
    from pyspark.sql.types import (
        BinaryType,
        LongType,
        StringType,
        StructField,
        StructType,
        TimestampType,
    )

    # Explicit schema: inferring types from the dicts fails when a column is
    # None in every row, which is the case for partition_key when events
    # were sent without a partition key (as the test messages here are).
    event_schema = StructType([
        StructField("body", StringType(), True),
        StructField("enqueued_time", TimestampType(), True),
        StructField("offset", StringType(), True),
        StructField("sequence_number", LongType(), True),
        StructField("partition_key", BinaryType(), True),
    ])

    def _event_row(record):
        """Return one collected event as a tuple ordered like event_schema."""
        offset = record["offset"]
        partition_key = record["partition_key"]
        # NOTE(review): the SDK documents partition_key as bytes and offset as
        # str; other representations are coerced so schema checks still pass.
        if isinstance(partition_key, str):
            partition_key = partition_key.encode("utf-8")
        return (
            record["body"],
            record["enqueued_time"],
            None if offset is None else str(offset),
            record["sequence_number"],
            partition_key,
        )

    # Convert to Spark DataFrame
    df_spark = spark.createDataFrame(
        [_event_row(record) for record in collected_events],
        schema=event_schema,
    )

    print("‚úì Created Spark DataFrame")
    print("\nSchema:")
    df_spark.printSchema()

    print(f"\nTotal records: {df_spark.count()}")

    # Display
    display(df_spark)
else:
    print("No events to convert to Spark DataFrame")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 9: Parse JSON Message Bodies

# COMMAND ----------

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Parse the JSON text in "body" into typed columns, keeping the event
# metadata columns next to the parsed fields. Uses df_spark from Step 8.
if not collected_events:
    print("No events to parse")
else:
    # Sample message to understand structure
    print("Sample message body:")
    print(collected_events[0]['body'])
    print()

    # Schema of the JSON payload: (field name, Spark type), all nullable
    message_schema = StructType([
        StructField(field_name, field_type, True)
        for field_name, field_type in (
            ("id", StringType()),
            ("timestamp", StringType()),
            ("sensor", StringType()),
            ("value", DoubleType()),
            ("unit", StringType()),
        )
    ])

    # Event metadata carried through unchanged
    metadata_columns = ["enqueued_time", "offset", "sequence_number"]

    # Parse JSON into a struct, then flatten the struct into top-level columns
    parsed_struct = from_json(col("body"), message_schema).alias("data")
    df_with_struct = df_spark.select(parsed_struct, *[col(name) for name in metadata_columns])
    df_parsed = df_with_struct.select("data.*", *metadata_columns)

    print("‚úì Parsed JSON messages")
    print(f"Total parsed records: {df_parsed.count()}\n")

    display(df_parsed)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 10: Save to Delta Lake

# COMMAND ----------

# Append the Spark DataFrame built in Step 8 to a Delta table.
# NOTE(review): the read steps start from the beginning of the hub on every
# run, so appending here may store the same events again - confirm whether
# de-duplication is needed.
if not collected_events:
    print("No events to save")
else:
    # Output path
    output_path = "/tmp/eventhub/data"

    # Save to Delta (append keeps rows already in the table)
    delta_writer = df_spark.write.format("delta").mode("append")
    delta_writer.save(output_path)

    print(f"‚úì Saved {len(collected_events)} events to Delta Lake")
    print(f"  Location: {output_path}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 11: Read from Delta Lake

# COMMAND ----------

from pyspark.sql.functions import col

# Location written by the save step
output_path = "/tmp/eventhub/data"

# Load the Delta table and show its most recently enqueued rows. Any failure
# (loading, counting or displaying) is reported by the handler below.
try:
    delta_reader = spark.read.format("delta")
    df_delta = delta_reader.load(output_path)

    print("‚úì Delta table loaded")
    print(f"Total records: {df_delta.count()}\n")

    # Newest events first, capped at 100 rows
    newest_first = df_delta.orderBy(col("enqueued_time").desc())
    display(newest_first.limit(100))

except Exception as e:
    print(f"Delta table not found: {str(e)}")
    print("Run Step 10 first to save data to Delta Lake")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 12: Send More Test Messages (Anytime)

# COMMAND ----------

def send_test_messages(count=5, sensor_type="temperature"):
    """Send generated test readings to the Event Hub.

    Args:
        count: Number of messages to generate and send.
        sensor_type: Value of the "sensor" field. "temperature", "humidity"
            and "pressure" get the units "celsius", "percent" and "hPa";
            any other sensor type keeps the previous default, "percent".

    Returns:
        True when every message was sent, False when any error occurred
        (the error is printed, not raised).
    """
    from datetime import datetime, timezone
    import json

    # Units match the sample readings sent in Step 4
    units_by_sensor = {"temperature": "celsius", "humidity": "percent", "pressure": "hPa"}
    unit = units_by_sensor.get(sensor_type, "percent")

    messages = []
    for i in range(count):
        # One UTC clock reading feeds both the id and the timestamp, so the
        # two fields of a message can no longer disagree on time zone.
        now_utc = datetime.now(timezone.utc)
        messages.append({
            "id": f"msg_{now_utc.strftime('%Y%m%d%H%M%S')}_{i}",
            "timestamp": now_utc.isoformat(),
            "sensor": sensor_type,
            "value": 20.0 + i * 0.5,
            "unit": unit
        })

    try:
        producer = EventHubProducerClient.from_connection_string(
            conn_str=write_connection_string
        )

        # The context manager closes the producer even if a send fails
        with producer:
            event_batch = producer.create_batch()
            for msg in messages:
                event = EventData(json.dumps(msg))
                try:
                    event_batch.add(event)
                except ValueError:
                    # Batch is full: flush it and continue in a fresh batch
                    producer.send_batch(event_batch)
                    event_batch = producer.create_batch()
                    event_batch.add(event)

            # Skip the send when there is nothing left to flush (count == 0)
            if len(event_batch) > 0:
                producer.send_batch(event_batch)

        print(f"‚úì Sent {count} test messages")
        for msg in messages:
            print(f"  - {msg['id']}: {msg['sensor']} = {msg['value']}")

        return True
    except Exception as e:
        print(f"‚úó Error: {str(e)}")
        return False

# Send 5 temperature readings
send_test_messages(count=5, sensor_type="temperature")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 13: Read Latest Messages

# COMMAND ----------

import threading
import time

# How long to collect events before the consumer is closed
LATEST_READ_DURATION_SECONDS = 15

# Clear previous events (on_event appends to whatever list this name refers to)
collected_events = []

print("Reading latest messages from Event Hub...")
print(f"(Reading for {LATEST_READ_DURATION_SECONDS} seconds)\n")

client = EventHubConsumerClient.from_connection_string(
    conn_str=read_connection_string,
    consumer_group="$Default"
)

# Exceptions raised inside the receiver thread; re-raised in the main thread
# below so the existing error report still fires.
latest_receive_errors = []

def _receive_for_latest_read():
    """Run the blocking receive loop and record any exception it raises."""
    try:
        # receive() blocks until the client is closed. max_wait_time is not
        # passed: it only controls how long the SDK waits before calling
        # on_event with None, and does not end the receive loop.
        client.receive(
            on_event=on_event,
            starting_position="-1",
        )
    except Exception as exc:
        latest_receive_errors.append(exc)

try:
    latest_receiver_thread = threading.Thread(target=_receive_for_latest_read, daemon=True)
    latest_receiver_thread.start()
    try:
        time.sleep(LATEST_READ_DURATION_SECONDS)
    finally:
        # Closing the client makes the blocked receive() call return
        client.close()
        latest_receiver_thread.join(timeout=30)

    if latest_receive_errors:
        raise latest_receive_errors[0]

    print(f"\n‚úì Collected {len(collected_events)} events")

    if collected_events:
        # Show latest events (the last five in collection order)
        print("\nLatest events:")
        for i, event in enumerate(collected_events[-5:], 1):
            print(f"{i}. {event['body']}")

except Exception as e:
    print(f"‚úó Error: {str(e)}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Troubleshooting Guide
# MAGIC 
# MAGIC ### Error: "NoneType object has no attribute"
# MAGIC **Fixed!** The code now handles None events properly.
# MAGIC 
# MAGIC ### No events collected:
# MAGIC 1. **First, send test messages:** Run Step 4
# MAGIC 2. **Wait a few seconds:** Give Event Hub time to process
# MAGIC 3. **Then read:** Run Step 5
# MAGIC 
# MAGIC ### "Unauthorized" error:
# MAGIC - Check access keys are correct (no extra spaces)
# MAGIC - Verify you have proper permissions:
# MAGIC   - "Azure Event Hubs Data Sender" for writing
# MAGIC   - "Azure Event Hubs Data Receiver" for reading
# MAGIC 
# MAGIC ### Connection timeout:
# MAGIC - Event Hub might be empty - send test messages first
# MAGIC - Check namespace and Event Hub name are correct
# MAGIC - Verify network connectivity from Databricks to Azure

# COMMAND ----------

# MAGIC %md
# MAGIC ## Quick Reference
# MAGIC 
# MAGIC ### Typical Workflow:
# MAGIC 
# MAGIC 1. **Send test messages:** Run Step 4
# MAGIC 2. **Wait 5 seconds**
# MAGIC 3. **Read messages:** Run Step 5
# MAGIC 4. **View data:** Run Steps 6-8
# MAGIC 5. **Parse JSON:** Run Step 9
# MAGIC 6. **Save to Delta:** Run Step 10
# MAGIC 
# MAGIC ### Key Functions:
# MAGIC 
# MAGIC ```python
# MAGIC # Send messages
# MAGIC send_test_messages(count=10, sensor_type="humidity")
# MAGIC 
# MAGIC # Read and convert to Spark
# MAGIC # (Run Step 5, then Step 8)
# MAGIC 
# MAGIC # Save to Delta
# MAGIC df_spark.write.format("delta").mode("append").save("/path")
# MAGIC ```

# COMMAND ----------

# MAGIC %md
# MAGIC ## Next Steps
# MAGIC 
# MAGIC ### ✅ What's Working:
# MAGIC - Sending messages to Event Hub
# MAGIC - Reading messages from Event Hub
# MAGIC - Converting to Spark DataFrames
# MAGIC - Saving to Delta Lake
# MAGIC - Parsing JSON messages
# MAGIC 
# MAGIC ### üöÄ For Production:
# MAGIC 1. **Schedule this notebook as a job** (Workflows ‚Üí Create Job)
# MAGIC 2. **Set up Delta tables** with proper schemas
# MAGIC 3. **Add data validation** and error handling
# MAGIC 4. **Monitor with Databricks monitoring tools**
# MAGIC 
# MAGIC ### üí° Performance Tips:
# MAGIC - For high-volume data: Ask admin to add Spark connector to allowlist
# MAGIC - Use Delta Lake for efficient storage and queries
# MAGIC - Schedule jobs during off-peak hours
# MAGIC - Monitor Event Hub throughput units