In [0]:
%pip install azure-eventhub pandas

In [0]:
%restart_python

In [0]:
# Option A: one shared-access policy that can both send and listen.
# The namespace and hub name are defined once and reused below, so the
# connection string, the variables and the widget default cannot drift apart.
eh_namespace = "evhns-natraining.servicebus.windows.net"
eh_name = "evh-natraining-001"

# The access key is read through dbutils.secrets; it is never hard-coded.
secret_value = dbutils.secrets.get(
    scope="dbx-ss-kv-natraining-2", key="evh-natraining-read-write"
)
send_conn_str = (
    f"Endpoint=sb://{eh_namespace}/;"
    "SharedAccessKeyName=SharedAccessKeyToSendAndListen;"
    f"SharedAccessKey={secret_value};"
    f"EntityPath={eh_name}"
)

# Text widgets read later with dbutils.widgets.get(...).
dbutils.widgets.text("eventhub_name", eh_name)
dbutils.widgets.text("test_message", "Test Biju on Databricks!")


In [0]:
# Databricks notebook source


# Event Hub target: fully qualified namespace and hub (entity) name.
eh_namespace, eh_name = (
    "evhns-natraining.servicebus.windows.net",
    "evh-natraining-biju",
)

# Secret scope passed to dbutils.secrets when the keys are fetched.
keyvault_scope = "dbx-ss-kv-natraining-2"

# The read key and the write key are stored under the same secret name.
read_secret_name = write_secret_name = "evh-natraining-read-write"

print("‚úì Configuration loaded successfully")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Retrieve Access Keys from Key Vault

# COMMAND ----------

# Retrieve the read and write access keys via Databricks Secrets.
# On failure, print best-effort diagnostics and re-raise the original error.
try:
    read_access_key = dbutils.secrets.get(scope=keyvault_scope, key=read_secret_name)
    write_access_key = dbutils.secrets.get(scope=keyvault_scope, key=write_secret_name)
    print("‚úì Successfully retrieved secrets from Key Vault")
    print(f"  - Read secret: {read_secret_name}")
    print(f"  - Write secret: {write_secret_name}")
except Exception as e:
    print(f"‚úó Error retrieving secrets: {str(e)}")
    print("\nTroubleshooting:")
    print("1. Verify secret scope exists:")
    # The diagnostics must not mask the original failure, so every lookup
    # is guarded on its own.
    try:
        print("   Available scopes:", [s.name for s in dbutils.secrets.listScopes()])
    except Exception:
        print("   Cannot list scopes")
    print("\n2. Verify secrets exist in scope:")
    try:
        secrets = dbutils.secrets.list(keyvault_scope)
        print(f"   Secrets in '{keyvault_scope}':", [s.key for s in secrets])
    except Exception:
        # `except Exception` instead of a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        print(f"   Cannot list secrets in scope '{keyvault_scope}'")
    print("\n3. Check Key Vault access permissions")
    raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Build Connection Strings

# COMMAND ----------

def _eventhub_connection_string(policy_name, access_key):
    """Assemble an Event Hub connection string for the given policy and key."""
    parts = [
        f"Endpoint=sb://{eh_namespace}/",
        f"SharedAccessKeyName={policy_name}",
        f"SharedAccessKey={access_key}",
        f"EntityPath={eh_name}",
    ]
    return ";".join(parts)


# NOTE(review): both keys are fetched under the same secret name in this
# notebook, yet two different policy names are used here — confirm that the
# stored key really belongs to these policies.

# Used by the consumer (listen policy).
read_connection_string = _eventhub_connection_string("evhaccesspolicylisten", read_access_key)

# Used by the producer (send policy).
write_connection_string = _eventhub_connection_string("evhaccesspolicysend", write_access_key)

print("‚úì Connection strings built successfully")

# COMMAND ----------



In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: FIRST - Send Test Messages to Event Hub
# MAGIC 
# MAGIC Let's send some test data first so we have something to read

# COMMAND ----------

from azure.eventhub import EventHubProducerClient, EventData
import json
from datetime import datetime, timezone

# Create test messages (each one gets its own UTC timestamp).
test_messages = [
    {"id": "test_00661", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "temperature", "value": 22.5, "unit": "celsius"},
    {"id": "test_06602", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "humidity", "value": 65.3, "unit": "percent"},
    {"id": "test_06603", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "pressure", "value": 1013.2, "unit": "hPa"},
    {"id": "test_00664", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "temperature", "value": 23.1, "unit": "celsius"},
    {"id": "test_00665", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "humidity", "value": 67.8, "unit": "percent"}
]

print("Sending test messages to Event Hub...")
print(f"Messages to send: {len(test_messages)}")

try:
    # The context manager closes the producer even when batching or sending
    # raises, so a failed run does not leave the client open.
    with EventHubProducerClient.from_connection_string(
        conn_str=write_connection_string
    ) as producer:
        # Create and send batch
        event_batch = producer.create_batch()

        for msg in test_messages:
            event_batch.add(EventData(json.dumps(msg)))
            print(f"  Added: {msg['id']} - {msg['sensor']}: {msg['value']}")

        # Send the batch
        producer.send_batch(event_batch)

    print(f"\n‚úì Successfully sent {len(test_messages)} messages!")
    print("‚è≥ Wait 5 seconds for messages to be available...")

    import time
    time.sleep(5)

except Exception as e:
    # Best effort: report the failure and keep the notebook running.
    print(f"‚úó Error sending messages: {str(e)}")
    print("\nPlease check:")
    print("  - Write access key from Key Vault is correct")
    print("  - You have 'Azure Event Hubs Data Sender' permission")
    print("  - Event Hub namespace and name are correct")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 6: Read from Event Hub

# COMMAND ----------

from azure.eventhub import EventHubConsumerClient

# Module-level buffer that on_event appends one dict per received event to.
collected_events = []

def on_event(partition_context, event):
    """Receive callback: record a summary of `event` in `collected_events`.

    A ``None`` event is reported and skipped. Any exception raised while
    handling an event is printed instead of propagated.
    """
    try:
        if event is None:
            print("Received None event (skipping)")
            return

        # Prefer the decoded body; fall back to the raw body's str() form.
        try:
            payload = event.body_as_str()
        except Exception:
            payload = str(event.body) if hasattr(event, 'body') else "No body"

        record = {'body': payload}
        for field in ('enqueued_time', 'offset', 'sequence_number', 'partition_key'):
            # Missing attributes are stored as None.
            record[field] = getattr(event, field, None)
        collected_events.append(record)

        # Report each of the first ten events, then every tenth one.
        total = len(collected_events)
        if total <= 10 or total % 10 == 0:
            print(f"‚úì Collected {total} events")

    except Exception as exc:
        print(f"Error processing event: {str(exc)}")

# Create consumer
print("Connecting to Event Hub...")
print("Reading events (will read for 20 seconds)...\n")

import threading
import time

# receive() blocks until the client is closed. max_wait_time only sets how
# long the SDK waits without traffic before it calls on_event(..., None); it
# does not end the call. To read for a bounded period, receive() runs on a
# worker thread and the client is closed after READ_SECONDS.
READ_SECONDS = 20

client = EventHubConsumerClient.from_connection_string(
    conn_str=read_connection_string,
    consumer_group="$Default"
)

# Errors the SDK reports while receiving (delivered on the worker thread).
receive_errors = []

def _remember_receive_error(partition_context, error):
    """on_error callback: keep the error so the main thread can report it."""
    receive_errors.append(error)

try:
    receive_thread = threading.Thread(
        target=client.receive,
        kwargs={
            "on_event": on_event,
            "on_error": _remember_receive_error,
            "starting_position": "-1",  # Start from beginning
            "max_wait_time": READ_SECONDS,
        },
        daemon=True,
    )
    receive_thread.start()
    try:
        time.sleep(READ_SECONDS)
    finally:
        # Closing the client is what makes receive() return.
        client.close()
        receive_thread.join()

    # Route SDK errors to the handler below when nothing could be read.
    if receive_errors and not collected_events:
        raise receive_errors[0]

    print(f"\n{'='*60}")
    print(f"‚úì Finished reading from Event Hub")
    print(f"‚úì Total events collected: {len(collected_events)}")
    print(f"{'='*60}\n")

except Exception as e:
    print(f"\n‚úó Error reading from Event Hub: {str(e)}")
    print("\nPlease check:")
    print("  - Read access key from Key Vault is correct")
    print("  - You have 'Azure Event Hubs Data Receiver' permission")
    print("  - Event Hub namespace and name are correct")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 7: Display Collected Events

# COMMAND ----------

# Preview the first events that were read, or print hints when none were.
if not collected_events:
    print("‚ö†Ô∏è No events collected from Event Hub")
    print("\nPossible reasons:")
    print("  1. Event Hub is empty - Try running Step 5 to send test messages")
    print("  2. Connection issue - Check your Key Vault secrets")
    print("  3. Permissions - Verify you have 'Data Receiver' role")
else:
    print(f"Found {len(collected_events)} events!\n")

    divider = "=" * 60
    print("First 5 events:")
    print(divider)
    for position, item in enumerate(collected_events[:5], start=1):
        print(f"\nEvent {position}:")
        print(f"  Body: {item['body']}")
        print(f"  Enqueued Time: {item['enqueued_time']}")
        print(f"  Offset: {item['offset']}")
        print(f"  Sequence Number: {item['sequence_number']}")
    print(divider)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 8: Convert to Pandas DataFrame

# COMMAND ----------

import pandas as pd

if collected_events:
    # Convert the list of event dicts to a pandas DataFrame (one row per event).
    df_pandas = pd.DataFrame(collected_events)

    print(f"Created Pandas DataFrame with {len(df_pandas)} rows")
    print("\nDataFrame Info:")
    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() would add a stray "None" line to the output.
    df_pandas.info()

    print("\nFirst 10 rows:")
    display(df_pandas.head(10))
else:
    print("No events to convert")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 9: Convert to Spark DataFrame

# COMMAND ----------

if collected_events:
    from pyspark.sql.types import (
        LongType, StringType, StructField, StructType, TimestampType,
    )

    # An explicit schema is used because inference on the dicts fails when a
    # column is None in every row — which is the case for partition_key when
    # the events were sent without one.
    event_schema = StructType([
        StructField("body", StringType(), True),
        StructField("enqueued_time", TimestampType(), True),
        StructField("offset", StringType(), True),
        StructField("partition_key", StringType(), True),
        StructField("sequence_number", LongType(), True),
    ])

    def _as_text(value):
        """Return None unchanged, decode bytes as UTF-8, otherwise str(value)."""
        if value is None:
            return None
        if isinstance(value, (bytes, bytearray)):
            return bytes(value).decode("utf-8", errors="replace")
        return str(value)

    # Rows are tuples in the same field order as event_schema.
    event_rows = [
        (
            _as_text(item["body"]),
            item["enqueued_time"],
            _as_text(item["offset"]),
            _as_text(item["partition_key"]),
            item["sequence_number"],
        )
        for item in collected_events
    ]

    # Convert to Spark DataFrame
    df_spark = spark.createDataFrame(event_rows, schema=event_schema)

    print("‚úì Created Spark DataFrame")
    print("\nSchema:")
    df_spark.printSchema()

    print(f"\nTotal records: {df_spark.count()}")

    # Display
    display(df_spark)
else:
    print("No events to convert to Spark DataFrame")


In [0]:
# MAGIC %md
# MAGIC ## Step 5: FIRST - Send Test Messages to Event Hub
# MAGIC 
# MAGIC Let's send some test data first so we have something to read

# COMMAND ----------

from azure.eventhub import EventHubProducerClient, EventData
import json
from datetime import datetime, timezone

# Create test messages (each one gets its own UTC timestamp).
test_messages = [
    {"id": "bj_001", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "temperature", "value": 212.5, "unit": "celsius"},
    {"id": "bj_002", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "humidity", "value": 65.3, "unit": "percent"},
    {"id": "bj_003", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "pressure", "value": 1013.2, "unit": "hPa"},
    {"id": "bj_004", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "temperature", "value": 23.1, "unit": "celsius"},
    {"id": "bj_005", "timestamp": datetime.now(timezone.utc).isoformat(), "sensor": "humidity", "value": 67.8, "unit": "percent"}
]

print("Sending test messages to Event Hub...")
print(f"Messages to send: {len(test_messages)}")

try:
    # `with` closes the producer on every exit path, including a failed
    # add() or send_batch(); the original only closed it on success.
    with EventHubProducerClient.from_connection_string(
        conn_str=write_connection_string
    ) as producer:
        # Create and send batch
        event_batch = producer.create_batch()

        for msg in test_messages:
            event_batch.add(EventData(json.dumps(msg)))
            print(f"  Added: {msg['id']} - {msg['sensor']}: {msg['value']}")

        # Send the batch
        producer.send_batch(event_batch)

    print(f"\n‚úì Successfully sent {len(test_messages)} messages!")
    print("‚è≥ Wait 5 seconds for messages to be available...")

    import time
    time.sleep(5)

except Exception as e:
    # Best effort: report the failure and keep the notebook running.
    print(f"‚úó Error sending messages: {str(e)}")
    print("\nPlease check:")
    print("  - Write access key from Key Vault is correct")
    print("  - You have 'Azure Event Hubs Data Sender' permission")
    print("  - Event Hub namespace and name are correct")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 6: Read from Event Hub

# COMMAND ----------

from azure.eventhub import EventHubConsumerClient

# Filled by on_event: one dict per event that was received.
collected_events = []

def on_event(partition_context, event):
    """Store body and metadata of `event` in `collected_events`.

    ``None`` events are announced and ignored; errors are printed, never raised.
    """
    try:
        if event is None:
            print("Received None event (skipping)")
            return

        # Decoded body if possible, otherwise str() of the raw body.
        try:
            text = event.body_as_str()
        except Exception:
            if hasattr(event, 'body'):
                text = str(event.body)
            else:
                text = "No body"

        metadata_fields = ('enqueued_time', 'offset', 'sequence_number', 'partition_key')
        collected_events.append({
            'body': text,
            # Attributes the event does not have are recorded as None.
            **{name: getattr(event, name, None) for name in metadata_fields},
        })

        # Progress output: first ten events, afterwards every tenth.
        seen = len(collected_events)
        if not (seen > 10 and seen % 10 != 0):
            print(f"‚úì Collected {seen} events")

    except Exception as problem:
        print(f"Error processing event: {str(problem)}")

# Create consumer
print("Connecting to Event Hub...")
print("Reading events (will read for 20 seconds)...\n")

import threading
import time

# receive() does not return on its own: max_wait_time is only the idle
# interval after which on_event is called with None. The call is therefore
# run on a background thread and stopped by closing the client after
# READ_SECONDS, so this cell finishes.
READ_SECONDS = 20

client = EventHubConsumerClient.from_connection_string(
    conn_str=read_connection_string,
    consumer_group="$Default"
)

# Errors the SDK reports while receiving (delivered on the worker thread).
receive_errors = []

def _remember_receive_error(partition_context, error):
    """on_error callback: keep the error so the main thread can report it."""
    receive_errors.append(error)

try:
    receive_thread = threading.Thread(
        target=client.receive,
        kwargs={
            "on_event": on_event,
            "on_error": _remember_receive_error,
            "starting_position": "-1",  # Start from beginning
            "max_wait_time": READ_SECONDS,
        },
        daemon=True,
    )
    receive_thread.start()
    try:
        time.sleep(READ_SECONDS)
    finally:
        # close() unblocks receive(); join() waits for the worker to finish.
        client.close()
        receive_thread.join()

    # Route SDK errors to the handler below when nothing could be read.
    if receive_errors and not collected_events:
        raise receive_errors[0]

    print(f"\n{'='*60}")
    print(f"‚úì Finished reading from Event Hub")
    print(f"‚úì Total events collected: {len(collected_events)}")
    print(f"{'='*60}\n")

except Exception as e:
    print(f"\n‚úó Error reading from Event Hub: {str(e)}")
    print("\nPlease check:")
    print("  - Read access key from Key Vault is correct")
    print("  - You have 'Azure Event Hubs Data Receiver' permission")
    print("  - Event Hub namespace and name are correct")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 7: Display Collected Events

# COMMAND ----------

# Print up to five collected events; explain likely causes when there are none.
if collected_events:
    print(f"Found {len(collected_events)} events!\n")

    rule = "=" * 60
    preview = collected_events[:5]
    print("First 5 events:")
    print(rule)
    number = 0
    for entry in preview:
        number += 1
        print(f"\nEvent {number}:")
        print(f"  Body: {entry['body']}")
        print(f"  Enqueued Time: {entry['enqueued_time']}")
        print(f"  Offset: {entry['offset']}")
        print(f"  Sequence Number: {entry['sequence_number']}")
    print(rule)
else:
    print("‚ö†Ô∏è No events collected from Event Hub")
    print("\nPossible reasons:")
    for hint in (
        "  1. Event Hub is empty - Try running Step 5 to send test messages",
        "  2. Connection issue - Check your Key Vault secrets",
        "  3. Permissions - Verify you have 'Data Receiver' role",
    ):
        print(hint)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 8: Convert to Pandas DataFrame

# COMMAND ----------

import pandas as pd

if collected_events:
    # Convert the list of event dicts to a pandas DataFrame (one row per event).
    df_pandas = pd.DataFrame(collected_events)

    print(f"Created Pandas DataFrame with {len(df_pandas)} rows")
    print("\nDataFrame Info:")
    # info() writes the summary to stdout and returns None, so it must not be
    # passed to print() (that would print an extra "None").
    df_pandas.info()

    print("\nFirst 10 rows:")
    display(df_pandas.head(10))
else:
    print("No events to convert")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 9: Convert to Spark DataFrame

# COMMAND ----------

if collected_events:
    from pyspark.sql.types import (
        LongType, StringType, StructField, StructType, TimestampType,
    )

    # Schema inference on the event dicts fails when a column holds only None
    # (partition_key for events sent without a key), so the schema is given
    # explicitly.
    event_schema = StructType([
        StructField("body", StringType(), True),
        StructField("enqueued_time", TimestampType(), True),
        StructField("offset", StringType(), True),
        StructField("partition_key", StringType(), True),
        StructField("sequence_number", LongType(), True),
    ])

    def _as_text(value):
        """Return None unchanged, decode bytes as UTF-8, otherwise str(value)."""
        if value is None:
            return None
        if isinstance(value, (bytes, bytearray)):
            return bytes(value).decode("utf-8", errors="replace")
        return str(value)

    # Rows are tuples in the same field order as event_schema.
    event_rows = [
        (
            _as_text(item["body"]),
            item["enqueued_time"],
            _as_text(item["offset"]),
            _as_text(item["partition_key"]),
            item["sequence_number"],
        )
        for item in collected_events
    ]

    # Convert to Spark DataFrame
    df_spark = spark.createDataFrame(event_rows, schema=event_schema)

    print("‚úì Created Spark DataFrame")
    print("\nSchema:")
    df_spark.printSchema()

    print(f"\nTotal records: {df_spark.count()}")

    # Display
    display(df_spark)
else:
    print("No events to convert to Spark DataFrame")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 10: Parse JSON Message Bodies

# COMMAND ----------

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

if collected_events:
    # Show one raw body so the schema below can be compared against it.
    print("Sample message body:")
    print(collected_events[0]['body'])
    print()

    # All payload fields are nullable strings except the numeric "value".
    message_schema = StructType([
        StructField(
            field_name,
            DoubleType() if field_name == "value" else StringType(),
            True,
        )
        for field_name in ("id", "timestamp", "sensor", "value", "unit")
    ])

    # Parse the JSON body into a struct, then flatten it next to the
    # event metadata columns.
    envelope_columns = ["enqueued_time", "offset", "sequence_number"]
    body_struct = from_json(col("body"), message_schema).alias("data")
    df_parsed = (
        df_spark
        .select(body_struct, *[col(name) for name in envelope_columns])
        .select("data.*", *envelope_columns)
    )

    print("‚úì Parsed JSON messages")
    parsed_total = df_parsed.count()
    print(f"Total parsed records: {parsed_total}\n")

    display(df_parsed)
else:
    print("No events to parse")



In [0]:
%sql
-- NOTE(review): the statement below ends right after the `gdc_dbxtraining.`
-- qualifier with no schema name following it — confirm the intended schema name.
create schema  gdc_dbxtraining.

In [0]:
# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 11: Save to Delta Lake

# COMMAND ----------

# Append the Spark DataFrame of collected events to a Delta location.
if not collected_events:
    print("No events to save")
else:
    # Output path
    output_path = "/tmp/eventhub/data"

    # Append mode: existing data at the path is kept.
    delta_writer = df_spark.write.format("delta").mode("append")
    delta_writer.save(output_path)

    print(f"‚úì Saved {len(collected_events)} events to Delta Lake")
    print(f"  Location: {output_path}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 12: Read from Delta Lake

# COMMAND ----------

from pyspark.sql.functions import col

# Same location that Step 11 writes to.
output_path = "/tmp/eventhub/data"

try:
    # Load the Delta data written earlier.
    delta_reader = spark.read.format("delta")
    df_delta = delta_reader.load(output_path)

    print("‚úì Delta table loaded")
    record_total = df_delta.count()
    print(f"Total records: {record_total}\n")

    # Newest events first, capped at 100 rows.
    newest_first = df_delta.orderBy(col("enqueued_time").desc())
    display(newest_first.limit(100))

except Exception as load_error:
    print(f"Delta table not found: {str(load_error)}")
    print("Run Step 11 first to save data to Delta Lake")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 13: Send More Test Messages (Anytime)

# COMMAND ----------

def send_test_messages(count=5, sensor_type="temperature"):
    """Build `count` synthetic readings and send them to the Event Hub as one batch.

    Args:
        count: Number of messages to generate; values start at 20.0 and
            increase by 0.5 per message.
        sensor_type: Value of the "sensor" field. "temperature" is labelled
            "celsius"; every other type is labelled "percent".

    Returns:
        True when the batch was sent, False when any error occurred (the
        error is printed, not raised).
    """
    from datetime import datetime, timezone
    import json

    messages = [
        {
            "id": f"msg_{datetime.now().strftime('%Y%m%d%H%M%S')}_{i}",
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "sensor": sensor_type,
            "value": 20.0 + i * 0.5,
            "unit": "celsius" if sensor_type == "temperature" else "percent"
        }
        for i in range(count)
    ]

    try:
        # `with` closes the producer on every exit path; previously a failure
        # in add()/send_batch() skipped producer.close().
        with EventHubProducerClient.from_connection_string(
            conn_str=write_connection_string
        ) as producer:
            event_batch = producer.create_batch()
            for msg in messages:
                event_batch.add(EventData(json.dumps(msg)))

            producer.send_batch(event_batch)

        print(f"‚úì Sent {count} test messages")
        for msg in messages:
            print(f"  - {msg['id']}: {msg['sensor']} = {msg['value']}")

        return True
    except Exception as e:
        print(f"‚úó Error: {str(e)}")
        return False

# Send 5 temperature readings
send_test_messages(count=5, sensor_type="temperature")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 14: Read Latest Messages

# COMMAND ----------

# Clear previous events
collected_events = []

print("Reading latest messages from Event Hub...")
print("(Reading for 15 seconds)\n")

import threading
import time

# receive() blocks until the client is closed; max_wait_time is only the idle
# interval before on_event is called with None. The read is bounded by running
# receive() on a worker thread and closing the client after READ_SECONDS.
READ_SECONDS = 15

client = EventHubConsumerClient.from_connection_string(
    conn_str=read_connection_string,
    consumer_group="$Default"
)

# Errors the SDK reports while receiving (delivered on the worker thread).
receive_errors = []

def _remember_receive_error(partition_context, error):
    """on_error callback: keep the error so the main thread can report it."""
    receive_errors.append(error)

try:
    receive_thread = threading.Thread(
        target=client.receive,
        kwargs={
            "on_event": on_event,
            "on_error": _remember_receive_error,
            "starting_position": "-1",
            "max_wait_time": READ_SECONDS,
        },
        daemon=True,
    )
    receive_thread.start()
    try:
        time.sleep(READ_SECONDS)
    finally:
        # Closing the client is what makes receive() return.
        client.close()
        receive_thread.join()

    # Route SDK errors to the handler below when nothing could be read.
    if receive_errors and not collected_events:
        raise receive_errors[0]

    print(f"\n‚úì Collected {len(collected_events)} events")

    if collected_events:
        # Show the last five events that were collected.
        print("\nLatest events:")
        for i, event in enumerate(collected_events[-5:], 1):
            print(f"{i}. {event['body']}")

except Exception as e:
    print(f"‚úó Error: {str(e)}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 15: Verify Key Vault Connection

# COMMAND ----------

# Verify secrets are accessible (without displaying the actual values)
try:
    # List available scopes
    scopes = dbutils.secrets.listScopes()
    print("Available Secret Scopes:")
    for scope in scopes:
        print(f"  - {scope.name}")

    # List secrets in your scope (keys only, not values)
    print(f"\nSecrets in '{keyvault_scope}' scope:")
    secrets = dbutils.secrets.list(keyvault_scope)
    for secret in secrets:
        print(f"  - {secret.key}")

    # Verify the specific secrets exist
    secret_keys = [s.key for s in secrets]

    if read_secret_name in secret_keys:
        print(f"\n‚úì Read secret '{read_secret_name}' found")
    else:
        print(f"\n‚úó Read secret '{read_secret_name}' NOT found")

    if write_secret_name in secret_keys:
        print(f"‚úì Write secret '{write_secret_name}' found")
    else:
        print(f"‚úó Write secret '{write_secret_name}' NOT found")

except Exception as e:
    print(f"Error: {str(e)}")
    print("\nPlease ensure:")
    # Report the scope that was actually queried; the message used to name a
    # hard-coded scope that differs from keyvault_scope.
    print(f"1. Secret scope '{keyvault_scope}' is created and linked to Key Vault")
    print("2. Databricks has access to Key Vault (Managed Identity or Service Principal)")
    print("3. Secrets exist in Key Vault with the correct names")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Troubleshooting Guide
# MAGIC 
# MAGIC ### Error: "Secret does not exist with scope"
# MAGIC **Cause:** Key Vault secret scope is not set up or secrets don't exist
# MAGIC **Solution:**
# MAGIC 1. Verify secret scope exists: Run Step 15
# MAGIC 2. Check scope name is correct: `dbx-ss-kv-natraining-2` (the `keyvault_scope` value from Step 2)
# MAGIC 3. Verify secrets exist in Key Vault:
# MAGIC    - `evh-natraining-read-write` (`read_secret_name`)
# MAGIC    - `evh-natraining-read-write` (`write_secret_name`)
# MAGIC 
# MAGIC ### Error: "PERMISSION_DENIED" from Key Vault
# MAGIC **Cause:** Databricks doesn't have permission to access Key Vault
# MAGIC **Solution:**
# MAGIC 1. Grant Databricks Managed Identity access to Key Vault
# MAGIC 2. Add "Get" and "List" permissions for secrets
# MAGIC 3. Wait 5-10 minutes for permissions to propagate
# MAGIC 
# MAGIC ### Error: "NoneType object has no attribute"
# MAGIC **Fixed!** The code now handles None events properly.
# MAGIC 
# MAGIC ### No events collected:
# MAGIC 1. **First, send test messages:** Run Step 5
# MAGIC 2. **Wait a few seconds:** Give Event Hub time to process
# MAGIC 3. **Then read:** Run Step 6
# MAGIC 
# MAGIC ### "Unauthorized" error when reading/writing:
# MAGIC - Check Key Vault secrets contain valid access keys
# MAGIC - Verify you have proper Event Hub permissions:
# MAGIC   - "Azure Event Hubs Data Sender" for writing
# MAGIC   - "Azure Event Hubs Data Receiver" for reading
# MAGIC 
# MAGIC ### Connection timeout:
# MAGIC - Event Hub might be empty - send test messages first
# MAGIC - Check namespace: `evhns-natraining.servicebus.windows.net` (the `eh_namespace` value from Step 2)
# MAGIC - Verify network connectivity from Databricks to Azure

# COMMAND ----------

# MAGIC %md
# MAGIC ## Quick Reference
# MAGIC 
# MAGIC ### Configuration:
# MAGIC - **Event Hub Namespace:** evhns-natraining.servicebus.windows.net
# MAGIC - **Event Hub Name:** evh-natraining-biju
# MAGIC - **Key Vault Scope:** dbx-ss-kv-natraining-2
# MAGIC - **Read Secret:** evh-natraining-read-write
# MAGIC - **Write Secret:** evh-natraining-read-write
# MAGIC 
# MAGIC ### Typical Workflow:
# MAGIC 
# MAGIC 1. **Setup:** Run Steps 1-4 (install, config, retrieve secrets, build connection strings)
# MAGIC 2. **Send test messages:** Run Step 5
# MAGIC 3. **Wait 5 seconds**
# MAGIC 4. **Read messages:** Run Step 6
# MAGIC 5. **View data:** Run Steps 7-9
# MAGIC 6. **Parse JSON:** Run Step 10
# MAGIC 7. **Save to Delta:** Run Step 11
# MAGIC 
# MAGIC ### Key Functions:
# MAGIC 
# MAGIC ```python
# MAGIC # Send messages
# MAGIC send_test_messages(count=10, sensor_type="humidity")
# MAGIC 
# MAGIC # Verify Key Vault
# MAGIC dbutils.secrets.listScopes()
# MAGIC dbutils.secrets.list(keyvault_scope)
# MAGIC 
# MAGIC # Save to Delta
# MAGIC df_spark.write.format("delta").mode("append").save("/path")
# MAGIC ```

# COMMAND ----------

# MAGIC %md
# MAGIC ## Next Steps
# MAGIC 
# MAGIC ### ✅ What's Working:
# MAGIC - Secure credential management with Key Vault
# MAGIC - Sending messages to Event Hub
# MAGIC - Reading messages from Event Hub
# MAGIC - Converting to Spark DataFrames
# MAGIC - Saving to Delta Lake
# MAGIC - Parsing JSON messages
# MAGIC 
# MAGIC ### 🚀 For Production:
# MAGIC 1. **Schedule this notebook as a job** (Workflows → Create Job)
# MAGIC 2. **Set up Delta tables** with proper schemas
# MAGIC 3. **Add data validation** and error handling
# MAGIC 4. **Monitor with Databricks monitoring tools**
# MAGIC 5. **Set up alerts** for failed jobs
# MAGIC 
# MAGIC ### 💡 Security Best Practices:
# MAGIC - ✅ Using Key Vault for secrets (not hardcoded)
# MAGIC - ✅ Using Databricks secret scopes
# MAGIC - ✅ Secrets never displayed in output
# MAGIC - Consider: Rotate access keys regularly
# MAGIC - Consider: Use Managed Identity for authentication

# new  (stray marker kept as a comment: as a bare name it raised NameError when the cell ran)


In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # Azure Event Hub - Python SDK with Key Vault
# MAGIC 
# MAGIC **✅ Works on shared clusters - no cluster-level library installation needed (the SDK is installed notebook-scoped with `%pip` in Step 1)**
# MAGIC **✅ Uses single shared access key for read and write**

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Install Python SDK

# COMMAND ----------

%pip install azure-eventhub pandas

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Configuration

# COMMAND ----------

# Event Hub target: fully qualified namespace and hub (entity) name.
eh_namespace, eh_name = (
    "evhns-natraining.servicebus.windows.net",
    "evh-natraining-biju",
)

# Secret scope and secret name; one secret serves both reading and writing.
keyvault_scope, secret_name = "dbx-ss-kv-natraining-2", "evh-natraining-read-write"

# Name of the shared access policy (has both send and listen permissions).
shared_access_key_name = "SharedAccessKeyToSendAndListen"

# Widget creation is currently disabled:
#dbutils.widgets.text("eventhub_name", "evh-natraining-001")
#dbutils.widgets.text("test_message", "Test Biju on Databricks!")

print("‚úì Configuration loaded successfully")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Retrieve Access Key from Key Vault

# COMMAND ----------

# Retrieve the access key via Databricks Secrets.
# On failure, print best-effort diagnostics and re-raise the original error.
try:
    secret_value = dbutils.secrets.get(scope=keyvault_scope, key=secret_name)
    print("‚úì Successfully retrieved secret from Key Vault")
    print(f"  - Secret name: {secret_name}")
    print(f"  - Scope: {keyvault_scope}")
except Exception as e:
    print(f"‚úó Error retrieving secret: {str(e)}")
    print("\nTroubleshooting:")
    print("1. Verify secret scope exists:")
    # `except Exception` rather than a bare `except`, so KeyboardInterrupt and
    # SystemExit are not swallowed by the diagnostics.
    try:
        scopes = dbutils.secrets.listScopes()
        print("   Available scopes:", [s.name for s in scopes])
    except Exception:
        print("   Cannot list scopes")

    print("\n2. Verify secret exists in scope:")
    try:
        secrets = dbutils.secrets.list(keyvault_scope)
        print(f"   Secrets in '{keyvault_scope}':", [s.key for s in secrets])
    except Exception:
        print(f"   Cannot list secrets in scope '{keyvault_scope}'")

    print("\n3. Check Key Vault access permissions")
    raise

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 4: Build Connection String

# COMMAND ----------

# One connection string is assembled from the shared policy name and its key;
# the same policy covers both sending and listening.
_conn_parts = (
    f"Endpoint=sb://{eh_namespace}/",
    f"SharedAccessKeyName={shared_access_key_name}",
    f"SharedAccessKey={secret_value}",
    f"EntityPath={eh_name}",
)
send_conn_str = ";".join(_conn_parts)

# Reader and writer both use that single connection string.
read_connection_string = write_connection_string = send_conn_str

print("‚úì Connection string built successfully")
for _summary_line in (
    f"  - Using shared access key: {shared_access_key_name}",
    f"  - Event Hub: {eh_name}",
    f"  - Namespace: {eh_namespace}",
):
    print(_summary_line)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 5: Send Test Messages to Event Hub

# COMMAND ----------

from azure.eventhub import EventHubProducerClient, EventData
import json
from datetime import datetime, timezone

# Get test message from the "test_message" widget. The widget is only created
# in a different cell, so fall back to the default text when it is not
# defined instead of failing on a fresh run.
try:
    test_message_text = dbutils.widgets.get("test_message")
except Exception:
    test_message_text = "Test Biju on Databricks!"

# Create test messages
test_messages = [
    {
        "id": "test_056757",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "message": test_message_text,
        "sensor": "temperature",
        "value": 22.5,
        "unit": "celsius"
    },
    {
        "id": "test_57657",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "message": "Temperature reading 2",
        "sensor": "temperature",
        "value": 23.1,
        "unit": "celsius"
    },
    {
        "id": "test_57573",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "message": "Humidity reading",
        "sensor": "humidity",
        "value": 65.3,
        "unit": "percent"
    },
    {
        "id": "test_088812",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "message": "Pressure reading",
        "sensor": "pressure",
        "value": 1013.2,
        "unit": "hPa"
    },
    {
        "id": "test_0575708",
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "message": "Final test message",
        "sensor": "humidity",
        "value": 67.8,
        "unit": "percent"
    }
]

print("Sending test messages to Event Hub...")
print(f"Event Hub: {eh_name}")
print(f"Messages to send: {len(test_messages)}\n")

try:
    # The context manager closes the producer even when batching or sending
    # raises; previously close() was skipped on failure.
    with EventHubProducerClient.from_connection_string(
        conn_str=write_connection_string
    ) as producer:
        # Create and send batch
        event_batch = producer.create_batch()

        for msg in test_messages:
            event_batch.add(EventData(json.dumps(msg)))
            print(f"  ‚úì Added: {msg['id']} - {msg['message'][:50]}")

        # Send the batch
        producer.send_batch(event_batch)

    print(f"\n{'='*70}")
    print(f"‚úì Successfully sent {len(test_messages)} messages to Event Hub!")
    print(f"{'='*70}")
    print("\n‚è≥ Wait 5 seconds for messages to be available...")

    import time
    time.sleep(5)

except Exception as e:
    # Best effort: report the failure with context and keep the notebook running.
    print(f"\n{'='*70}")
    print(f"‚úó Error sending messages: {str(e)}")
    print(f"{'='*70}")
    print("\nDebugging Information:")
    print(f"  - Event Hub Namespace: {eh_namespace}")
    print(f"  - Event Hub Name: {eh_name}")
    print(f"  - Shared Access Key Name: {shared_access_key_name}")
    print("\nPossible Issues:")
    print("  1. Access key is invalid or expired")
    print("  2. Shared access policy doesn't have Send permission")
    print("  3. Network connectivity issue")
    print("  4. Event Hub namespace or name is incorrect")
# COMMAND ----------



In [0]:
# MAGIC %md
# MAGIC ## Step 6: Read from Event Hub

# COMMAND ----------

from azure.eventhub import EventHubConsumerClient

# Buffer of received events; on_event appends one dict per event.
collected_events = []

def on_event(partition_context, event):
    """Receive callback that records `event` in `collected_events`.

    ``None`` events are skipped silently. Exceptions raised while handling
    an event are printed rather than propagated.
    """
    try:
        if event is None:
            return

        # Use the decoded body when available, else str() of the raw body.
        try:
            body_text = event.body_as_str()
        except Exception:
            body_text = str(event.body) if hasattr(event, 'body') else "No body"

        def _attr_or_none(attr_name):
            """Value of the named event attribute, or None when it is absent."""
            return getattr(event, attr_name, None)

        collected_events.append({
            'body': body_text,
            'enqueued_time': _attr_or_none('enqueued_time'),
            'offset': _attr_or_none('offset'),
            'sequence_number': _attr_or_none('sequence_number'),
            'partition_key': _attr_or_none('partition_key'),
        })

        # Progress: each of the first ten events, then every tenth.
        running_total = len(collected_events)
        if running_total <= 10 or running_total % 10 == 0:
            print(f"  ‚úì Collected {running_total} events")

    except Exception as failure:
        print(f"  Error processing event: {str(failure)}")

# Create consumer
import threading

# How long to keep receiving before the client is closed
READ_DURATION_SECONDS = 20

print("Connecting to Event Hub for reading...")
print(f"Event Hub: {eh_name}")
print("Reading events (will read for 20 seconds)...\n")

client = EventHubConsumerClient.from_connection_string(
    conn_str=read_connection_string,
    consumer_group="$Default"
)

try:
    # receive() blocks until the client is closed. max_wait_time does not bound
    # the read: it is only the idle interval after which on_event is called
    # with None. A timer therefore closes the client from a background thread
    # so the read actually stops after READ_DURATION_SECONDS.
    stop_timer = threading.Timer(READ_DURATION_SECONDS, client.close)
    stop_timer.daemon = True
    stop_timer.start()
    try:
        with client:
            client.receive(
                on_event=on_event,
                starting_position="-1",  # Start from beginning
                max_wait_time=20  # Idle seconds before on_event(None) is called
            )
    finally:
        # If receive() ended early (e.g. it raised), do not leave the timer pending
        stop_timer.cancel()

    print(f"\n{'='*70}")
    print(f"‚úì Finished reading from Event Hub")
    print(f"‚úì Total events collected: {len(collected_events)}")
    print(f"{'='*70}\n")

except Exception as e:
    # Best-effort diagnostics: report the failure and the settings in use
    print(f"\n{'='*70}")
    print(f"‚úó Error reading from Event Hub: {str(e)}")
    print(f"{'='*70}")
    print("\nDebugging Information:")
    print(f"  - Event Hub Namespace: {eh_namespace}")
    print(f"  - Event Hub Name: {eh_name}")
    print(f"  - Shared Access Key Name: {shared_access_key_name}")
    print("\nPossible Issues:")
    print("  1. Access key is invalid or expired")
    print("  2. Shared access policy doesn't have Listen permission")
    print("  3. Network connectivity issue")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 7: Display Collected Events

# COMMAND ----------

# Summarize what the read step collected, previewing at most five events
if not collected_events:
    print("‚ö†Ô∏è No events collected from Event Hub")
    print("\nPossible reasons:")
    print("  1. Event Hub is empty - Try running Step 5 to send test messages")
    print("  2. Connection issue - Check your Key Vault secret")
    print("  3. Permissions - Verify the shared access policy has Listen permission")
else:
    banner = '=' * 70
    rule = "-" * 70

    print(banner)
    print(f"Found {len(collected_events)} events!")
    print(f"{banner}\n")

    print("First 5 events:")
    print(rule)
    position = 0
    for record in collected_events[:5]:
        position += 1
        print(f"\nEvent {position}:")
        # Only the first 100 characters of the body are shown
        print(f"  Body: {record['body'][:100]}...")
        print(f"  Enqueued Time: {record['enqueued_time']}")
        print(f"  Offset: {record['offset']}")
        print(f"  Sequence Number: {record['sequence_number']}")
    print(rule)

# COMMAND ----------
# MAGIC %md
# MAGIC ## Step 8: Convert to Spark DataFrame

# COMMAND ----------

if collected_events:
    from pyspark.sql.types import (
        BinaryType,
        LongType,
        StringType,
        StructField,
        StructType,
        TimestampType,
    )

    # Declare the schema instead of letting Spark infer it from the dicts:
    # inference fails when a field is None in every row, which happens for
    # partition_key whenever the events were sent without a partition key.
    # NOTE(review): types assume azure-eventhub 5.x EventData (offset as str,
    # sequence_number as int, enqueued_time as datetime, partition_key as
    # bytes) - confirm against the installed SDK version.
    event_schema = StructType([
        StructField("body", StringType(), True),
        StructField("enqueued_time", TimestampType(), True),
        StructField("offset", StringType(), True),
        StructField("partition_key", BinaryType(), True),
        StructField("sequence_number", LongType(), True),
    ])

    # Convert to Spark DataFrame
    df_spark = spark.createDataFrame(collected_events, schema=event_schema)

    print("‚úì Created Spark DataFrame")
    print("\nSchema:")
    df_spark.printSchema()

    print(f"\nTotal records: {df_spark.count()}\n")

    # Display
    display(df_spark)
else:
    print("No events to convert to Spark DataFrame")




# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 9: Parse JSON Message Bodies

# COMMAND ----------

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Expand the JSON text in the body column into typed columns
if not collected_events:
    print("No events to parse")
else:
    # Print one raw body so the expected structure is visible
    print("Sample message body:")
    print(collected_events[0]['body'])
    print()

    # Every field is nullable; only "value" is numeric
    message_schema = StructType([
        StructField(field_name, field_type, True)
        for field_name, field_type in [
            ("id", StringType()),
            ("timestamp", StringType()),
            ("message", StringType()),
            ("sensor", StringType()),
            ("value", DoubleType()),
            ("unit", StringType()),
        ]
    ])

    # Event metadata carried through unchanged next to the parsed fields
    metadata_columns = ["enqueued_time", "offset", "sequence_number"]

    parsed_body = from_json(col("body"), message_schema).alias("data")
    df_with_struct = df_spark.select(
        parsed_body, *[col(name) for name in metadata_columns]
    )
    df_parsed = df_with_struct.select("data.*", *metadata_columns)

    print("‚úì Parsed JSON messages")
    print(f"Total parsed records: {df_parsed.count()}\n")

    display(df_parsed)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 10: Save to Delta Lake

# COMMAND ----------

# Append the raw events DataFrame to the Delta location
if not collected_events:
    print("No events to save")
else:
    # Output path
    output_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/eventhub/natraining/data"

    # Append mode adds to whatever is already stored at the path
    delta_writer = df_spark.write.format("delta").mode("append")
    delta_writer.save(output_path)

    print(f"‚úì Saved {len(collected_events)} events to Delta Lake")
    print(f"  Location: {output_path}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 11: Read from Delta Lake

# COMMAND ----------

from pyspark.sql.functions import col

# Location of the Delta data to read back
output_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/eventhub/natraining/data"

try:
    # Load the Delta data stored at output_path
    delta_reader = spark.read.format("delta")
    df_delta = delta_reader.load(output_path)

    print(f"‚úì Delta table loaded")
    print(f"Total records: {df_delta.count()}\n")

    # Show up to 100 rows, most recently enqueued first
    newest_first = df_delta.sort(col("enqueued_time").desc())
    display(newest_first.limit(100))

except Exception as e:
    # Any failure in the steps above is reported as a missing table
    print(f"Delta table not found: {str(e)}")
    print("Run Step 10 first to save data to Delta Lake")



In [0]:


# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 9: Parse JSON Message Bodies

# COMMAND ----------

from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

# Expand the JSON text in the body column into typed columns
if not collected_events:
    print("No events to parse")
else:
    # Print one raw body so the expected structure is visible
    first_body = collected_events[0]['body']
    print("Sample message body:")
    print(first_body)
    print()

    # Schema of the JSON payload: all fields nullable, "value" is the only double
    message_schema = (
        StructType()
        .add(StructField("id", StringType(), True))
        .add(StructField("timestamp", StringType(), True))
        .add(StructField("message", StringType(), True))
        .add(StructField("sensor", StringType(), True))
        .add(StructField("value", DoubleType(), True))
        .add(StructField("unit", StringType(), True))
    )

    # Parse the body into a struct, then flatten it beside the event metadata
    body_struct = from_json(col("body"), message_schema).alias("data")
    df_parsed = df_spark.select(
        body_struct,
        col("enqueued_time"),
        col("offset"),
        col("sequence_number"),
    ).select("data.*", "enqueued_time", "offset", "sequence_number")

    print("‚úì Parsed JSON messages")
    print(f"Total parsed records: {df_parsed.count()}\n")

    display(df_parsed)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 10: Save to Delta Lake

# COMMAND ----------

# Append the raw events DataFrame to the Delta location
if not collected_events:
    print("No events to save")
else:
    # Output path
    output_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/eventhub/natraining/data"

    # Build the writer step by step; "append" keeps existing data at the path
    events_writer = df_spark.write
    events_writer = events_writer.format("delta")
    events_writer = events_writer.mode("append")
    events_writer.save(output_path)

    print(f"‚úì Saved {len(collected_events)} events to Delta Lake")
    print(f"  Location: {output_path}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 11: Read from Delta Lake

# COMMAND ----------

from pyspark.sql.functions import col

# Read from the location the save step writes to. The previous value,
# "/tmp/eventhub/natraining/data", is never written by this notebook, so the
# read could not see the saved events.
output_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/eventhub/natraining/data"

try:
    # Read Delta table
    df_delta = spark.read.format("delta").load(output_path)

    print(f"‚úì Delta table loaded")
    print(f"Total records: {df_delta.count()}\n")

    # Display latest records (up to 100, most recently enqueued first)
    display(df_delta.orderBy(col("enqueued_time").desc()).limit(100))

except Exception as e:
    # Any failure in the steps above is reported as a missing table
    print(f"Delta table not found: {str(e)}")
    print("Run Step 10 first to save data to Delta Lake")

