In [0]:
# Event Hub Configuration
#
# Defines the Event Hub identifiers, pulls the Shared Access Key from the
# Key Vault-backed secret scope, and derives the connection string, the Spark
# Event Hubs options, and the storage / checkpoint paths used by later cells.

eventhub_namespace = "evhns-natraining.servicebus.windows.net"
# NOTE(review): a later configuration cell in this notebook sets
# eventhub_name to "evh-natraining-biju" — confirm which hub is intended.
eventhub_name = "evh-natraining-bijunew"
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"
shared_access_key_name = "SharedAccessKeyToSendAndListen"

# Retrieve the secret exactly once; any failure is reported and re-raised so
# the cell stops instead of building a connection string without a key.
try:
    secret_value = dbutils.secrets.get(
        scope=keyvault_scope,
        key=secret_name
    )
    print("✓ Successfully retrieved secret from Key Vault")
    print(f"  - Secret name: {secret_name}")
    print(f"  - Scope: {keyvault_scope}")
except Exception as e:
    print(f"✗ Error retrieving secret: {str(e)}")
    raise
shared_access_key = secret_value


# Build connection string (contains the secret — never print it unmasked)
connection_string = (
    f"Endpoint=sb://{eventhub_namespace}/;"
    f"SharedAccessKeyName={shared_access_key_name};"
    f"SharedAccessKey={shared_access_key};"
    f"EntityPath={eventhub_name}"
)

# Event Hub configuration for Spark
eh_conf = {
    'eventhubs.connectionString': connection_string,
    'eventhubs.consumerGroup': '$Default'
}

# Storage paths
bronze_orders_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/bronze/orders"
silver_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/silver/order_details"
gold_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/gold/aggregations"

# Checkpoint locations
checkpoint_bronze_orders = "/Volumes/na-dbxtraining/biju_raw/biju_vol/mnt/delta/checkpoints/bronze_orders"
checkpoint_bronze_products = "/Volumes/na-dbxtraining/biju_raw/biju_vol/mnt/delta/checkpoints/bronze_products"
checkpoint_silver = "/Volumes/na-dbxtraining/biju_raw/biju_vol/mnt/delta/checkpoints/silver"
checkpoint_gold = "/Volumes/na-dbxtraining/biju_raw/biju_vol/mnt/delta/checkpoints/gold"

print("✓ Event Hub configuration loaded")
print(f"✓ Event Hub: {eventhub_name}")

In [0]:
%pip install azure-eventhub
# Restart the Python process so the newly installed package can be imported.
# NOTE(review): restarting presumably discards variables defined by earlier
# cells (the configuration above would need to be re-run) — confirm, and
# consider moving this install cell to the top of the notebook.
dbutils.library.restartPython()

In [0]:
# Event Hub Configuration
# Identifiers of the Event Hub and of the Key Vault-backed secret holding its
# Shared Access Key.
eventhub_namespace = "evhns-natraining.servicebus.windows.net"
eventhub_name = "evh-natraining-biju"
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"
shared_access_key_name = "SharedAccessKeyToSendAndListen"

# Fetch the Shared Access Key into 'secret_value'; report and re-raise on
# failure so nothing below runs without a key.
try:
    secret_value = dbutils.secrets.get(scope=keyvault_scope, key=secret_name)
except Exception as e:
    print(f"✗ Error retrieving secret: {str(e)}")
    raise
else:
    print("✓ Successfully retrieved secret from Key Vault")
    print(f"  - Secret name: {secret_name}")
    print(f"  - Scope: {keyvault_scope}")


# Build connection string: the four ';'-separated parts, with the key taken
# from 'secret_value'.
connection_string = ";".join(
    [
        f"Endpoint=sb://{eventhub_namespace}/",
        f"SharedAccessKeyName={shared_access_key_name}",
        f"SharedAccessKey={secret_value}",
        f"EntityPath={eventhub_name}",
    ]
)

print("\n--- Generated Connection String ---")
# Show the connection string with the key replaced by '***' so the secret
# itself never appears in the cell output.
masked_connection_string = connection_string.replace(secret_value, '***')
print(masked_connection_string)

In [0]:
import json
import time
from datetime import datetime
from azure.eventhub import EventHubProducerClient, EventData
import random

# COMMAND ----------

# Product catalog
# Each row follows the column order in _PRODUCT_FIELDS; the rows are expanded
# into one dict per product below.
_PRODUCT_FIELDS = ("product_id", "name", "category", "brand", "base_price")
_PRODUCT_ROWS = [
    ("P001", "Laptop", "Electronics", "Dell", 1200),
    ("P002", "Smartphone", "Electronics", "Samsung", 800),
    ("P003", "Headphones", "Accessories", "Sony", 150),
    ("P004", "Keyboard", "Accessories", "Logitech", 80),
    ("P005", "Monitor", "Electronics", "LG", 400),
    ("P006", "Mouse", "Accessories", "Logitech", 50),
    ("P007", "Tablet", "Electronics", "Apple", 600),
    ("P008", "Webcam", "Accessories", "Logitech", 100),
    ("P009", "Smartwatch", "Electronics", "Apple", 350),
    ("P010", "USB Cable", "Accessories", "Generic", 15),
]
products = [dict(zip(_PRODUCT_FIELDS, row)) for row in _PRODUCT_ROWS]

# Pool of customer display names sampled by the order generator.
customer_names = [
    "John Smith",
    "Jane Doe",
    "Bob Johnson",
    "Alice Williams",
    "Charlie Brown",
    "Diana Davis",
    "Eve Martinez",
    "Frank Wilson",
    "Grace Lee",
    "Henry Taylor",
]

# Pool of order locations sampled by the order generator.
locations = [
    "New York",
    "Los Angeles",
    "Chicago",
    "Houston",
    "Phoenix",
    "Philadelphia",
    "San Antonio",
    "San Diego",
    "Dallas",
    "San Jose",
]

# COMMAND ----------

def generate_enriched_order(order_id):
    """Generate one enriched order event with the product details embedded.

    Args:
        order_id: Integer sequence number; rendered as ``ORD`` followed by the
            number zero-padded to six digits.

    Returns:
        dict: A JSON-serialisable event holding the order fields (ids,
        customer, location, status, payment method, quantity, discount,
        total, timestamp) and the fields of the randomly chosen product.
        ``order_timestamp`` is the current UTC time as a naive ISO-8601
        string (no offset suffix).
    """
    # Imported locally because the notebook's import cell only brings in
    # `datetime` itself.
    from datetime import timezone

    customer_id = f"C{random.randint(1000, 1050):04d}"
    product = random.choice(products)
    quantity = random.randint(1, 5)
    discount = random.choice([0, 0.05, 0.10, 0.15])
    # Unit price after applying the fractional discount.
    price = product["base_price"] * (1 - discount)

    # `datetime.utcnow()` is deprecated; take an aware UTC "now" and drop the
    # tzinfo so the emitted string keeps the original offset-free format.
    order_timestamp = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

    event = {
        # Order Information
        "order_id": f"ORD{order_id:06d}",
        "customer_id": customer_id,
        "customer_name": random.choice(customer_names),
        "location": random.choice(locations),
        "order_status": random.choice(["pending", "confirmed", "shipped"]),
        "payment_method": random.choice(["credit_card", "debit_card", "paypal"]),
        "quantity": quantity,
        "discount_pct": discount,
        "total_amount": round(price * quantity, 2),
        "order_timestamp": order_timestamp,

        # Product Information (embedded)
        "product_id": product["product_id"],
        "product_name": product["name"],
        "category": product["category"],
        "brand": product["brand"],
        "base_price": product["base_price"],
        "unit_price": round(price, 2)
    }

    return event

# COMMAND ----------

def send_events(duration_seconds=60, events_per_batch=5, interval_seconds=1):
    """Send enriched order events to Event Hub.

    Repeatedly builds a batch of generated orders, sends it, and sleeps,
    until ``duration_seconds`` have elapsed. Called with no arguments it
    behaves as before: 5 orders per batch, one batch per second, 60 seconds.

    Args:
        duration_seconds: How long to keep generating events.
        events_per_batch: Number of orders generated per loop iteration.
        interval_seconds: Pause between iterations.

    Raises:
        ValueError: If ``events_per_batch`` is less than 1 or either time
            argument is negative.
    """
    if events_per_batch < 1:
        raise ValueError("events_per_batch must be at least 1")
    if duration_seconds < 0 or interval_seconds < 0:
        raise ValueError("duration_seconds and interval_seconds must be non-negative")

    # The hub name is passed separately, so strip the EntityPath segment from
    # the connection string.
    producer = EventHubProducerClient.from_connection_string(
        conn_str=connection_string.replace(f";EntityPath={eventhub_name}", ""),
        eventhub_name=eventhub_name
    )

    print(f"Starting event generation for {duration_seconds} seconds...")
    print("Each event contains BOTH order and product information\n")

    # Monotonic clock: elapsed time is unaffected by wall-clock adjustments.
    start_time = time.monotonic()
    order_count = 0
    batches_sent = 0

    try:
        while time.monotonic() - start_time < duration_seconds:
            batch = producer.create_batch()
            for offset in range(events_per_batch):
                event = generate_enriched_order(order_count + offset)
                event_data = EventData(json.dumps(event))
                try:
                    batch.add(event_data)
                except ValueError:
                    # Batch hit its size limit: flush it and continue in a
                    # fresh batch so no event is dropped.
                    producer.send_batch(batch)
                    batch = producer.create_batch()
                    batch.add(event_data)

            producer.send_batch(batch)
            # Count orders only once they have actually been sent.
            order_count += events_per_batch
            batches_sent += 1

            # Progress update every 10 iterations (every 50 orders by default)
            if batches_sent % 10 == 0:
                elapsed = time.monotonic() - start_time
                print(f"[{elapsed:.0f}s] Generated: {order_count} enriched orders")

            time.sleep(interval_seconds)

    finally:
        # Always release the connection, even if sending failed.
        producer.close()

    print("\n✓ Generation complete!")
    print(f"✓ Total enriched orders: {order_count}")
    print("✓ Each order contains order details + product details")

# COMMAND ----------

# Run the generator with its default settings; this call blocks until
# send_events() returns.
send_events()

In [0]:
# COMMAND ----------

# Check 1: confirm the data generator completed successfully.
# The generator's final summary line reads "✓ Total enriched orders: <n>";
# a default 60-second run produces roughly 300 orders.
print("Did you run 01_Data_Generator.py and see '✓ Total enriched orders: 300' (or close to it)?")
print("If NO, run 01_Data_Generator.py first!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 2: Verify Event Hub Has Data (Azure Portal)

# COMMAND ----------

# Manual verification steps for the Azure Portal. The hub name is taken from
# the notebook configuration so the instructions always name the hub the
# generator actually writes to.
print("Check Azure Portal:")
print("1. Go to Azure Portal → Your Event Hub Namespace")
print(f"2. Click on '{eventhub_name}' Event Hub")
print("3. Go to 'Overview' or 'Metrics'")
print("4. Look at 'Incoming Messages' metric")
print("")
print("You should see ~300 incoming messages in the last hour")
print("If you see 0 messages, the data generator didn't send data properly")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 3: Test Kafka Connection Directly

# COMMAND ----------

# Import only what is used: a wildcard import of pyspark.sql.functions would
# shadow Python builtins such as round/sum/min/max for every later cell.
from pyspark.sql.functions import col

# Use kafka_options from an earlier configuration step when present;
# otherwise derive it from the Event Hub settings defined in this notebook
# (Event Hubs exposes a Kafka endpoint on port 9093 using SASL_SSL / PLAIN
# with the connection string as the password).
try:
    kafka_options
except NameError:
    kafka_options = {
        "kafka.bootstrap.servers": f"{eventhub_namespace}:9093",
        "subscribe": eventhub_name,
        "kafka.security.protocol": "SASL_SSL",
        "kafka.sasl.mechanism": "PLAIN",
        "kafka.sasl.jaas.config": (
            "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule "
            f'required username="$ConnectionString" password="{connection_string}";'
        ),
        "startingOffsets": "earliest",
    }

print("Testing Kafka connection...")

# Batch-read everything currently retained in the hub (earliest → latest)
test_df = (spark
    .read
    .format("kafka")
    .options(**kafka_options)
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

message_count = test_df.count()
print("\n✓ Successfully connected!")
print(f"✓ Found {message_count} messages in Event Hub")

if message_count == 0:
    print("\n⚠️  WARNING: Event Hub is EMPTY!")
    print("   Run 01_Data_Generator.py to send data")
else:
    print("\n✓ Event Hub has data - streaming should work")
    print("\nSample messages:")
    # Show the first few payloads decoded as strings.
    display(test_df.select(col("value").cast("string")).limit(5))

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 4: Verify Streaming Queries are Running

# COMMAND ----------

# List every streaming query currently active on this Spark session and
# print its id, status and most recent progress report.
active = spark.streams.active
separator = "-" * 60

print(f"Active Streams: {len(active)}")
print(separator)

for query in active:
    print(f"Stream ID: {query.id}")
    print(f"  Status: {query.status}")
    print(f"  Recent Progress: {query.lastProgress}")
    print(separator)

# Nothing running: point the reader at the notebook that starts the streams.
if not active:
    print("\n⚠️  No active streams!")
    print("   Run 02_Bronze_Layer.py to start streams")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 5: Check Checkpoint Locations

# COMMAND ----------

print("Checkpoint locations:")
print(f"Orders:   {checkpoint_bronze_orders}")
print(f"Products: {checkpoint_bronze_products}")
print("")

# Check if checkpoints exist. Listing a missing directory raises, which is
# expected on a first run; the underlying error is shown so other failures
# (e.g. permissions) are not mistaken for a missing checkpoint.
for checkpoint_label, checkpoint_path in (
    ("Orders", checkpoint_bronze_orders),
    ("Products", checkpoint_bronze_products),
):
    try:
        checkpoint_files = dbutils.fs.ls(checkpoint_path)
    except Exception as e:
        print(f"✗ {checkpoint_label} checkpoint doesn't exist yet (this is OK if first run)")
        print(f"  - Details: {type(e).__name__}: {e}")
    else:
        print(f"✓ {checkpoint_label} checkpoint exists with {len(checkpoint_files)} files")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 6: Try Reading with Different Starting Offset

# COMMAND ----------

print("Testing with 'latest' offset (only NEW messages)...")

# Same Kafka settings as the shared configuration, but with the starting
# offset overridden for this test; the shared kafka_options is left untouched.
test_options = {**kafka_options, "startingOffsets": "latest"}

# Only the streaming reader is defined here; no query is started.
stream_reader = spark.readStream.format("kafka").options(**test_options)
test_stream = stream_reader.load()

print("✓ Stream created with 'latest' offset")
print("If your data was sent BEFORE the stream started, it won't appear")
print("Solution: Use 'earliest' offset (which is already in config)")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 7: Manual Bronze Write Test

# COMMAND ----------

print("Testing manual batch write to Bronze...")

from pyspark.sql.types import *

# Batch (non-streaming) read of everything between the earliest and latest
# offsets.
kafka_reader = spark.read.format("kafka").options(**kafka_options)
batch_df = (
    kafka_reader
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

count = batch_df.count()
print(f"Messages in Event Hub: {count}")

if count > 0:
    # Pull a single payload, decoded as a string, to show what a message
    # looks like.
    first_row = batch_df.limit(1).select(col("value").cast("string")).collect()[0]
    sample = first_row[0]
    print("\nSample message:")
    print(sample)

    # Attempt a plain Delta write of the raw batch to a scratch location;
    # a failure is reported rather than raised.
    test_path = "/tmp/test_bronze_orders"
    try:
        batch_df.write.format("delta").mode("overwrite").save(test_path)
    except Exception as e:
        print(f"\n✗ Error writing to Delta: {e}")
    else:
        print(f"\n✓ Successfully wrote to Delta: {test_path}")
        print("The issue is likely with streaming, not with Event Hub or Delta")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Solution: Clear Checkpoints and Restart

# COMMAND ----------



# COMMAND ----------

# MAGIC %md
# MAGIC ## Quick Fix Commands

# COMMAND ----------

# Uncomment and run if needed:

# # Stop all streams
# for stream in spark.streams.active:
#     print(f"Stopping: {stream.id}")
#     stream.stop()
