In [0]:
%run ./00eventhubconfig

In [0]:
%pip install azure-eventhub


In [0]:
 # Restart the Python process; this cell follows the `%pip install azure-eventhub`
 # cell, presumably so the freshly installed package becomes importable.
 dbutils.library.restartPython()

In [0]:
%sh
# Replace 2.12 and 2.3.22 with your cluster's correct versions
# This downloads the JAR and puts it in the Spark classpath upon cluster restart
# NOTE(review): the read cells in this notebook use format("kafka"), not this
# connector - confirm the JAR download is still required.
wget https://repo1.maven.org/maven2/com/microsoft/azure/azure-eventhubs-spark_2.12/2.3.22/azure-eventhubs-spark_2.12-2.3.22.jar -P /databricks/jars

In [0]:




# Event Hub coordinates and the secret scope/key that hold its access key.
eventhub_namespace = "evhns-natraining.servicebus.windows.net"
eventhub_name = "evh-natraining-biju"
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"

# Fetch the access key from the secret scope; report the outcome either way and
# re-raise on failure so the cell stops instead of continuing without a key.
try:
    secret_value = dbutils.secrets.get(scope=keyvault_scope, key=secret_name)
    print("\n".join((
        "✓ Successfully retrieved secret from Key Vault",
        f"  - Secret name: {secret_name}",
        f"  - Scope: {keyvault_scope}",
    )))
except Exception as fetch_error:
    print(f"✗ Error retrieving secret: {str(fetch_error)}")
    raise

# Key value and key name used when the connection string is assembled.
# (A previously tried key name was "SharedAccessKeyToSendAndListen".)
shared_access_key = secret_value
shared_access_key_name = "RootManageSharedAccessKey"

# Build connection string
# eventhub_namespace may already be a fully-qualified host name (it is in this
# notebook). Only append the Service Bus domain when it is missing, otherwise the
# endpoint and bootstrap server end up as
# "<ns>.servicebus.windows.net.servicebus.windows.net" and cannot be resolved.
eventhub_domain_suffix = ".servicebus.windows.net"
eventhub_fqdn = (
    eventhub_namespace
    if eventhub_namespace.lower().endswith(eventhub_domain_suffix)
    else f"{eventhub_namespace}{eventhub_domain_suffix}"
)
eventhub_connection_string = (
    f"Endpoint=sb://{eventhub_fqdn}/;"
    f"SharedAccessKeyName={shared_access_key_name};"
    f"SharedAccessKey={shared_access_key}"
)

# Kafka options dictionary
# The JAAS entry embeds the full connection string (including the access key),
# so avoid printing or displaying this dictionary.
# NOTE(review): Databricks documentation shows the login module with a
# "kafkashaded." prefix - confirm the unshaded class name works on this cluster.
kafka_options = {
    "kafka.bootstrap.servers": f"{eventhub_fqdn}:9093",
    "subscribe": eventhub_name,
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": f'org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{eventhub_connection_string}";',
    "kafka.request.timeout.ms": "60000",
    "kafka.session.timeout.ms": "60000",
    "startingOffsets": "earliest",
    "failOnDataLoss": "false"
}

# Common root under /Volumes shared by every data and checkpoint path below.
volume_root = "/Volumes/na-dbxtraining/biju_raw/biju_vol"

# Storage paths
# Built from volume_root so the layers stay consistent; the silver and gold
# paths previously contained an accidental double slash after the root.
bronze_orders_path = f"{volume_root}/bronze/orders"
bronze_products_path = f"{volume_root}/bronze/products"
silver_path = f"{volume_root}/silver/order_details"
gold_path = f"{volume_root}/gold/aggregations"

# Checkpoint locations (one directory per streaming query)
checkpoint_root = f"{volume_root}/mnt/delta/checkpoints"
checkpoint_bronze_orders = f"{checkpoint_root}/bronze_orders"
checkpoint_bronze_products = f"{checkpoint_root}/bronze_products"
checkpoint_silver = f"{checkpoint_root}/silver"
checkpoint_gold = f"{checkpoint_root}/gold"

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# COMMAND ----------

# Schema for enriched event
# Declared as (name, type) pairs: order attributes first, then product attributes.
_enriched_fields = [
    ("order_id", StringType()),
    ("customer_id", StringType()),
    ("customer_name", StringType()),
    ("location", StringType()),
    ("order_status", StringType()),
    ("payment_method", StringType()),
    ("quantity", IntegerType()),
    ("discount_pct", DoubleType()),
    ("total_amount", DoubleType()),
    ("order_timestamp", StringType()),
    ("product_id", StringType()),
    ("product_name", StringType()),
    ("category", StringType()),
    ("brand", StringType()),
    ("base_price", DoubleType()),
    ("unit_price", DoubleType()),
]
enriched_schema = StructType(
    [StructField(field_name, field_type) for field_name, field_type in _enriched_fields]
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Read from Event Hub via Kafka

# COMMAND ----------

# Define the streaming source: Kafka format, configured entirely by kafka_options.
kafka_reader = spark.readStream.format("kafka").options(**kafka_options)
raw_stream = kafka_reader.load()

# NOTE(review): this message is printed right after the reader is defined -
# confirm whether load() alone actually proves connectivity.
print("✓ Connected to Event Hub")

# COMMAND ----------

# Cast the Kafka `value` column to a string, parse it as JSON with
# enriched_schema, then flatten the parsed struct next to the Kafka timestamp
# (exposed as event_time).
body_string = col("value").cast("string")
parsed_stream = raw_stream.select(
    from_json(body_string, enriched_schema).alias("data"),
    col("timestamp").alias("event_time"),
).select("data.*", "event_time")

print("✓ Parsed event stream")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Split into Orders Table

# COMMAND ----------

# Order-level columns kept for the Bronze orders table, in output order.
order_columns = [
    "order_id",
    "customer_id",
    "customer_name",
    "location",
    "product_id",
    "order_status",
    "payment_method",
    "quantity",
    "discount_pct",
    "total_amount",
    "order_timestamp",
    "event_time",
]

# Project the order columns and stamp each row with the ingestion time.
orders_stream = parsed_stream.select(
    *[col(column_name) for column_name in order_columns],
    current_timestamp().alias("bronze_timestamp"),
)

# Append to the Bronze orders Delta location on a 5-second trigger.
orders_writer = (
    orders_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_bronze_orders)
    .trigger(processingTime="5 seconds")
)
orders_query = orders_writer.start(bronze_orders_path)

print("✓ Orders stream started")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Split into Products Table

# COMMAND ----------

# Extract products (deduplicated)
# Keeps only the product attributes of each event and one row per product_id.
# NOTE(review): dropDuplicates on a stream without a watermark presumably keeps
# state for every product_id seen - confirm this is acceptable for the catalogue size.
products_stream = (parsed_stream
    .select(
        col("product_id"),
        col("product_name"),
        col("category"),
        col("brand"),
        col("base_price"),
        col("unit_price"),
        current_timestamp().alias("bronze_timestamp")
    )
    .dropDuplicates(["product_id"])
)

# Write to Bronze Products
# The sink must be bronze_products_path: the monitoring, display and summary
# cells read products from that location, and bronze_orders_path already
# receives the orders stream, which has a different set of columns.
products_query = (products_stream
    .writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_bronze_products)
    .trigger(processingTime='5 seconds')
    .start(bronze_products_path)
)

print("✓ Products stream started")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Monitor Progress

# COMMAND ----------

import time


def _delta_row_count(path):
    """Return the row count of the Delta table at `path`, or 0 if it cannot be read.

    Best effort by design: a failed read (presumably because the stream has not
    created the table yet) is reported as 0 instead of aborting the monitor.
    """
    try:
        return spark.read.format("delta").load(path).count()
    except Exception:
        # `Exception` rather than a bare `except:` so that interrupting the
        # cell (KeyboardInterrupt) still stops the loop.
        return 0


print("\nMonitoring Bronze tables (60 seconds)...")
print("-" * 50)

# 12 polls, 5 seconds apart, matches the 60 seconds announced above.
for i in range(12):
    time.sleep(5)

    orders_count = _delta_row_count(bronze_orders_path)
    products_count = _delta_row_count(bronze_products_path)

    # The sleep has already happened, so label the line with the time elapsed
    # so far (5s .. 60s) rather than the time at the start of the interval.
    print(f"[{(i + 1) * 5:3d}s] Orders: {orders_count:4d} | Products: {products_count:3d}")

print("-" * 50)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Display Bronze Data

# COMMAND ----------

print("=== BRONZE ORDERS ===")
# Show the 20 most recently ingested order rows.
bronze_orders_df = spark.read.format("delta").load(bronze_orders_path)
display(bronze_orders_df.orderBy(desc("bronze_timestamp")).limit(20))

# COMMAND ----------

print("=== BRONZE PRODUCTS ===")
# Show every product row, sorted by product_id.
bronze_products_df = spark.read.format("delta").load(bronze_products_path)
display(bronze_products_df.orderBy("product_id"))

# COMMAND ----------

# Final row counts of both Bronze tables, followed by a closing banner.
orders_final = spark.read.format("delta").load(bronze_orders_path).count()
products_final = spark.read.format("delta").load(bronze_products_path).count()

banner = "=" * 50
summary_lines = [
    "\n" + banner,
    "BRONZE LAYER COMPLETE",
    banner,
    f"Orders:   {orders_final:4d} records",
    f"Products: {products_final:3d} records",
    banner,
    "\n✓ Using Kafka connector - no compatibility issues!",
    "Next: Run 03_Silver_Layer.py",
]
print("\n".join(summary_lines))


In [0]:




# Event Hub coordinates and the secret scope/key that hold its access key
# (same values as the Bronze cell, repeated for this diagnostics cell).
eventhub_namespace = "evhns-natraining.servicebus.windows.net"
eventhub_name = "evh-natraining-biju"
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"

# Fetch the access key from the secret scope; report the outcome either way and
# re-raise on failure so the cell stops instead of continuing without a key.
try:
    secret_value = dbutils.secrets.get(scope=keyvault_scope, key=secret_name)
    print("\n".join((
        "✓ Successfully retrieved secret from Key Vault",
        f"  - Secret name: {secret_name}",
        f"  - Scope: {keyvault_scope}",
    )))
except Exception as fetch_error:
    print(f"✗ Error retrieving secret: {str(fetch_error)}")
    raise

# Key value and key name used when the connection string is assembled.
# (A previously tried key name was "SharedAccessKeyToSendAndListen".)
shared_access_key = secret_value
shared_access_key_name = "RootManageSharedAccessKey"

# Build connection string
# eventhub_namespace may already be a fully-qualified host name (it is in this
# notebook). Only append the Service Bus domain when it is missing, otherwise the
# endpoint and bootstrap server end up as
# "<ns>.servicebus.windows.net.servicebus.windows.net" and cannot be resolved.
eventhub_domain_suffix = ".servicebus.windows.net"
eventhub_fqdn = (
    eventhub_namespace
    if eventhub_namespace.lower().endswith(eventhub_domain_suffix)
    else f"{eventhub_namespace}{eventhub_domain_suffix}"
)
eventhub_connection_string = (
    f"Endpoint=sb://{eventhub_fqdn}/;"
    f"SharedAccessKeyName={shared_access_key_name};"
    f"SharedAccessKey={shared_access_key}"
)

# Kafka options dictionary
# The JAAS entry embeds the full connection string (including the access key),
# so avoid printing or displaying this dictionary.
# NOTE(review): Databricks documentation shows the login module with a
# "kafkashaded." prefix - confirm the unshaded class name works on this cluster.
kafka_options = {
    "kafka.bootstrap.servers": f"{eventhub_fqdn}:9093",
    "subscribe": eventhub_name,
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": f'org.apache.kafka.common.security.plain.PlainLoginModule required username="$ConnectionString" password="{eventhub_connection_string}";',
    "kafka.request.timeout.ms": "60000",
    "kafka.session.timeout.ms": "60000",
    "startingOffsets": "earliest",
    "failOnDataLoss": "false"
}


# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 3: Test Kafka Connection Directly

# COMMAND ----------

from pyspark.sql.functions import *

print("Testing Kafka connection...")

# Batch read over the whole topic (earliest -> latest); every message is
# counted, and only the sample shown at the end is limited to 5 rows.
connection_test_options = {
    **kafka_options,
    "startingOffsets": "earliest",
    "endingOffsets": "latest",
}
test_df = spark.read.format("kafka").options(**connection_test_options).load()

message_count = test_df.count()
print(f"\n✓ Successfully connected!")
print(f"✓ Found {message_count} messages in Event Hub")

if message_count != 0:
    print("\n✓ Event Hub has data - streaming should work")
    print("\nSample messages:")
    sample_bodies = test_df.select(col("value").cast("string")).limit(5)
    display(sample_bodies)
else:
    print("\n⚠️  WARNING: Event Hub is EMPTY!")
    print("   Run 01_Data_Generator.py to send data")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 4: Verify Streaming Queries are Running

# COMMAND ----------

# List the streaming queries currently active in this Spark session.
active = spark.streams.active
divider = "-" * 60

print(f"Active Streams: {len(active)}")
print(divider)

for running_query in active:
    print(f"Stream ID: {running_query.id}")
    print(f"  Status: {running_query.status}")
    print(f"  Recent Progress: {running_query.lastProgress}")
    print(divider)

# Nothing running: point the reader at the notebook that starts the streams.
if not active:
    print("\n⚠️  No active streams!")
    print("   Run 02_Bronze_Layer.py to start streams")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 5: Check Checkpoint Locations

# COMMAND ----------

# Report the configured checkpoint locations and whether each one can be listed.
# checkpoint_bronze_orders / checkpoint_bronze_products are not defined in this
# cell; they come from the configuration cell earlier in the notebook.
print("Checkpoint locations:")
print(f"Orders:   {checkpoint_bronze_orders}")
print(f"Products: {checkpoint_bronze_products}")
print("")

# Check if checkpoints exist
# `except Exception` (not a bare `except:`) keeps the best-effort behaviour for
# listing failures while still letting the cell be interrupted.
try:
    orders_files = dbutils.fs.ls(checkpoint_bronze_orders)
    print(f"✓ Orders checkpoint exists with {len(orders_files)} files")
except Exception:
    print("✗ Orders checkpoint doesn't exist yet (this is OK if first run)")

try:
    products_files = dbutils.fs.ls(checkpoint_bronze_products)
    print(f"✓ Products checkpoint exists with {len(products_files)} files")
except Exception:
    print("✗ Products checkpoint doesn't exist yet (this is OK if first run)")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 6: Try Reading with Different Starting Offset

# COMMAND ----------

print("Testing with 'latest' offset (only NEW messages)...")

# Same options as the main stream, except the starting offset is overridden.
test_options = {**kafka_options, "startingOffsets": "latest"}

# Only defines the stream; no query is started on it in this cell.
latest_reader = spark.readStream.format("kafka").options(**test_options)
test_stream = latest_reader.load()

print("✓ Stream created with 'latest' offset")
print("If your data was sent BEFORE the stream started, it won't appear")
print("Solution: Use 'earliest' offset (which is already in config)")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Check 7: Manual Bronze Write Test

# COMMAND ----------

print("Testing manual batch write to Bronze...")

from pyspark.sql.types import *

# Read in batch mode: everything currently in the topic (earliest -> latest).
batch_df = (spark
    .read
    .format("kafka")
    .options(**kafka_options)
    .option("startingOffsets", "earliest")
    .option("endingOffsets", "latest")
    .load()
)

# Named batch_message_count rather than `count` so the `count` function brought
# in by `from pyspark.sql.functions import *` is not overwritten with an integer.
batch_message_count = batch_df.count()
print(f"Messages in Event Hub: {batch_message_count}")

if batch_message_count > 0:
    # Show one raw message body (cast to string, not parsed)
    sample = batch_df.limit(1).select(col("value").cast("string")).collect()[0][0]
    print("\nSample message:")
    print(sample)

    # Try to write the raw Kafka rows to a scratch Delta location; success here
    # points at the streaming setup rather than Event Hub or Delta.
    try:
        test_path = "/tmp/test_bronze_orders"
        batch_df.write.format("delta").mode("overwrite").save(test_path)
        print(f"\n✓ Successfully wrote to Delta: {test_path}")
        print("The issue is likely with streaming, not with Event Hub or Delta")
    except Exception as e:
        print(f"\n✗ Error writing to Delta: {e}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Solution: Clear Checkpoints and Restart

# COMMAND ----------

# Print copy-pasteable recovery steps. The rm commands are rendered from the
# configured Bronze checkpoint variables so they target the directories the
# Bronze streams actually use (the earlier hard-coded '/mnt/delta/checkpoints'
# is not where this notebook writes its checkpoints).
print("If streams are stuck, clear checkpoints and restart:")
print("")
print("# Stop all streams")
print("for stream in spark.streams.active:")
print("    stream.stop()")
print("")
print("# Clear checkpoints")
print(f"dbutils.fs.rm('{checkpoint_bronze_orders}', True)")
print(f"dbutils.fs.rm('{checkpoint_bronze_products}', True)")
print("")
print("# Re-run 02_Bronze_Layer.py")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Quick Fix Commands

# COMMAND ----------

# Uncomment and run if needed:

# # Stop all streams
# for stream in spark.streams.active:
#     print(f"Stopping: {stream.id}")
#     stream.stop()

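# # Clear checkpoints (the Bronze checkpoint directories configured above)
# dbutils.fs.rm(checkpoint_bronze_orders, True)
# dbutils.fs.rm(checkpoint_bronze_products, True)
# print("✓ Checkpoints cleared")

# # Clear Bronze tables (the Bronze Delta locations configured above)
# dbutils.fs.rm(bronze_orders_path, True)
# dbutils.fs.rm(bronze_products_path, True)
# print("✓ Bronze tables cleared")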

# Now re-run 02_Bronze_Layer.py
