In [0]:
%pip install azure-eventhub

In [0]:
# --- Event Hub / secret-scope settings used by the rest of this cell ---

# Secret scope and secret key passed to dbutils.secrets.get() below.
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"

# Shared access policy name embedded in the connection strings.
shared_access_key_name = "SharedAccessKeyToSendAndListen"

# Event Hubs namespace host (this value already carries the
# ".servicebus.windows.net" suffix) and the Event Hub within it.
eventhub_namespace = "evhns-natraining.servicebus.windows.net"
eventhub_name = "evh-natraining-biju"

# Read the Event Hub credential from the secret scope. Any failure is reported
# and then re-raised so the cell stops instead of continuing without a key.
try:
    secret_value = dbutils.secrets.get(scope=keyvault_scope, key=secret_name)
    for status_line in (
        "âœ“ Successfully retrieved secret from Key Vault",
        f"  - Secret name: {secret_name}",
        f"  - Scope: {keyvault_scope}",
    ):
        print(status_line)
except Exception as err:
    print(f"âœ— Error retrieving secret: {err!s}")
    raise

# The code below uses the secret as the SharedAccessKey value.
shared_access_key = secret_value

# `eventhub_namespace` may hold either the bare namespace name or the fully
# qualified host. Normalise it once: appending the suffix unconditionally
# produced "<ns>.servicebus.windows.net.servicebus.windows.net" for the
# bootstrap server and the JAAS endpoint.
eventhub_domain_suffix = ".servicebus.windows.net"
eventhub_fqdn = (
    eventhub_namespace
    if eventhub_namespace.endswith(eventhub_domain_suffix)
    else f"{eventhub_namespace}{eventhub_domain_suffix}"
)

# Build connection string (entity-scoped: it carries EntityPath)
connection_string = (
    f"Endpoint=sb://{eventhub_fqdn}/;"
    f"SharedAccessKeyName={shared_access_key_name};"
    f"SharedAccessKey={shared_access_key};"
    f"EntityPath={eventhub_name}"
)

# Kafka-style connection for Event Hub (Kafka protocol on port 9093)
kafka_bootstrap_servers = f"{eventhub_fqdn}:9093"
kafka_topic = eventhub_name

# SASL/JAAS configuration for Kafka: the username is the literal
# "$ConnectionString" and the password is the namespace connection string.
# The "kafkashaded." prefix is required on Databricks, whose bundled Kafka
# client classes are shaded under that package.
jaas_config = (
    "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
    'username="$ConnectionString" '
    f'password="Endpoint=sb://{eventhub_fqdn}/;'
    f"SharedAccessKeyName={shared_access_key_name};"
    f'SharedAccessKey={shared_access_key}";'
)


# Storage paths (one directory per medallion layer inside the same volume).
# The silver and gold paths previously contained a stray "//" after "biju_vol";
# they now follow the same layout as the bronze path.
bronze_orders_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/bronze/orders"
silver_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/silver/order_details"
gold_path = "/Volumes/na-dbxtraining/biju_raw/biju_vol/gold/aggregations"

# Checkpoint locations (one per streaming query)
checkpoint_bronze_orders = "/Volumes/na-dbxtraining/biju_raw/biju_vol/mnt/delta/checkpoints/bronze_orders"
checkpoint_bronze_products = "/Volumes/na-dbxtraining/biju_raw/biju_vol/mnt/delta/checkpoints/bronze_products"
checkpoint_silver = "/Volumes/na-dbxtraining/biju_raw/biju_vol/mnt/delta/checkpoints/silver"
checkpoint_gold = "/Volumes/na-dbxtraining/biju_raw/biju_vol/mnt/delta/checkpoints/gold"

# Echo the effective settings, one item per output line.
print(
    "âœ“ Configuration loaded",
    f"âœ“ Event Hub Namespace: {eventhub_namespace}",
    f"âœ“ Event Hub (Kafka Topic): {eventhub_name}",
    f"âœ“ Kafka Bootstrap: {kafka_bootstrap_servers}",
    sep="\n",
)

In [0]:
%pip install kafka-python

In [0]:
# Restart the notebook's Python process after the %pip installs in the cells above.
# NOTE(review): per the Databricks documentation this resets Python state, so
# variables defined in earlier cells presumably have to be re-created before
# later cells can use them — confirm.
dbutils.library.restartPython()

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 🥉 BRONZE Notebook: Streaming Ingestion (Event Hub to Delta using KAFKA)
# MAGIC 
# MAGIC Ingests raw event data from Azure Event Hubs using the Kafka connector (`readStream.format("kafka")`) and writes to the Bronze Delta table.

# COMMAND ----------

# --- Configuration Section ---

# Event Hubs namespace host (already fully qualified) and Event Hub name.
eh_namespace = "evhns-natraining.servicebus.windows.net"
eh_name = "evh-natraining-bijunew"

# Secret scope / secret key holding the Event Hub credential, plus the name of
# the shared access policy.
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"
shared_access_key_name = "SharedAccessKeyToSendAndListen"

# Target table: <catalog>.<schema>.<table>. The catalog name contains a hyphen,
# so it is wrapped in backticks.
catalog = "na-dbxtraining"
bronze_schema = "biju_bronze"
bronze_table_name = "eventhubbronzeorderdata"
full_bronze_table_name = ".".join([f"`{catalog}`", bronze_schema, bronze_table_name])

# Checkpoint path (MANDATORY for Structured Streaming), derived from the table name.
checkpoint_path = (
    "/Volumes/na-dbxtraining/biju_raw/biju_vol/streaming_checkpoints/"
    + bronze_table_name
    + "_kafka/"
)

# Pull the Event Hub credential from the secret scope.
secret_value = dbutils.secrets.get(
    scope=keyvault_scope,
    key=secret_name,
)

# Report the resolved targets, one item per output line.
print(
    f"Target Bronze table: {full_bronze_table_name}",
    f"Checkpoint location: {checkpoint_path}",
    "âœ“ Configuration Ready",
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Read Stream from Event Hub using Kafka Connector

# COMMAND ----------

from pyspark.sql.functions import current_timestamp, col

# Accept `eh_namespace` as either the bare namespace name or the fully
# qualified host, so the bootstrap server is correct in both cases.
eh_domain_suffix = ".servicebus.windows.net"
eh_fqdn = (
    eh_namespace
    if eh_namespace.endswith(eh_domain_suffix)
    else f"{eh_namespace}{eh_domain_suffix}"
)

# The Event Hubs Kafka endpoint authenticates with SASL PLAIN: the username is
# the literal "$ConnectionString" and the password is the FULL connection
# string, not the bare key. If the secret already holds a connection string it
# is used as is; otherwise it is treated as the key and the connection string
# is assembled from it.
if secret_value.strip().startswith("Endpoint="):
    eh_connection_string = secret_value.strip()
else:
    eh_connection_string = (
        f"Endpoint=sb://{eh_fqdn}/;"
        f"SharedAccessKeyName={shared_access_key_name};"
        f"SharedAccessKey={secret_value}"
    )

# Set Kafka options for Azure Event Hubs
kafka_options = {
    # Event Hubs uses the Kafka protocol on port 9093
    "kafka.bootstrap.servers": f"{eh_fqdn}:9093",
    # The Kafka topic is the Event Hub name
    "subscribe": eh_name,
    # Authentication settings required by Azure Event Hubs Kafka endpoint
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    # The "kafkashaded." prefix is required on Databricks, whose bundled Kafka
    # client classes are shaded under that package.
    "kafka.sasl.jaas.config": (
        "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
        'username="$ConnectionString" '
        f'password="{eh_connection_string}";'
    ),
    "kafka.request.timeout.ms": "60000",
    "kafka.session.timeout.ms": "60000",
    "maxOffsetsPerTrigger": "10000"  # Throttle the input rate for micro-batch
}

# Read stream from Event Hub using Kafka connector
df_bronze_stream = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    # Use 'latest' to start reading only new events from when the stream starts
    .option("startingOffsets", "latest")
    .load()
)

# Kafka connector output columns:
# key (binary), value (binary), topic (string), partition (int), offset (long),
# timestamp (timestamp), timestampType (int)
df_bronze_stream.printSchema()

# Apply minimal transformation (Add ingestion time and rename 'value' to 'body')
df_bronze_stream = (
    df_bronze_stream
    .withColumn("ingestion_time", current_timestamp())
    # Rename 'value' (the payload) to 'body' for consistency with the previous structure
    .withColumnRenamed("value", "body")
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Write Stream to Bronze Delta Table

# COMMAND ----------

# (Disabled) schema bootstrap, kept for reference:
# spark.sql(f"CREATE DATABASE IF NOT EXISTS {catalog}.{bronze_schema}")

# Columns persisted to Bronze; 'body' carries the raw payload.
bronze_columns = ["body", "topic", "partition", "offset", "timestamp", "ingestion_time"]
df_write = df_bronze_stream.select(*bronze_columns)

# Configure the Delta sink step by step, then start the query.
stream_writer = (
    df_write.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(processingTime="10 seconds")  # one micro-batch every 10 seconds
)
for option_name, option_value in (
    ("checkpointLocation", checkpoint_path),
    ("mergeSchema", "true"),
):
    stream_writer = stream_writer.option(option_name, option_value)

query = stream_writer.toTable(full_bronze_table_name)

print(f"Starting Bronze stream (KAFKA)... (Query ID: {query.id})")
# NOTE: the stream keeps running until it is manually stopped or the cluster shuts down.

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Stream Verification (Optional)
# MAGIC *Run this in a separate cell after the stream is started.*

# COMMAND ----------

# # Example of how to stop the stream (run in a separate cell if needed)
# # for stream in spark.streams.active:
# #     if stream.name == query.name: # Replace 'query.name' if you didn't define it
# #         print(f"Stopping stream: {stream.id}")
# #         stream.stop()

# # Display status of the stream
# # display(spark.table(full_bronze_table_name))

In [0]:
%sql
-- Drops the Bronze table written by the streaming cells of this notebook
-- (same catalog.schema.table as full_bronze_table_name), if it exists.
-- NOTE(review): the streaming checkpoint directory is not touched here; presumably
-- it has to be cleared as well before re-ingesting into a fresh table — confirm.
drop table if exists `na-dbxtraining`.biju_bronze.eventhubbronzeorderdata

In [0]:
from kafka.admin import KafkaAdminClient
from kafka.errors import KafkaConfigurationError

# Read the Event Hub credential from the secret scope. Any failure is reported
# and then re-raised so the cell stops instead of continuing without a key.
try:
    secret_value = dbutils.secrets.get(scope=keyvault_scope, key=secret_name)
    for status_line in (
        "âœ“ Successfully retrieved secret from Key Vault",
        f"  - Secret name: {secret_name}",
        f"  - Scope: {keyvault_scope}",
    ):
        print(status_line)
except Exception as err:
    print(f"âœ— Error retrieving secret: {err!s}")
    raise

# The code below uses the secret as the SharedAccessKey value.
shared_access_key = secret_value

# `eventhub_namespace` may hold either the bare namespace name or the fully
# qualified host. Normalise it once: appending the suffix unconditionally
# produced "<ns>.servicebus.windows.net.servicebus.windows.net" for the
# bootstrap server and the JAAS endpoint.
eventhub_domain_suffix = ".servicebus.windows.net"
eventhub_fqdn = (
    eventhub_namespace
    if eventhub_namespace.endswith(eventhub_domain_suffix)
    else f"{eventhub_namespace}{eventhub_domain_suffix}"
)

# Build connection string (entity-scoped: it carries EntityPath)
connection_string = (
    f"Endpoint=sb://{eventhub_fqdn}/;"
    f"SharedAccessKeyName={shared_access_key_name};"
    f"SharedAccessKey={shared_access_key};"
    f"EntityPath={eventhub_name}"
)

# Kafka-style connection for Event Hub (Kafka protocol on port 9093)
kafka_bootstrap_servers = f"{eventhub_fqdn}:9093"
kafka_topic = eventhub_name

# SASL/JAAS configuration for Kafka: the username is the literal
# "$ConnectionString" and the password is the namespace connection string.
# The "kafkashaded." prefix is required on Databricks, whose bundled Kafka
# client classes are shaded under that package.
jaas_config = (
    "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
    'username="$ConnectionString" '
    f'password="Endpoint=sb://{eventhub_fqdn}/;'
    f"SharedAccessKeyName={shared_access_key_name};"
    f'SharedAccessKey={shared_access_key}";'
)
# Connectivity check: authenticate against the Event Hubs Kafka endpoint with
# kafka-python and list the visible topics (Event Hubs).

# Accept `eh_namespace` as either the bare namespace name or the fully qualified host.
admin_domain_suffix = ".servicebus.windows.net"
admin_fqdn = (
    eh_namespace
    if eh_namespace.endswith(admin_domain_suffix)
    else f"{eh_namespace}{admin_domain_suffix}"
)

# SASL PLAIN credentials expected by Event Hubs: the username is the literal
# string "$ConnectionString" (previously the unrelated literal "$jaas_config"
# was sent) and the password is the full connection string, not the bare key.
sasl_username = "$ConnectionString"
if secret_value.strip().startswith("Endpoint="):
    # The secret already holds a complete connection string.
    admin_sasl_password = secret_value.strip()
else:
    # The secret holds only the key: assemble the namespace connection string.
    admin_sasl_password = (
        f"Endpoint=sb://{admin_fqdn}/;"
        f"SharedAccessKeyName={shared_access_key_name};"
        f"SharedAccessKey={secret_value}"
    )

admin_client = KafkaAdminClient(
    bootstrap_servers=f"{admin_fqdn}:9093",
    security_protocol="SASL_SSL",
    sasl_mechanism="PLAIN",
    sasl_plain_username=sasl_username,
    sasl_plain_password=admin_sasl_password,
    request_timeout_ms=60000,
    api_version=(0, 10, 2),  # Use a slightly older API version for compatibility if needed
)
try:
    # Attempt to list topics to confirm successful connection and authentication
    topics = admin_client.list_topics()
    print("âœ… Connection Successful! Topics found:", topics)
finally:
    # Always release the connection, even when listing topics fails.
    admin_client.close()
    


In [0]:
# Retrieve secret from Key Vault (used below as the SharedAccessKey value)
secret_value = dbutils.secrets.get(
    scope=keyvault_scope,
    key=secret_name
)

# `eventhub_namespace` may hold either the bare namespace name or the fully
# qualified host; appending the suffix unconditionally produced
# "<ns>.servicebus.windows.net.servicebus.windows.net".
eventhub_domain_suffix = ".servicebus.windows.net"
eventhub_fqdn = (
    eventhub_namespace
    if eventhub_namespace.endswith(eventhub_domain_suffix)
    else f"{eventhub_namespace}{eventhub_domain_suffix}"
)

# Kafka connection options for Azure Event Hubs
kafka_options = {
    "kafka.bootstrap.servers": f"{eventhub_fqdn}:9093",
    "subscribe": eventhub_name,
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    # Username is the literal "$ConnectionString"; password is the namespace
    # connection string. The "kafkashaded." prefix is required on Databricks,
    # whose bundled Kafka client classes are shaded under that package.
    "kafka.sasl.jaas.config": (
        "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
        'username="$ConnectionString" '
        f'password="Endpoint=sb://{eventhub_fqdn}/;'
        f"SharedAccessKeyName={shared_access_key_name};"
        f'SharedAccessKey={secret_value}";'
    ),
    "kafka.request.timeout.ms": "60000",
    "kafka.session.timeout.ms": "60000"
}

# Read stream from Event Hub using Kafka connector
df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    .option("startingOffsets", "latest")
    .load()
)

display(df)

In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 🥉 BRONZE Notebook: Streaming Ingestion (Event Hub to Delta using KAFKA)
# MAGIC 
# MAGIC Ingests raw event data from Azure Event Hubs using the Kafka connector and writes to the Bronze Delta table.
# MAGIC 
# MAGIC **NOTE:** This version uses the Shared Access Policy (Key Name + Key) components in the SASL password.

# COMMAND ----------

# --- Configuration Section ---
# Names follow the conventions of the previous notebooks for consistency.

# Bare Event Hubs namespace name (no DNS suffix) and Event Hub name.
eh_namespace = "evhns-natraining"
eh_name = "evh-natraining-bijunew"

# Secret scope / secret key holding the credential, plus the SAS policy name.
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"
shared_access_key_name = "SharedAccessKeyToSendAndListen"

# Target table: <catalog>.<schema>.<table>. The catalog name contains a hyphen,
# so it is wrapped in backticks.
catalog = "na-dbxtraining"
bronze_schema = "biju_bronze"
bronze_table_name = "eventhubbronzeorderdata"
full_bronze_table_name = ".".join([f"`{catalog}`", bronze_schema, bronze_table_name])

# Checkpoint path (MANDATORY for Structured Streaming), derived from the table name.
checkpoint_path = (
    "/Volumes/na-dbxtraining/biju_raw/biju_vol/streaming_checkpoints/"
    + bronze_table_name
    + "_sas/"
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 1: Prepare Credentials and Kafka Options

# COMMAND ----------

# Retrieve secret (the Shared Access Key value) from Key Vault
secret_value = dbutils.secrets.get(
    scope=keyvault_scope,
    key=secret_name
)

# Accept `eh_namespace` as either the bare namespace name or the fully qualified host.
eh_domain_suffix = ".servicebus.windows.net"
eh_fqdn = (
    eh_namespace
    if eh_namespace.endswith(eh_domain_suffix)
    else f"{eh_namespace}{eh_domain_suffix}"
)

# Construct the SASL password from the policy name and key. The Event Hubs
# Kafka endpoint expects the full connection string here; the previous
# "<SharedAccessKeyName>:<SharedAccessKey>" form is not a supported credential.
sasl_password = (
    f"Endpoint=sb://{eh_fqdn}/;"
    f"SharedAccessKeyName={shared_access_key_name};"
    f"SharedAccessKey={secret_value}"
)


# Kafka connection options for Azure Event Hubs
kafka_options = {
    # Broker address
    "kafka.bootstrap.servers": f"{eh_fqdn}:9093",
    # Topic to subscribe to (Event Hub name)
    "subscribe": eh_name,
    # Authentication settings
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    # The "kafkashaded." prefix is required on Databricks, whose bundled Kafka
    # client classes are shaded under that package.
    "kafka.sasl.jaas.config": (
        "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
        # Use the literal username identifier
        'username="$ConnectionString" '
        # Use the connection string built from the SAS policy name and key
        f'password="{sasl_password}";'
    ),
    # Connection timeout settings
    "kafka.request.timeout.ms": "60000",
    "kafka.session.timeout.ms": "60000",
    "maxOffsetsPerTrigger": "10000"
}

print(f"Target Bronze table: {full_bronze_table_name}")
print(f"Checkpoint location: {checkpoint_path}")
print("âœ“ Kafka Options Configured using Shared Access Policy")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 2: Read Stream from Event Hub and Transform

# COMMAND ----------

from pyspark.sql.functions import current_timestamp, col

# Open the Kafka source, starting from the latest available events.
kafka_reader = spark.readStream.format("kafka").options(**kafka_options)
kafka_raw_stream = kafka_reader.option("startingOffsets", "latest").load()

# Shape the stream for Bronze storage:
#   1. rename the payload column 'value' to 'body'
#   2. cast the binary 'body' to string (the column keeps its position)
#   3. append the ingestion timestamp as pipeline metadata
payload_renamed = kafka_raw_stream.withColumnRenamed("value", "body")
payload_as_text = payload_renamed.withColumn("body", col("body").cast("string"))
df_bronze_stream = payload_as_text.withColumn("ingestion_time", current_timestamp())

print("Bronze Stream Schema:")
df_bronze_stream.printSchema()

# COMMAND ----------

# MAGIC %md
# MAGIC ## Step 3: Write Stream to Bronze Delta Table

# COMMAND ----------

# (Disabled) schema bootstrap, kept for reference:
# spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog}.{bronze_schema}")

# Columns persisted to the Bronze table.
bronze_columns = [
    "body",            # raw data payload
    "topic",
    "partition",
    "offset",
    "timestamp",       # timestamp column supplied by the Kafka source
    "ingestion_time",  # processing time added by this pipeline
]
df_write = df_bronze_stream.select(*bronze_columns)

# Configure the Delta sink step by step, then start the query.
stream_writer = (
    df_write.writeStream
    .format("delta")
    .outputMode("append")
    .trigger(processingTime="10 seconds")  # micro-batch interval
)
for option_name, option_value in (
    ("checkpointLocation", checkpoint_path),
    ("mergeSchema", "true"),
):
    stream_writer = stream_writer.option(option_name, option_value)

query = stream_writer.toTable(full_bronze_table_name)

print(f"Starting Bronze stream... (Query ID: {query.id})")
# NOTE: the stream keeps running until it is manually stopped.

In [0]:
from pyspark.sql.functions import col

# Ad-hoc read of the Event Hub through the Kafka connector, using the
# "bijuaccesspolicy" shared access policy.
namespace = "evhns-natraining"
eventhub_name = "evh-natraining-bijunew"
shared_access_key_name = "bijuaccesspolicy"

# SECURITY: the policy key used to be hard-coded in this cell. It is now read
# from the secret scope instead. The key that was committed here should be
# treated as compromised and regenerated.
# TODO(review): the secret name below is a placeholder — store the key for
# "bijuaccesspolicy" in the scope under this name, or adjust the name to match.
shared_access_key = dbutils.secrets.get(
    scope="dbx-ss-kv-natraining-2",
    key="evh-natraining-bijuaccesspolicy",
)

# SASL password: the Event Hubs Kafka endpoint expects the full connection
# string; the previous "<KeyName>:<Key>" form is not a supported credential.
sasl_password = (
    f"Endpoint=sb://{namespace}.servicebus.windows.net/;"
    f"SharedAccessKeyName={shared_access_key_name};"
    f"SharedAccessKey={shared_access_key}"
)

kafka_options = {
    "kafka.bootstrap.servers": f"{namespace}.servicebus.windows.net:9093",
    "subscribe": eventhub_name,
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    # The "kafkashaded." prefix is required on Databricks, whose bundled Kafka
    # client classes are shaded under that package.
    "kafka.sasl.jaas.config": (
        "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule required "
        f'username="$ConnectionString" password="{sasl_password}";'
    ),
    "kafka.request.timeout.ms": "60000",
    "kafka.session.timeout.ms": "60000"
}

df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_options)
    .option("startingOffsets", "latest")
    .load()
)

# Add a readable copy of the binary payload next to the original columns.
df_parsed = df.withColumn("body", col("value").cast("string"))
display(df_parsed)