In [0]:
# ============================================================
# NOTEBOOK 4: STREAMING PIPELINE
# Cell 0: Authentication
# ============================================================

# Service-principal credentials are read from the Databricks secret scope so
# that no secret value is written into the notebook.
_secret_scope = "shopsmart-scope"
client_id = dbutils.secrets.get(scope=_secret_scope, key="datalake-sp-client-id")
client_secret = dbutils.secrets.get(scope=_secret_scope, key="datalake-sp-client-secret")
tenant_id = dbutils.secrets.get(scope=_secret_scope, key="datalake-sp-tenant-id")

storage_account_name = "dlsshopsmartdev123"
_adls_host = storage_account_name + ".dfs.core.windows.net"

# Account-scoped ABFS OAuth settings; each key is suffixed with the account host.
_oauth_settings = [
    ("fs.azure.account.auth.type.", "OAuth"),
    ("fs.azure.account.oauth.provider.type.", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id.", client_id),
    ("fs.azure.account.oauth2.client.secret.", client_secret),
    ("fs.azure.account.oauth2.client.endpoint.", "https://login.microsoftonline.com/" + tenant_id + "/oauth2/token"),
]
for _conf_prefix, _conf_value in _oauth_settings:
    spark.conf.set(_conf_prefix + _adls_host, _conf_value)

# Root URIs of the three lake containers, used by the cells below.
BRONZE, SILVER, GOLD = (
    "abfss://" + _container + "@" + _adls_host
    for _container in ("bronze", "silver", "gold")
)

print("Auth configured. Ready for streaming pipeline.")

Auth configured. Ready for streaming pipeline.


In [0]:
# ============================================================
# Cell 1: READ FROM EVENT HUB + WRITE TO BRONZE
# ============================================================
#
# HOW THIS WORKS:
# 1. We install azure-eventhub on the cluster
# 2. Read all available events from Event Hub using Python SDK
# 3. Convert to Spark DataFrame
# 4. Write to Bronze as Delta table
# 5. Then process through Silver (same pattern as batch)
#
# WHY PYTHON SDK INSTEAD OF SPARK STREAMING CONNECTOR?
# The Spark Event Hub connector requires:
#   - Maven packages (com.microsoft.azure:azure-eventhubs-spark)
#   - Single-user cluster (not shared)
# The Python SDK works everywhere and is simpler for a demo.
#
# IN PRODUCTION:
# You would use Spark Structured Streaming with the
# Event Hub connector for true real-time processing.
# This demo proves the concept works end-to-end.
# ============================================================

# Step 1: Install azure-eventhub on cluster
%pip install azure-eventhub

# ============================================================

from azure.eventhub import EventHubConsumerClient
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json

# Step 2: Event Hub settings; the connection string is read from the secret scope
EVENTHUB_NAME = "eh-clickstream"
CONSUMER_GROUP = "$Default"
eh_conn_str = dbutils.secrets.get(
    scope="shopsmart-scope",
    key="eventhub-connection-string",
)

# Run banner
_banner_rule = "=" * 65
for _banner_line in (
    _banner_rule,
    "STREAMING PIPELINE - EVENT HUB CONSUMER",
    _banner_rule,
    "  Event Hub:      " + EVENTHUB_NAME,
    "  Consumer Group: " + CONSUMER_GROUP,
    "  Reading events...",
):
    print(_banner_line)

# Step 3: Read events from Event Hub
# Raw event bodies are collected here, one string per callback invocation.
events_list = []


def on_event(partition_context, event):
    """Store the event body as text, then request a checkpoint for the event."""
    events_list.append(event.body_as_str())
    partition_context.update_checkpoint(event)

# Create consumer and receive events
consumer = EventHubConsumerClient.from_connection_string(
    conn_str=eh_conn_str,
    consumer_group=CONSUMER_GROUP,
    eventhub_name=EVENTHUB_NAME
)

# Read for a fixed window, then stop
import threading
import time

# Length of the receive window, in seconds.
RECEIVE_SECONDS = 15

def receive_events():
    """Block in consumer.receive(), handing every event to on_event.

    receive() does not return on its own; it is unblocked by consumer.close()
    being called from the main thread once the receive window has elapsed.
    """
    # NOTE(review): "-1" starts from the beginning of each partition and no
    # checkpoint store is configured, so every run re-reads events that an
    # earlier run already ingested.
    consumer.receive(
        on_event=on_event,
        starting_position="-1",  # Read from beginning
    )

# Run receiver in background thread so the main thread can enforce the window
receiver_thread = threading.Thread(target=receive_events, daemon=True)
receiver_thread.start()

# Wait for events to arrive, then shut the consumer down explicitly.
# A daemon thread only dies when the interpreter exits, which in a notebook
# kernel is not at the end of the cell: without close() the receiver would
# keep its connection open and keep appending to events_list while the code
# below is already reading it.
print("  Waiting " + str(RECEIVE_SECONDS) + " seconds for events...")
try:
    time.sleep(RECEIVE_SECONDS)
finally:
    consumer.close()
    receiver_thread.join(timeout=30)
    if receiver_thread.is_alive():
        print("  WARNING: receiver thread did not stop within 30 seconds")

print("  Events received: " + str(len(events_list)))

def _parse_event(event_str):
    """Decode one raw event body into a flat dict ready for the Bronze schema.

    A "geo_location" entry, when present, is replaced by "geo_city" and
    "geo_country". A missing, null or non-object geo_location yields None for
    both instead of discarding the whole event.

    Raises:
        ValueError: the body is not valid JSON, or is not a JSON object.
    """
    event_dict = json.loads(event_str)
    if not isinstance(event_dict, dict):
        raise ValueError("event payload is not a JSON object")
    if "geo_location" in event_dict:
        geo = event_dict.pop("geo_location")
        if not isinstance(geo, dict):
            geo = {}
        event_dict["geo_city"] = geo.get("city")
        event_dict["geo_country"] = geo.get("country")
    return event_dict


if len(events_list) == 0:
    print("\n  WARNING: No events received!")
    print("  Make sure you ran stream_producer.py first")
    print("  and events were sent to: " + EVENTHUB_NAME)
else:
    # Step 4: Parse JSON events into list of dicts
    parsed_events = []
    parse_errors = 0
    for event_str in events_list:
        try:
            parsed_events.append(_parse_event(event_str))
        except (ValueError, TypeError):
            # Only malformed payloads are skipped (and counted); any other
            # failure is a real bug and is allowed to surface.
            parse_errors += 1

    print("  Successfully parsed: " + str(len(parsed_events)))
    if parse_errors > 0:
        print("  Parse errors: " + str(parse_errors))

    # Step 5: Define schema for the streaming events
    # Everything is nullable; event_timestamp stays a string in Bronze.
    stream_schema = StructType([
        StructField("event_id", StringType(), True),
        StructField("session_id", StringType(), True),
        StructField("customer_id", StringType(), True),
        StructField("event_type", StringType(), True),
        StructField("event_timestamp", StringType(), True),
        StructField("page_url", StringType(), True),
        StructField("product_id", StringType(), True),
        StructField("device_type", StringType(), True),
        StructField("browser", StringType(), True),
        StructField("os", StringType(), True),
        StructField("ip_address", StringType(), True),
        StructField("geo_city", StringType(), True),
        StructField("geo_country", StringType(), True),
        StructField("referrer", StringType(), True),
        StructField("search_query", StringType(), True),
        StructField("page_load_time_ms", IntegerType(), True),
        StructField("time_on_page_sec", IntegerType(), True),
        StructField("scroll_depth_pct", IntegerType(), True),
    ])

    # Step 6: Create Spark DataFrame
    df_stream_events = spark.createDataFrame(parsed_events, schema=stream_schema)

    # Add streaming metadata
    df_stream_bronze = df_stream_events \
        .withColumn("_ingestion_source", lit("event_hub")) \
        .withColumn("_ingestion_timestamp", current_timestamp()) \
        .withColumn("_event_hub_name", lit(EVENTHUB_NAME)) \
        .withColumn("_batch_id", lit("stream_" + time.strftime("%Y%m%d_%H%M%S")))

    stream_count = df_stream_bronze.count()
    print("\n  Spark DataFrame created: " + str(stream_count) + " rows")

    # Step 7: Write to Bronze as Delta (APPEND to existing clickstream)
    bronze_stream_path = BRONZE + "/streaming_clickstream"

    df_stream_bronze.write \
        .format("delta") \
        .mode("append") \
        .option("mergeSchema", True) \
        .save(bronze_stream_path)

    print("  Written to Bronze: " + bronze_stream_path)

    # Step 8: Show sample
    print("\n  Sample streaming events:")
    df_stream_bronze.select(
        "event_id", "customer_id", "event_type",
        "product_id", "device_type", "event_timestamp"
    ).show(10, truncate=False)

    # Event type distribution
    print("  Event type distribution (streaming):")
    df_stream_bronze.groupBy("event_type").count().orderBy(desc("count")).show()

    print("\n[DONE] Streaming events landed in Bronze!")
    print("[NEXT] Process through Silver layer")

Collecting azure-eventhub
  Downloading azure_eventhub-5.15.1-py3-none-any.whl.metadata (30 kB)
Downloading azure_eventhub-5.15.1-py3-none-any.whl (317 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/317.1 kB[0m [31m?[0m eta [36m-:--:--[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.1/317.1 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: azure-eventhub
Successfully installed azure-eventhub-5.15.1
[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
STREAMING PIPELINE - EVENT HUB CONSUMER
  Event Hub:      eh-clickstream
  Consumer Group: $Default
  Reading events...
  Waiting 15 seconds for events...
  Events received: 50
  Successfully parsed: 50

  Spark DataFrame created: 50 rows
  Written to Bronze: abfss://bronze@dlsshopsmartdev123.dfs.core.windows.net/streaming_clickstream

  Sample streaming events:
+---------------

In [0]:
# ============================================================
# Cell 2: PROCESS STREAMING DATA - BRONZE TO SILVER
# ============================================================
#
# This applies the SAME transformations as our batch
# clickstream pipeline (Cell 9 in notebook 01).
# Proving that batch and streaming data follow the
# same processing logic.
# ============================================================

from pyspark.sql.functions import *

# Read streaming events from Bronze
bronze_stream_path = BRONZE + "/streaming_clickstream"

try:
    df_stream = spark.read.format("delta").load(bronze_stream_path)
    stream_count = df_stream.count()
    print("Streaming events in Bronze: " + str(stream_count) + " rows")
except Exception as read_error:
    # Catch Exception rather than everything so a kernel interrupt still
    # stops the cell, and print the underlying error: a missing table is only
    # one of the ways this read can fail.
    print("ERROR: No streaming events found. Run Cell 1 first.")
    print("  Details: " + str(read_error))
    stream_count = 0

if stream_count > 0:
    # Apply same Silver transformations as batch pipeline:
    # normalise string columns, cast the timestamp, then derive calendar,
    # anonymity, purchase-intent and funnel attributes.
    df_stream_silver = df_stream \
        .withColumn("event_id", trim(col("event_id"))) \
        .withColumn("session_id", trim(col("session_id"))) \
        .withColumn("customer_id", trim(col("customer_id"))) \
        .withColumn("event_type", lower(trim(col("event_type")))) \
        .withColumn("event_timestamp", to_timestamp(col("event_timestamp"))) \
        .withColumn("product_id", trim(col("product_id"))) \
        .withColumn("device_type", lower(trim(col("device_type")))) \
        .withColumn("browser", initcap(trim(col("browser")))) \
        .withColumn("os", initcap(trim(col("os")))) \
        .withColumn("referrer", lower(trim(col("referrer")))) \
        .withColumn("search_query", lower(trim(col("search_query")))) \
        .withColumn("event_date", to_date(col("event_timestamp"))) \
        .withColumn("event_hour", hour(col("event_timestamp"))) \
        .withColumn("day_name", date_format(col("event_timestamp"), "EEEE")) \
        .withColumn("is_weekend",
            when(dayofweek(col("event_timestamp")).isin(1, 7), lit(True))
            .otherwise(lit(False))) \
        .withColumn("is_anonymous",
            when(col("customer_id").isNull(), lit(True)).otherwise(lit(False))) \
        .withColumn("is_purchase_intent",
            when(col("event_type").isin("add_to_cart", "checkout"), lit(True))
            .otherwise(lit(False))) \
        .withColumn("funnel_stage",
            when(col("event_type") == "page_view", lit("1-Awareness"))
            .when(col("event_type") == "product_view", lit("2-Interest"))
            .when(col("event_type").isin("add_to_cart", "search", "remove_from_cart"), lit("3-Consideration"))
            .when(col("event_type") == "checkout", lit("4-Purchase"))
            .otherwise(lit("Other"))) \
        .withColumn("data_source", lit("streaming")) \
        .withColumn("_silver_processed_at", current_timestamp()) \
        .withColumn("_silver_version", lit("1.0"))

    # Write to Silver streaming table
    silver_stream_path = SILVER + "/streaming_clickstream"

    # df_stream_silver is derived from the ENTIRE Bronze table, so the Silver
    # table is replaced rather than appended to. With append, every re-run
    # wrote all previously processed rows again and the table grew with
    # duplicates.
    df_stream_silver.write \
        .format("delta") \
        .mode("overwrite") \
        .option("mergeSchema", True) \
        .save(silver_stream_path)

    silver_count = spark.read.format("delta").load(silver_stream_path).count()

    print("")
    print("=" * 65)
    print("STREAMING SILVER - COMPLETE")
    print("=" * 65)
    print("  Bronze events:   " + str(stream_count))
    print("  Silver events:   " + str(silver_count))
    print("  Path:            " + silver_stream_path)

    # Compare batch vs streaming
    batch_count = spark.read.format("delta").load(SILVER + "/clickstream").count()
    print("\n  DATA COMPARISON:")
    print("    Batch clickstream (Silver):     " + str(batch_count) + " events")
    print("    Streaming clickstream (Silver): " + str(silver_count) + " events")
    print("    Total clickstream events:       " + str(batch_count + silver_count))

    # Show sample
    print("\n  Streaming Silver sample:")
    df_stream_silver.select(
        "event_id", "customer_id", "event_type",
        "product_id", "funnel_stage", "is_anonymous", "data_source"
    ).show(10, truncate=False)

    # Funnel analysis on streaming data
    print("\n  Streaming funnel analysis:")
    df_stream_silver.groupBy("funnel_stage").agg(
        count("*").alias("events"),
        countDistinct("session_id").alias("sessions")
    ).orderBy("funnel_stage").show()

    # Device distribution
    print("\n  Streaming device distribution:")
    df_stream_silver.groupBy("device_type").count().orderBy(desc("count")).show()

    print("\n" + "=" * 65)
    print("STREAMING PIPELINE - COMPLETE!")
    print("=" * 65)
    print("""
  WHAT WE PROVED:
    1. Python producer sends events to Azure Event Hub
    2. Databricks reads events from Event Hub
    3. Events land in Bronze (raw, with metadata)
    4. Same Silver transformations apply to streaming data
    5. Batch + streaming data coexist in the same lake

  IN PRODUCTION (ARCHITECTURE DIAGRAM):
    - stream_producer.py -> Azure Event Hub (DONE)
    - Event Hub -> Spark Structured Streaming (simulated)
    - Streaming -> Bronze -> Silver (DONE)
    - Same Gold layer serves both batch + streaming data

  THIS COMPLETES THE STREAMING PATH IN YOUR ARCHITECTURE!
    """)

Streaming events in Bronze: 50 rows

STREAMING SILVER - COMPLETE
  Bronze events:   50
  Silver events:   50
  Path:            abfss://silver@dlsshopsmartdev123.dfs.core.windows.net/streaming_clickstream

  DATA COMPARISON:
    Batch clickstream (Silver):     3000 events
    Streaming clickstream (Silver): 50 events
    Total clickstream events:       3050

  Streaming Silver sample:
+----------------+-----------+------------+----------+---------------+------------+-----------+
|event_id        |customer_id|event_type  |product_id|funnel_stage   |is_anonymous|data_source|
+----------------+-----------+------------+----------+---------------+------------+-----------+
|EVT-C3A559B20CE2|CUST396    |page_view   |PROD026   |1-Awareness    |false       |streaming  |
|EVT-832FC70E0DDB|CUST077    |product_view|PROD007   |2-Interest     |false       |streaming  |
|EVT-80AF42D55ED3|CUST480    |page_view   |PROD004   |1-Awareness    |false       |streaming  |
|EVT-C885AD27871E|CUST353    |page_v

In [0]:
# Install library
%pip install azure-eventhub

# ============================================================
from azure.eventhub import EventHubConsumerClient
from pyspark.sql.functions import *
from pyspark.sql.types import *
import json
import threading
import time

# Event Hub settings; the connection string is read from the secret scope
EVENTHUB_NAME = "eh-clickstream"
CONSUMER_GROUP = "$Default"
eh_conn_str = dbutils.secrets.get(
    scope="shopsmart-scope",
    key="eventhub-connection-string",
)

print("Reading events from Event Hub...")

# Read events
# Raw event bodies are collected here, one string per callback invocation.
events_list = []


def on_event(partition_context, event):
    """Collect the event's text body, then request a checkpoint for it."""
    payload = event.body_as_str()
    events_list.append(payload)
    partition_context.update_checkpoint(event)

consumer = EventHubConsumerClient.from_connection_string(
    conn_str=eh_conn_str,
    consumer_group=CONSUMER_GROUP,
    eventhub_name=EVENTHUB_NAME
)

# Length of the receive window, in seconds.
RECEIVE_SECONDS = 15

def receive_events():
    """Block in consumer.receive(), handing every event to on_event.

    receive() does not return on its own; it is unblocked by consumer.close()
    being called from the main thread once the receive window has elapsed.
    """
    # "-1" starts from the beginning of each partition.
    consumer.receive(on_event=on_event, starting_position="-1")

# Run in background for the receive window, then shut the consumer down.
# A daemon thread is only killed when the interpreter exits, which in a
# notebook kernel is not at the end of the cell: without close() the receiver
# would keep its connection open and keep appending to events_list while the
# code below is already reading it.
t = threading.Thread(target=receive_events, daemon=True)
t.start()
try:
    time.sleep(RECEIVE_SECONDS)
finally:
    consumer.close()
    t.join(timeout=30)
    if t.is_alive():
        print("WARNING: receiver thread did not stop within 30 seconds")

print("Events received: " + str(len(events_list)))

if len(events_list) > 0:
    # Parse and Create DataFrame
    parsed_events = []
    parse_errors = 0
    for e in events_list:
        try:
            d = json.loads(e)
        except (ValueError, TypeError):
            # Malformed payloads are skipped, but counted and reported below
            # instead of disappearing silently.
            parse_errors += 1
            continue
        if not isinstance(d, dict):
            # A JSON array/scalar cannot be mapped onto the row schema.
            parse_errors += 1
            continue
        if "geo_location" in d:
            # Flatten geo_location; a null or non-object value yields None
            # for both columns rather than dropping the whole event.
            geo = d.pop("geo_location")
            if not isinstance(geo, dict):
                geo = {}
            d["geo_city"] = geo.get("city")
            d["geo_country"] = geo.get("country")
        parsed_events.append(d)

    if parse_errors > 0:
        print("Parse errors: " + str(parse_errors))

    # Schema
    # Includes the three integer metrics that the fuller ingest cell in this
    # notebook also writes to this Bronze table; leaving them out would store
    # them as null for every row ingested here.
    schema = StructType([
        StructField("event_id", StringType(), True),
        StructField("session_id", StringType(), True),
        StructField("customer_id", StringType(), True),
        StructField("event_type", StringType(), True),
        StructField("event_timestamp", StringType(), True),
        StructField("page_url", StringType(), True),
        StructField("product_id", StringType(), True),
        StructField("device_type", StringType(), True),
        StructField("browser", StringType(), True),
        StructField("os", StringType(), True),
        StructField("ip_address", StringType(), True),
        StructField("geo_city", StringType(), True),
        StructField("geo_country", StringType(), True),
        StructField("referrer", StringType(), True),
        StructField("search_query", StringType(), True),
        StructField("page_load_time_ms", IntegerType(), True),
        StructField("time_on_page_sec", IntegerType(), True),
        StructField("scroll_depth_pct", IntegerType(), True)
    ])

    df_stream = spark.createDataFrame(parsed_events, schema=schema) \
        .withColumn("_ingestion_source", lit("event_hub")) \
        .withColumn("_ingestion_timestamp", current_timestamp())

    # Write to Bronze
    df_stream.write.format("delta").mode("append").option("mergeSchema", True).save(BRONZE + "/streaming_clickstream")
    print("Written " + str(df_stream.count()) + " events to Bronze streaming table.")

[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m
Reading events from Event Hub...
Events received: 150
Written 150 events to Bronze streaming table.


In [0]:
# ============================================================
# Cell 2: PROCESS STREAMING DATA - BRONZE TO SILVER
# ============================================================
from pyspark.sql.functions import *

# Read streaming events from Bronze
bronze_stream_path = BRONZE + "/streaming_clickstream"

# Apply Silver Transformations
df_stream = spark.read.format("delta").load(bronze_stream_path)

# "remove_from_cart" is mapped to 3-Consideration, matching the fuller Silver
# cell in this notebook that writes to the same table; without it those
# events were labelled "Other" here.
df_stream_silver = df_stream \
    .withColumn("event_id", trim(col("event_id"))) \
    .withColumn("event_type", lower(trim(col("event_type")))) \
    .withColumn("is_anonymous", when(col("customer_id").isNull(), lit(True)).otherwise(lit(False))) \
    .withColumn("event_timestamp", to_timestamp(col("event_timestamp"))) \
    .withColumn("funnel_stage",
        when(col("event_type") == "page_view", lit("1-Awareness"))
        .when(col("event_type") == "product_view", lit("2-Interest"))
        .when(col("event_type").isin("add_to_cart", "search", "remove_from_cart"), lit("3-Consideration"))
        .when(col("event_type") == "checkout", lit("4-Purchase"))
        .otherwise(lit("Other"))) \
    .withColumn("_silver_processed_at", current_timestamp())

# Write to Silver
# The frame is built from the ENTIRE Bronze table, so Silver is replaced
# rather than appended to; with append, every re-run wrote all previously
# processed rows again.
silver_stream_path = SILVER + "/streaming_clickstream"
df_stream_silver.write.format("delta").mode("overwrite").option("mergeSchema", True).save(silver_stream_path)

print("STREAMING SILVER COMPLETE")
print("  Path: " + silver_stream_path)
print("  Rows: " + str(df_stream_silver.count()))

# Show Funnel Analysis
df_stream_silver.groupBy("funnel_stage").count().orderBy("funnel_stage").show()

STREAMING SILVER COMPLETE
  Path: abfss://silver@dlsshopsmartdev123.dfs.core.windows.net/streaming_clickstream
  Rows: 200
+---------------+-----+
|   funnel_stage|count|
+---------------+-----+
|    1-Awareness|   61|
|     2-Interest|   53|
|3-Consideration|   57|
|     4-Purchase|   16|
|          Other|   13|
+---------------+-----+

