In [0]:
%sql
-- Ad-hoc preview of every column and row in the raw_events table.
SELECT *
FROM `na-dbxtraining`.biju_gold.raw_events

In [0]:
# Event Hub connection settings.
# Host name of the Event Hubs namespace; interpolated into the `Endpoint=sb://.../`
# part of the connection string.
eh_namespace = "evhns-natraining.servicebus.windows.net"
# Name of the event hub itself (presumably used as the Kafka topic — confirm
# against KAFKA_OPTIONS, which is defined outside this cell).
eh_name = "evh-natraining-bijunew"
# Secret scope and secret key name; presumably used to look up the shared access
# key that ends up in `secret_value` — TODO confirm where that lookup happens.
keyvault_scope = "dbx-ss-kv-natraining-2"
secret_name = "evh-natraining-read-write"
# Name of the shared access policy placed in `SharedAccessKeyName=` of the connection string.
shared_access_key_name = "SharedAccessKeyToSendAndListen"

In [0]:
# Assemble the connection string for the Kafka-based Event Hub connection from
# its three ';'-separated fields: endpoint, access policy name and access key.
# NOTE(review): `secret_value` is not defined in this cell; presumably it is read
# from the secret scope in another cell — confirm it exists before this runs.
connection_string = ";".join(
    [
        f"Endpoint=sb://{eh_namespace}/",
        f"SharedAccessKeyName={shared_access_key_name}",
        f"SharedAccessKey={secret_value}",
    ]
)

# Only report success; the string itself contains the access key and is never printed.
print("✓ Connection string built successfully")

In [0]:
# Build and start the streaming job that maintains the product performance table:
#   Kafka source -> JSON parse -> flatten -> 1-day window aggregation -> Delta sink.
# On failure a short message is printed and the exception is re-raised so that the
# cell (and any job running this notebook) is marked as failed instead of silently
# continuing to later cells with no stream running.
try:
    # Read the stream through the Kafka source and decode each record: the binary
    # `value` column is cast to a string and parsed as JSON using `payload_schema`.
    streaming_df = (
        spark.readStream
        .format("kafka")
        .options(**KAFKA_OPTIONS)
        .load()
        .withColumn("value_string", col("value").cast("string"))
        .withColumn("parsed_data", from_json(col("value_string"), payload_schema))
    )

    # Flatten the parsed struct into top-level columns; the order timestamp is
    # converted to a timestamp type so it can drive the time window below.
    flattened_df = streaming_df.select(
        col("parsed_data.product_id").alias("product_id"),
        col("parsed_data.product_name").alias("product_name"),
        col("parsed_data.category").alias("category"),
        col("parsed_data.brand").alias("brand"),
        col("parsed_data.quantity").alias("quantity"),
        col("parsed_data.total_amount").alias("total_amount"),
        to_timestamp(col("parsed_data.order_timestamp")).alias("order_timestamp")
    )

    # Aggregate per product and per 1-day window of `order_timestamp`.
    product_gold_df = (
        flattened_df
        .groupBy(
            window(col("order_timestamp"), "1 day").alias("time_window"),
            col("product_id"),
            col("product_name"),
            col("category"),
            col("brand")
        )
        .agg(
            _count("*").alias("times_ordered"),
            _sum("quantity").alias("units_sold"),
            _sum("total_amount").alias("revenue"),
            avg("total_amount").alias("avg_sale_value")
        )
        .select(
            # The start of the window is exposed as the `day` column.
            col("time_window.start").alias("day"),
            col("product_id"),
            col("product_name"),
            col("category"),
            col("brand"),
            col("times_ordered"),
            col("units_sold"),
            col("revenue"),
            col("avg_sale_value"),
            current_timestamp().alias("processed_at")
        )
    )

    # Write to the gold Delta table (`gold_product_table` is defined outside this
    # cell; presumably it carries the backtick-quoted catalog name — confirm).
    # Output mode is "complete" and the trigger fires every 30 seconds.
    product_query = (
        product_gold_df
        .writeStream
        .format("delta")
        .outputMode("complete")
        .option(
            "checkpointLocation",
            # Hyphens in the catalog name are replaced with underscores for the path.
            f"/tmp/checkpoints/{catalog.replace('-', '_')}_{schema_gold}_product_performance"
        )
        .option("mergeSchema", "true")
        .trigger(processingTime="30 seconds")
        .toTable(gold_product_table)
    )

    print(f"\n✓ Product performance table streaming job started!")
    print(f"  Query ID: {product_query.id}")
    print(f"  Table: {gold_product_table}")

except Exception as e:
    # Keep the friendly one-line summary, then re-raise: swallowing the error here
    # would let the notebook carry on (and a scheduled run report success) even
    # though the stream never started. The re-raised exception carries the full
    # traceback, so no separate traceback printing is needed.
    print(f"\n✗ Error creating product performance table: {str(e)}")
    raise

In [0]:
# Show the current contents of the product performance gold table.
# NOTE(review): the table name is hard-coded here; presumably it is the same table
# as `gold_product_table` written by the streaming query — confirm.
display(spark.sql("select * from `na-dbxtraining`.biju_gold.product_performance"))