# Kafka Datagen + Spark Demo

This notebook shows how to:
1. Consume messages from the Kafka topic `orders` using `confluent-kafka`.
2. Read the same topic as a structured stream in PySpark.

In [None]:
from confluent_kafka import Consumer
import json

# Consumer settings for the demo. With 'auto.offset.reset' set to 'earliest',
# a consumer group that has no committed offsets starts from the beginning of
# the topic.
conf = {
    'bootstrap.servers': 'kafka:29092',
    'group.id': 'notebook-demo',
    'auto.offset.reset': 'earliest'
}

c = Consumer(conf)
messages = []
try:
    c.subscribe(['orders'])

    # Poll at most 5 times, waiting up to 5 seconds per poll, so the cell
    # always finishes even when the topic is idle.
    for _ in range(5):
        msg = c.poll(5)
        if msg is None:
            # poll() timed out without receiving anything.
            continue
        if msg.error():
            # Surface broker/partition errors instead of dropping them silently.
            print(f"Kafka error: {msg.error()}")
            continue
        payload = msg.value()
        if not payload:
            # Null or empty payloads carry no JSON document to decode.
            continue
        messages.append(json.loads(payload.decode()))
finally:
    # Always release the consumer (and its group membership), even when
    # decoding fails or the kernel is interrupted mid-poll.
    c.close()

messages

In [None]:
from pyspark.sql import SparkSession

# How long (in seconds) to let the demo stream run before stopping it.
# An unbounded awaitTermination() would block the kernel forever and prevent
# "Restart & Run All" from ever completing.
STREAM_RUN_SECONDS = 60

# NOTE(review): the Kafka connector coordinates (Scala 2.12, Spark 3.5.1) are
# assumed to match the installed Spark build — confirm against your cluster.
spark = (
    SparkSession.builder
    .appName("KafkaStructuredStreaming")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")

# Subscribe to the `orders` topic; "latest" means only records produced after
# the query starts are read.
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "kafka:29092")
      .option("subscribe", "orders")
      .option("startingOffsets", "latest")
      .load())

# Kafka data comes in key and value as bytes, convert to string
df_str = df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")

# Write stream to console (for testing).
# NOTE(review): the console sink prints on the driver's stdout, which may show
# up in the Jupyter server logs rather than under this cell — verify locally.
query = (df_str.writeStream
         .format("console")
         .outputMode("append")
         .option("truncate", False)
         .start())

try:
    # Bounded wait: returns once the query terminates or the timeout elapses.
    query.awaitTermination(STREAM_RUN_SECONDS)
finally:
    # Stop the streaming query so it does not keep running in the background
    # after the cell finishes or is interrupted.
    query.stop()

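You should now see micro-batches of `orders` records from the Datagen stream (key and value cast to strings) printed by the console sink. Note that this notebook does not itself draw a bar chart of per-product counts; producing one would require an additional aggregation and plotting step.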