#### Print Environment Variables

In [1]:
import os
print(os.environ.get("SPARK_HOME"))
print(os.environ.get("KAFKA_HOME"))
print(os.environ.get("HADOOP_HOME"))
print(os.environ.get("JAVA_HOME"))

C:\spark\spark-3.4.4-bin-hadoop3-scala2.13
C:\kafka\kafka_2.13-3.9.0
C:\hadoop-3.3.6
C:\Program Files\Java\jdk-1.8


#### Spark Session with Kafka Support

In [5]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("KafkaRetailConsumer")
    .master("local[*]")
    .getOrCreate()
)


print("Spark Version:", spark.version)
spark.stop()


Spark Version: 3.4.4


In [6]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("KafkaRetailConsumer")
    .master("local[*]")
    # .config("spark.sql.shuffle.partitions", "2")
    # .config("spark.sql.streaming.schemaInference", "true")
    .config("spark.jars.packages",
            "org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.4,"
            "org.apache.spark:spark-token-provider-kafka-0-10_2.13:3.4.4,"
            "org.apache.kafka:kafka-clients:3.5.1,"
            "org.apache.commons:commons-pool2:2.11.1,"
            "io.delta:delta-spark_2.13:2.4.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)


#### Read from Kafka Topics

In [11]:
df_raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "retail_orders_us,retail_orders_in,retail_orders_others")
    .option("startingOffsets", "earliest")  # "latest" if you only want new ones
    .load()
)

#### Parse The JSON Messages

In [None]:
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, TimestampType

schema = StructType() \
    .add("order_id", StringType()) \
    .add("product_id", StringType()) \
    .add("quantity", StringType()) \
    .add("price", StringType()) \
    .add("country", StringType()) \
    .add("channel", StringType()) \
    .add("timestamp", StringType())

df_parsed = (
    df_raw
    .selectExpr("CAST(value AS STRING)", "topic", "partition", "offset")
    .withColumn("jsonData", from_json(col("value"), schema))
    .select(
        "topic", "partition", "offset",
        col("jsonData.*"))
    .withColumn("quantity", col("quantity").cast("int"))
    .withColumn("price", col("price").cast("float"))

)


TypeError: 'Column' object is not callable

#### Write Stream to Delta - Partitioned by Country

In [None]:
from pyspark.sql.functions import col

def debug_batch(df, epoch_id):
    if df.count() == 0:
        print(f"---- Batch {epoch_id} is empty ----")
        return

    print(f"---- Processing Batch {epoch_id} ----")
    df.show(truncate=False)

    (
        df.write
        .format("delta")
        .mode("append")
        .partitionBy("country")
        .save("delta/orders_by_country")
    )

    print(f"---- Batch {epoch_id} written to Delta ----")

# Start the streaming query
query = (
    df_parsed.writeStream
    .outputMode("append")
    .foreachBatch(debug_batch)
    .option("checkpointLocation", "delta/checkpoints/orders_by_country_batch")
    .start()
)

query.awaitTermination()


+----------------+---------+------+--------+----------+--------+-----+-------+-------+--------------------+
|topic           |partition|offset|order_id|product_id|quantity|price|country|channel|timestamp           |
+----------------+---------+------+--------+----------+--------+-----+-------+-------+--------------------+
|retail_orders_us|0        |0     |ORD001  |P106      |NULL    |NULL |US     |mobile |2025-04-07T10:00:00Z|
|retail_orders_us|0        |1     |ORD004  |P108      |NULL    |NULL |US     |online |2025-04-07T10:00:45Z|
|retail_orders_us|0        |2     |ORD007  |P125      |NULL    |NULL |US     |mobile |2025-04-07T10:01:30Z|
|retail_orders_us|0        |3     |ORD009  |P141      |NULL    |NULL |US     |store  |2025-04-07T10:02:00Z|
|retail_orders_us|0        |4     |ORD013  |P111      |NULL    |NULL |US     |online |2025-04-07T10:03:00Z|
|retail_orders_us|0        |5     |ORD014  |P149      |NULL    |NULL |US     |mobile |2025-04-07T10:03:15Z|
|retail_orders_us|0        |

In [None]:
# Code for testing the parsed dataframe
df_parsed.writeStream.format("console").start().awaitTermination(10)
query.stop()
df_parsed.printSchema()

False

#### Query the Delta Tables

In [7]:
df_all = spark.read.format("delta").load("delta/orders_by_country")
df_all.createOrReplaceTempView("orders")

spark.sql("SELECT country, COUNT(*) FROM orders GROUP BY country").show()


+-------+--------+
|country|count(1)|
+-------+--------+
|     GE|      22|
|     UK|      20|
|     US|      18|
|     IN|      16|
|     FR|      13|
|     CA|      11|
+-------+--------+



In [9]:
spark.sql("SELECT * FROM orders").show()

+--------------------+---------+------+--------+----------+--------+-----+-------+-------+--------------------+
|               topic|partition|offset|order_id|product_id|quantity|price|country|channel|           timestamp|
+--------------------+---------+------+--------+----------+--------+-----+-------+-------+--------------------+
|retail_orders_others|        1|     1|  ORD006|      P130|    null| null|     GE| online|2025-04-07T10:01:15Z|
|retail_orders_others|        1|     3|  ORD010|      P109|    null| null|     GE| online|2025-04-07T10:02:15Z|
|retail_orders_others|        1|     6|  ORD017|      P120|    null| null|     GE| online|2025-04-07T10:04:00Z|
|retail_orders_others|        1|     7|  ORD018|      P137|    null| null|     GE| mobile|2025-04-07T10:04:15Z|
|retail_orders_others|        1|     8|  ORD019|      P120|    null| null|     GE|  store|2025-04-07T10:04:30Z|
|retail_orders_others|        1|     9|  ORD023|      P108|    null| null|     GE| online|2025-04-07T10:

In [12]:
df_raw.selectExpr("CAST(value AS STRING)").show(truncate=False)

AnalysisException: Queries with streaming sources must be executed with writeStream.start();
kafka