# Kafka Configuration for reading data

In [0]:
# Kafka Configuration for reading data

import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, TimestampType, IntegerType
from pyspark.sql.functions import explode, col
# Read the raw events from Kafka as a streaming DataFrame.
#
# Every connection detail -- including the SASL username/password -- is pulled
# from the Databricks secret scope so that nothing sensitive lives in notebook
# source or in version control.
# NOTE(review): the API key/secret that used to be hard-coded on this line were
# exposed in the notebook and should be rotated. The secret key names
# "MYKAFKAAPIKEY" / "MYKAFKAAPISECRET" are placeholders -- confirm they match
# the entries actually stored in `key_vault_scope`.
kafka_jaas_config = (
    "org.apache.kafka.common.security.plain.PlainLoginModule required "
    "username='{username}' password='{password}';"
).format(
    username=dbutils.secrets.get(scope="key_vault_scope", key="MYKAFKAAPIKEY"),
    password=dbutils.secrets.get(scope="key_vault_scope", key="MYKAFKAAPISECRET"),
)

kafka_df = (spark.readStream
            .format("kafka")
            .option("kafka.bootstrap.servers", dbutils.secrets.get(scope="key_vault_scope", key="MYKAFKASERVER"))
            .option("subscribe", dbutils.secrets.get(scope="key_vault_scope", key="MYKAFKATOPIC"))
            # Rate limit: at most 100 Kafka offsets (records) per micro-batch.
            # `maxFilesPerTrigger` is a file-source option and does not rate-limit
            # the Kafka source; `maxOffsetsPerTrigger` is the Kafka equivalent.
            .option("maxOffsetsPerTrigger", 100)
            .option("startingOffsets", "earliest")  # or "latest"
            .option("kafka.security.protocol", "SASL_SSL")
            .option("kafka.sasl.mechanism", "PLAIN")
            .option("kafka.sasl.jaas.config", kafka_jaas_config)
            .load())

# Expose the Kafka `value` column as text in a single `json_message` column.
json_df = kafka_df.select(col("value").cast("string").alias("json_message"))

# Render the streaming DataFrame inline in the notebook.
display(json_df)

# Writing the stream data to a Delta table in the bronze layer

In [0]:
# Append the stream to the bronze Delta table, using a checkpoint location
# for the streaming query's progress state.
(
    json_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/Volumes/kafka/bronze/checkpoint")
    .table("kafka.bronze.stock_data")
)

<pyspark.sql.streaming.query.StreamingQuery at 0x7f839482a290>

# Displaying bronze table data (just to show the results; once this runs as a scheduled job, we don't need to display it in the notebook)

In [0]:
%sql
-- Preview the rows of the bronze table that the streaming write populates.
SELECT * FROM kafka.bronze.stock_data 

json_message
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:59:00"", ""open"": ""414.6050"", ""high"": ""414.6799"", ""low"": ""414.5300"", ""close"": ""414.6500"", ""volume"": ""555""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:58:00"", ""open"": ""414.6000"", ""high"": ""414.6400"", ""low"": ""414.4100"", ""close"": ""414.6050"", ""volume"": ""360""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:57:00"", ""open"": ""414.5300"", ""high"": ""414.6788"", ""low"": ""414.5300"", ""close"": ""414.6000"", ""volume"": ""62""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:56:00"", ""open"": ""414.5100"", ""high"": ""414.6500"", ""low"": ""414.5000"", ""close"": ""414.5200"", ""volume"": ""232""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:55:00"", ""open"": ""414.5300"", ""high"": ""414.6500"", ""low"": ""414.4900"", ""close"": ""414.4900"", ""volume"": ""359""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:54:00"", ""open"": ""414.5300"", ""high"": ""414.5500"", ""low"": ""414.5000"", ""close"": ""414.5300"", ""volume"": ""66""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:53:00"", ""open"": ""414.5300"", ""high"": ""414.5300"", ""low"": ""414.4160"", ""close"": ""414.4160"", ""volume"": ""2""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:52:00"", ""open"": ""414.5300"", ""high"": ""414.6500"", ""low"": ""414.5300"", ""close"": ""414.6500"", ""volume"": ""22""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:51:00"", ""open"": ""414.6300"", ""high"": ""414.6500"", ""low"": ""414.5300"", ""close"": ""414.5300"", ""volume"": ""1187""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-08 19:50:00"", ""open"": ""414.6900"", ""high"": ""414.6900"", ""low"": ""414.5300"", ""close"": ""414.6500"", ""volume"": ""31""}"
