# Kafka Configuration for reading data

In [0]:
# Kafka Configuration for reading data

import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, TimestampType, IntegerType
from pyspark.sql.functions import explode, col
# --- Kafka connection settings ---------------------------------------------
# All connection details, including the SASL credentials, come from the
# environment so that no secret is stored in the notebook source or its history.
# os.environ[...] is used instead of os.getenv(...) so that an unset variable
# fails immediately with a KeyError naming it, rather than passing None to Spark.
# SECURITY: any API key/secret that was previously committed in this cell should
# be treated as compromised and rotated.
KAFKA_BOOTSTRAP_SERVERS = os.environ["MY_KAFKA_SERVER"]
KAFKA_TOPIC = os.environ["MY_KAFKA_TOPIC"]
KAFKA_USERNAME = os.environ["MY_KAFKA_USERNAME"]
KAFKA_PASSWORD = os.environ["MY_KAFKA_PASSWORD"]

# Optional cap on the number of Kafka offsets (records) read per micro-batch.
# The Kafka source does not understand the file-source option
# "maxFilesPerTrigger"; its rate-limit option is "maxOffsetsPerTrigger".
# None means "no cap", which is how the stream behaved before.
MAX_OFFSETS_PER_TRIGGER = None

# JAAS login configuration for SASL/PLAIN authentication.
# NOTE(review): values are wrapped in single quotes, so a username/password that
# itself contains a single quote would break this string.
kafka_jaas_config = (
    "org.apache.kafka.common.security.plain.PlainLoginModule required "
    f"username='{KAFKA_USERNAME}' password='{KAFKA_PASSWORD}';"
)

# Build the streaming reader for the configured topic.
kafka_reader = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
    .option("subscribe", KAFKA_TOPIC)
    .option("startingOffsets", "earliest")  # or "latest"
    .option("kafka.security.protocol", "SASL_SSL")
    .option("kafka.sasl.mechanism", "PLAIN")
    .option("kafka.sasl.jaas.config", kafka_jaas_config)
)
if MAX_OFFSETS_PER_TRIGGER is not None:
    kafka_reader = kafka_reader.option("maxOffsetsPerTrigger", MAX_OFFSETS_PER_TRIGGER)

kafka_df = kafka_reader.load()

# The Kafka `value` column is binary; cast it to a string column named
# `json_message` so the raw JSON payload is readable downstream.
json_df = kafka_df.selectExpr("CAST(value AS STRING) as json_message")

# Display the streaming DataFrame
display(json_df)

json_message
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:59:00"", ""open"": ""415.8800"", ""high"": ""415.9000"", ""low"": ""415.8150"", ""close"": ""415.8150"", ""volume"": ""386""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:58:00"", ""open"": ""415.8600"", ""high"": ""417.2700"", ""low"": ""415.8500"", ""close"": ""415.8800"", ""volume"": ""659""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:57:00"", ""open"": ""415.8700"", ""high"": ""415.9500"", ""low"": ""415.8450"", ""close"": ""415.8999"", ""volume"": ""1562""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:56:00"", ""open"": ""415.8700"", ""high"": ""417.5100"", ""low"": ""415.8450"", ""close"": ""415.8450"", ""volume"": ""103""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:55:00"", ""open"": ""415.8500"", ""high"": ""415.9200"", ""low"": ""415.7500"", ""close"": ""415.8700"", ""volume"": ""93""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:54:00"", ""open"": ""415.8250"", ""high"": ""415.9200"", ""low"": ""415.8250"", ""close"": ""415.9200"", ""volume"": ""42""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:53:00"", ""open"": ""415.8800"", ""high"": ""415.8800"", ""low"": ""415.8250"", ""close"": ""415.8800"", ""volume"": ""43""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:52:00"", ""open"": ""415.8700"", ""high"": ""415.9000"", ""low"": ""415.7600"", ""close"": ""415.8700"", ""volume"": ""118""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:51:00"", ""open"": ""415.8600"", ""high"": ""415.9199"", ""low"": ""415.7300"", ""close"": ""415.8250"", ""volume"": ""73""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:50:00"", ""open"": ""415.8200"", ""high"": ""415.9200"", ""low"": ""415.8200"", ""close"": ""415.9200"", ""volume"": ""46""}"


# Writing stream data to a Delta table in the bronze layer

In [0]:
# Append the raw JSON messages to the bronze Delta table as a streaming write.
# The checkpoint location is where Structured Streaming records the query's
# progress; keep it stable across runs of this notebook/job.
# toTable() is the documented DataStreamWriter method (Spark 3.1+) that starts
# the stream into a named table; the returned StreamingQuery is bound to a name
# so it can later be inspected or stopped (e.g. bronze_query.stop()).
bronze_query = (
    json_df.writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/kafka/bronze/checkpoint")
    .outputMode("append")
    .toTable("kafka.bronze.stock_data")
)

# Leave the query handle as the cell's last expression so it is still shown.
bronze_query

<pyspark.sql.streaming.query.StreamingQuery at 0x7f614020f550>

# Displaying bronze table data (just to show the results; once we set up the job, we don't need to show it in the notebook)

In [0]:
%sql
-- Preview every column of the bronze table that the streaming write populates.
SELECT
  *
FROM
  kafka.bronze.stock_data

json_message
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:59:00"", ""open"": ""415.8800"", ""high"": ""415.9000"", ""low"": ""415.8150"", ""close"": ""415.8150"", ""volume"": ""386""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:58:00"", ""open"": ""415.8600"", ""high"": ""417.2700"", ""low"": ""415.8500"", ""close"": ""415.8800"", ""volume"": ""659""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:57:00"", ""open"": ""415.8700"", ""high"": ""415.9500"", ""low"": ""415.8450"", ""close"": ""415.8999"", ""volume"": ""1562""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:56:00"", ""open"": ""415.8700"", ""high"": ""417.5100"", ""low"": ""415.8450"", ""close"": ""415.8450"", ""volume"": ""103""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:55:00"", ""open"": ""415.8500"", ""high"": ""415.9200"", ""low"": ""415.7500"", ""close"": ""415.8700"", ""volume"": ""93""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:54:00"", ""open"": ""415.8250"", ""high"": ""415.9200"", ""low"": ""415.8250"", ""close"": ""415.9200"", ""volume"": ""42""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:53:00"", ""open"": ""415.8800"", ""high"": ""415.8800"", ""low"": ""415.8250"", ""close"": ""415.8800"", ""volume"": ""43""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:52:00"", ""open"": ""415.8700"", ""high"": ""415.9000"", ""low"": ""415.7600"", ""close"": ""415.8700"", ""volume"": ""118""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:51:00"", ""open"": ""415.8600"", ""high"": ""415.9199"", ""low"": ""415.7300"", ""close"": ""415.8250"", ""volume"": ""73""}"
"{""symbol"": ""MSFT"", ""timestamp"": ""2024-10-04 19:50:00"", ""open"": ""415.8200"", ""high"": ""415.9200"", ""low"": ""415.8200"", ""close"": ""415.9200"", ""volume"": ""46""}"
