In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

# Build (or reuse) the Spark session for the Bronze ingestion notebook.
# - spark.jars.packages carries the Maven coordinates of the Kafka source
#   connector, so the JAR does not have to be installed by hand.
# - spark.sql.streaming.checkpointLocation sets a session-wide default
#   checkpoint directory for streaming queries.
spark = (
    SparkSession.builder
    .appName("IoT_Bronze_Layer")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0")
    .config("spark.sql.streaming.checkpointLocation", "/home/jovyan/lakehouse/checkpoints/bronze")
    .getOrCreate()
)

# Keep the driver log quiet: only WARN and above are emitted.
spark.sparkContext.setLogLevel("WARN")
print("âœ… Spark Session Created Successfully!")

âœ… Spark Session Created Successfully!


Read from Kafka (Port 29092)

In [5]:
# 1. Schema of the JSON payload carried in each Kafka message.
#    Every field is declared as a nullable string; the names are intended to
#    match the Metro PT CSV columns, and their order here is the column order
#    of the resulting DataFrame.
schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "timestamp",
        "TP2",
        "TP3",
        "H1",
        "DV_pressure",
        "Reservoirs",
        "Oil_temperature",
        "Motor_current",
        "COMP",
        "DV_eletric",
        "Towers",
        "MPG",
        "LPS",
        "Pressure_switch",
        "Oil_level",
        "Caudal_Impulses",
    )
])

# 2. Streaming source: subscribe to the "iot_sensors" topic.
#    NOTE: the broker is addressed as kafka:29092 (port 29092, on purpose),
#    and reading starts from the earliest available offsets.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:29092")
    .option("subscribe", "iot_sensors")
    .option("startingOffsets", "earliest")
    .load()
)

# 3. Decode the message value as a string, parse it as JSON with the schema
#    above, and flatten the parsed struct into top-level columns. The Kafka
#    record's own "timestamp" column is renamed to "kafka_arrival_time" so it
#    does not clash with the payload's "timestamp" field.
parsed_df = (
    kafka_df
    .select(
        from_json(col("value").cast("string"), schema).alias("data"),
        col("timestamp").alias("kafka_arrival_time"),
    )
    .select("data.*", "kafka_arrival_time")
)

print("âœ… Data Stream Initialized.")
parsed_df.printSchema()

âœ… Data Stream Initialized.
root
 |-- timestamp: string (nullable = true)
 |-- TP2: string (nullable = true)
 |-- TP3: string (nullable = true)
 |-- H1: string (nullable = true)
 |-- DV_pressure: string (nullable = true)
 |-- Reservoirs: string (nullable = true)
 |-- Oil_temperature: string (nullable = true)
 |-- Motor_current: string (nullable = true)
 |-- COMP: string (nullable = true)
 |-- DV_eletric: string (nullable = true)
 |-- Towers: string (nullable = true)
 |-- MPG: string (nullable = true)
 |-- LPS: string (nullable = true)
 |-- Pressure_switch: string (nullable = true)
 |-- Oil_level: string (nullable = true)
 |-- Caudal_Impulses: string (nullable = true)
 |-- kafka_arrival_time: timestamp (nullable = true)



Write to Bronze Layer (Parquet)

In [None]:
# Continuously append the parsed records to the Bronze layer as Parquet files.
# A micro-batch is triggered every 5 seconds; progress is tracked in the
# checkpoint directory given below.
query = (
    parsed_df.writeStream
    .format("parquet")
    .outputMode("append")
    .option("path", "/home/jovyan/lakehouse/bronze")
    .option("checkpointLocation", "/home/jovyan/lakehouse/checkpoints/bronze")
    .trigger(processingTime='5 seconds')
    .start()
)

print(f"ðŸš€ Streaming to Bronze Layer started... RunId: {query.runId}")
try:
    # Blocks this cell until the query terminates (or the cell is interrupted).
    query.awaitTermination()
finally:
    # awaitTermination() only waits; it does not stop the query. Without an
    # explicit stop(), interrupting this cell would leave the query running in
    # the background, still writing and still holding the checkpoint, so
    # re-running the cell would collide with the active query.
    query.stop()

ðŸš€ Streaming to Bronze Layer started... RunId: ab3132e1-97c6-47d6-8075-3e8f0d7a537d
