In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp

from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Build (or reuse) the Spark session used by the Silver layer job.
# The session-wide streaming checkpoint location is set here as a default.
spark = (
    SparkSession.builder
    .appName("IoT_Silver_Layer")
    .config(
        "spark.sql.streaming.checkpointLocation",
        "/home/jovyan/lakehouse/checkpoints/silver",
    )
    .getOrCreate()
)

# Limit Spark's own logging to WARN and above.
spark.sparkContext.setLogLevel("WARN")

# Schema applied when reading the Bronze files: every sensor field (and the
# source timestamp) is declared as a nullable string, followed by a nullable
# timestamp column named kafka_arrival_time.
schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "timestamp",
            "TP2",
            "TP3",
            "H1",
            "DV_pressure",
            "Reservoirs",
            "Oil_temperature",
            "Motor_current",
            "COMP",
            "DV_eletric",
            "Towers",
            "MPG",
            "LPS",
            "Pressure_switch",
            "Oil_level",
            "Caudal_Impulses",
        )
    ]
    + [StructField("kafka_arrival_time", TimestampType(), True)]
)

print("âœ… Silver Layer Session Ready (Schema Fixed)")

âœ… Silver Layer Session Ready (Schema Fixed)


Read Bronze & Clean Data

In [4]:
# 1. Open a streaming reader over the Bronze parquet folder, applying the
#    explicit schema defined earlier in the notebook.
bronze_df = (
    spark.readStream
    .schema(schema)
    .parquet("/home/jovyan/lakehouse/bronze")
)

# 2. Cleaning:
#    - parse the string timestamp into a real timestamp,
#    - cast the selected sensor readings from string to double,
#    - keep only rows whose TP2 value is non-null after the cast.
silver_df = bronze_df.withColumn("timestamp", to_timestamp(col("timestamp")))
for _numeric_field in ("TP2", "TP3", "Oil_temperature", "Motor_current"):
    silver_df = silver_df.withColumn(
        _numeric_field, col(_numeric_field).cast(DoubleType())
    )
silver_df = silver_df.where(col("TP2").isNotNull())

print("âœ… Data Cleaning Logic Applied")
silver_df.printSchema()

âœ… Data Cleaning Logic Applied
root
 |-- timestamp: timestamp (nullable = true)
 |-- TP2: double (nullable = true)
 |-- TP3: double (nullable = true)
 |-- H1: string (nullable = true)
 |-- DV_pressure: string (nullable = true)
 |-- Reservoirs: string (nullable = true)
 |-- Oil_temperature: double (nullable = true)
 |-- Motor_current: double (nullable = true)
 |-- COMP: string (nullable = true)
 |-- DV_eletric: string (nullable = true)
 |-- Towers: string (nullable = true)
 |-- MPG: string (nullable = true)
 |-- LPS: string (nullable = true)
 |-- Pressure_switch: string (nullable = true)
 |-- Oil_level: string (nullable = true)
 |-- Caudal_Impulses: string (nullable = true)
 |-- kafka_arrival_time: timestamp (nullable = true)



Write to Silver Layer

In [None]:
# Start the streaming write of the cleaned data to the Silver folder:
# parquet sink, append mode, one micro-batch trigger every 5 seconds, with an
# explicit checkpoint directory for this query.
query = (
    silver_df.writeStream
    .format("parquet")
    .outputMode("append")
    .option("path", "/home/jovyan/lakehouse/silver")
    .option("checkpointLocation", "/home/jovyan/lakehouse/checkpoints/silver")
    .trigger(processingTime='5 seconds')
    .start()
)

print(f"ðŸš€ Streaming to Silver Layer started... RunId: {query.runId}")

# awaitTermination() blocks this cell until the query ends. Without cleanup,
# interrupting the kernel (or an error surfacing from the wait) would leave
# the query running in the background, still writing to the same sink and
# checkpoint directory. The finally clause always stops the query; the
# original exception, if any, still propagates to the notebook.
try:
    query.awaitTermination()
finally:
    query.stop()

ðŸš€ Streaming to Silver Layer started... RunId: d3d6f99f-ea38-434b-8f17-0e9cb6581e59
