In [1]:
!pip install pyspark

Defaulting to user installation because normal site-packages is not writeable


In [2]:
# Import all necessary libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, avg, count, min, max, window
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# 1. Build the Spark session (getOrCreate reuses an existing one if present).
#    The application is named "Q4" and spark.sql.shuffle.partitions is set to 4.
spark = (
    SparkSession.builder
    .appName("Q4")
    .config("spark.sql.shuffle.partitions", "4")
    .getOrCreate()
)

print("Spark Session created.")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/10 22:43:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Spark Session created.


In [6]:
input_directory = "midterm/data/q4/"

# Explicit schema for the incoming JSON records. The timestamp field is
# declared as a string here and converted to a real timestamp further down.
schema = StructType(
    [
        StructField("sensor_id", StringType(), nullable=True),
        StructField("temperature", DoubleType(), nullable=True),
        StructField("timestamp", StringType(), nullable=True),
    ]
)

# Streaming source: JSON files read from the input directory with the schema above.
df_raw = spark.readStream.schema(schema).json(input_directory)

# Replace the string "timestamp" column with its parsed timestamp value, then
# declare a 2-minute watermark on that column (as required by the assignment).
df_sensors = (
    df_raw
    .withColumn("timestamp", to_timestamp(col("timestamp")))
    .withWatermark("timestamp", "2 minutes")
)

print(f"Monitoring directory: {input_directory}")
df_sensors.printSchema()

Monitoring directory: midterm/data/q4/
root
 |-- sensor_id: string (nullable = true)
 |-- temperature: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [7]:
# --- Part A: Simple Aggregations ---
# Per-sensor running statistics: average, number of readings, minimum, maximum.
df_part_A = (
    df_sensors
    .groupBy("sensor_id")
    .agg(
        avg("temperature").alias("avg_temp"),
        count("*").alias("reading_count"),
        min("temperature").alias("min_temp"),
        max("temperature").alias("max_temp"),
    )
)

# Sink configuration:
#   - outputMode "complete": the whole updated result table is written each time.
#   - format "memory" + queryName: results land in the in-memory table
#     'q4_part_a_results', which can then be queried with spark.sql.
#   - trigger: one micro-batch every 10 seconds of processing time.
query_A = (
    df_part_A.writeStream
    .outputMode("complete")
    .format("memory")
    .queryName("q4_part_a_results")
    .trigger(processingTime="10 seconds")
    .start()
)

print("Stream 'q4_part_a_results' started in 'complete' mode.")

25/11/10 22:46:35 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-4b07feee-5e15-4975-ab6e-2664c0c9b96e. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/11/10 22:46:35 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Stream 'q4_part_a_results' started in 'complete' mode.


25/11/10 22:46:47 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000} milliseconds, but spent 11982 milliseconds


In [8]:
# --- Part B: Windowed Aggregations ---

# 1. 5-minute tumbling window: average temperature and reading count across
#    ALL sensors (the window is the only grouping key).
df_part_B_tumbling = (
    df_sensors
    .groupBy(window(col("timestamp"), "5 minutes"))
    .agg(
        avg("temperature").alias("avg_temp_5min"),
        count("*").alias("count_5min"),
    )
    # Flatten the window struct into its start/end bounds.
    .select("window.start", "window.end", "avg_temp_5min", "count_5min")
)

# 2. 10-minute sliding window advancing every 5 minutes: maximum temperature
#    and reading count PER sensor (grouped by sensor_id and window).
df_part_B_sliding = (
    df_sensors
    .groupBy(
        col("sensor_id"),
        window(col("timestamp"), "10 minutes", "5 minutes"),
    )
    .agg(
        max("temperature").alias("max_temp_10min"),
        count("*").alias("count_10min"),
    )
    # Flatten the window struct into its start/end bounds.
    .select("window.start", "window.end", "sensor_id", "max_temp_10min", "count_10min")
)

# 3. Launch both Part B queries.
#    outputMode "append": only completed windows are written to the sink.
#    Each query writes to its own in-memory table (named by queryName) and
#    triggers a micro-batch every 10 seconds of processing time.
query_B_tumbling = (
    df_part_B_tumbling.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("q4_part_b_tumbling_results")
    .trigger(processingTime="10 seconds")
    .start()
)

query_B_sliding = (
    df_part_B_sliding.writeStream
    .outputMode("append")
    .format("memory")
    .queryName("q4_part_b_sliding_results")
    .trigger(processingTime="10 seconds")
    .start()
)

print("Stream 'q4_part_b_tumbling_results' started in 'append' mode.")
print("Stream 'q4_part_b_sliding_results' started in 'append' mode.")

25/11/10 22:46:57 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-6c438b60-fc4d-431b-90a8-ef93944276f8. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/11/10 22:46:57 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/11/10 22:46:58 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-36b90823-a6f5-4917-aa11-b7dd69070a98. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
25/11/10 22:46:58 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not support

Stream 'q4_part_b_tumbling_results' started in 'append' mode.
Stream 'q4_part_b_sliding_results' started in 'append' mode.


                                                                                

In [9]:
import time
# Wait until every stream has processed and committed all input that is
# currently available, instead of sleeping for a fixed 10 seconds. A fixed
# sleep is racy: a single micro-batch can take longer than the sleep (the
# Part A query logged a batch that needed ~12 s), in which case the in-memory
# tables would be queried before the results were written.
# NOTE: processAllAvailable() blocks for as long as new data keeps arriving,
# so it is only appropriate for a finite set of input files like this one.
print("Waiting for streams to process all available data...")
for query in (query_A, query_B_tumbling, query_B_sliding):
    query.processAllAvailable()

# Each streaming query writes to an in-memory table named after its queryName;
# read those tables back with plain SQL and display them untruncated.
print("\n--- Part A Results (Running Aggregations) ---")
spark.sql("SELECT * FROM q4_part_a_results ORDER BY sensor_id").show(truncate=False)

print("\n--- Part B Results (5-min Tumbling Window) ---")
spark.sql("SELECT * FROM q4_part_b_tumbling_results ORDER BY start").show(truncate=False)

print("\n--- Part B Results (10-min Sliding Window) ---")
spark.sql("SELECT * FROM q4_part_b_sliding_results ORDER BY start, sensor_id").show(truncate=False)

Waiting 10 seconds for streams to process...

--- Part A Results (Running Aggregations) ---
+---------+-----------------+-------------+--------+--------+
|sensor_id|avg_temp         |reading_count|min_temp|max_temp|
+---------+-----------------+-------------+--------+--------+
|S001     |72.5676036400404 |989          |55.16   |86.52   |
|S002     |72.29238505747124|1044         |55.79   |88.86   |
|S003     |72.65653725078695|953          |56.18   |89.88   |
|S004     |72.31032901296105|1003         |56.18   |89.93   |
|S005     |72.89840625000005|960          |56.43   |89.96   |
|S006     |72.53573964497033|1014         |55.18   |89.75   |
|S007     |72.57789686552077|989          |55.17   |88.78   |
|S008     |72.28547169811334|1007         |56.5    |89.89   |
|S009     |72.79977905859754|1041         |56.16   |89.82   |
|S010     |72.53289000000001|1000         |56.0    |89.33   |
+---------+-----------------+-------------+--------+--------+


--- Part B Results (5-min Tumbling Win

In [10]:
print("Stopping all streams...")
# Stop every active streaming query. A failure to stop one query is reported
# and does not prevent the remaining queries from being stopped.
for active_query in spark.streams.active:
    print(f"Stopping stream: {active_query.name}")
    try:
        active_query.stop()
    except Exception as err:
        print(f"Could not stop stream {active_query.name}: {err}")

# With the streams handled, shut down the Spark session itself.
spark.stop()

Stopping all streams...
Stopping stream: q4_part_b_tumbling_results
Stopping stream: q4_part_b_sliding_results
Stopping stream: q4_part_a_results


25/11/10 22:47:56 WARN DAGScheduler: Failed to cancel job group c522564e-358a-4815-a335-9eb1a0951e77. Cannot find active jobs for it.
25/11/10 22:47:56 WARN DAGScheduler: Failed to cancel job group c522564e-358a-4815-a335-9eb1a0951e77. Cannot find active jobs for it.
25/11/10 22:47:56 WARN DAGScheduler: Failed to cancel job group c0793d97-e1fe-457f-9836-5caa290d03b6. Cannot find active jobs for it.
25/11/10 22:47:56 WARN DAGScheduler: Failed to cancel job group c0793d97-e1fe-457f-9836-5caa290d03b6. Cannot find active jobs for it.
25/11/10 22:47:56 WARN DAGScheduler: Failed to cancel job group cae7a6a2-2e10-4425-a654-ee2dae1ec83b. Cannot find active jobs for it.
25/11/10 22:47:56 WARN DAGScheduler: Failed to cancel job group cae7a6a2-2e10-4425-a654-ee2dae1ec83b. Cannot find active jobs for it.
25/11/10 22:48:45 WARN StateStore: Error running maintenance thread
java.lang.IllegalStateException: SparkEnv not active, cannot do maintenance on StateStores
	at org.apache.spark.sql.execution.st