In [1]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('IoT TimeSeries')
    .getOrCreate()
)

In [4]:
from pyspark.sql.types import *
from datetime import datetime, timedelta
import random

# Schema for IoT sensor data.
# One row per reading: a required sensor id and timestamp, a nullable nested
# `location` struct of doubles, a nullable name -> value `readings` map, and a
# nullable free-text status.
schema = (
    StructType()
    .add("sensor_id", StringType(), False)
    .add("timestamp", TimestampType(), False)
    .add(
        "location",
        StructType()
        .add("latitude", DoubleType(), True)
        .add("longitude", DoubleType(), True)
        .add("altitude", DoubleType(), True),
        True,
    )
    .add("readings", MapType(StringType(), DoubleType()), True)
    .add("status", StringType(), True)
)

# Generate synthetic IoT data: 50 distinct sensors, each with a single reading,
# with timestamps spaced five minutes apart starting from "now".
base_time = datetime.now()
data = []
for i in range(50):
    # Random draws happen in the order location -> readings -> status.
    location = (
        40.7128 + random.uniform(-0.1, 0.1),
        -74.0060 + random.uniform(-0.1, 0.1),
        random.uniform(0, 100),
    )
    readings = {
        "temperature": round(random.uniform(20, 30), 2),
        "humidity": round(random.uniform(40, 60), 2),
        "pressure": round(random.uniform(1000, 1020), 2),
    }
    status = random.choice(["active", "inactive", "maintenance"])
    data.append(
        (f"sensor_{i:03d}", base_time + timedelta(minutes=i*5), location, readings, status)
    )

# Turn the generated rows into a DataFrame with the explicit schema, then
# preview it without truncating the wide struct/map columns.
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)

+----------+--------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+
|sensor_id |timestamp                 |location                                                    |readings                                                      |status     |
+----------+--------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+
|sensor_000|2025-06-08 18:05:04.477832|{40.71583239247499, -73.95319832337543, 26.30292317683288}  |{temperature -> 24.1, humidity -> 56.64, pressure -> 1010.61} |active     |
|sensor_001|2025-06-08 18:10:04.477832|{40.70302833414744, -73.95707354591102, 36.59330992040075}  |{temperature -> 23.44, humidity -> 51.64, pressure -> 1001.57}|maintenance|
|sensor_002|2025-06-08 18:15:04.477832|{40.64648660903925, -74.01601885380882, 87.71190194056301}  |{temperature -> 29.4

In [None]:
# Register the current `df` as the temporary view "iot_sensors" so it can be
# queried with spark.sql(...). The view is bound to the DataFrame `df` refers to
# at this point; rebinding `df` later does not update it.
df.createOrReplaceTempView(name="iot_sensors")

# Single Sensor Over Time

In [None]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from datetime import datetime, timedelta
import random

# Schema for IoT sensor data.
# sensor_id and timestamp are required; every other field may be null.
schema = StructType(fields=[
    StructField(name="sensor_id", dataType=StringType(), nullable=False),
    StructField(name="timestamp", dataType=TimestampType(), nullable=False),
    StructField(
        name="location",
        dataType=StructType(fields=[
            StructField(name="latitude", dataType=DoubleType(), nullable=True),
            StructField(name="longitude", dataType=DoubleType(), nullable=True),
            StructField(name="altitude", dataType=DoubleType(), nullable=True),
        ]),
        nullable=True,
    ),
    StructField(
        name="readings",
        dataType=MapType(keyType=StringType(), valueType=DoubleType()),
        nullable=True,
    ),
    StructField(name="status", dataType=StringType(), nullable=True),
])

# Generate synthetic IoT data: 10 sensors, each with 50 readings taken five
# minutes apart. All sensors share the same sequence of timestamps.
#
# NOTE: this cell also runs `from pyspark.sql.functions import *`, which rebinds
# the bare name `round` to Spark's column function. That function expects a
# column (or column name), not a Python float, so the numeric rounding below
# goes through `builtins.round` explicitly to stay correct on a fresh kernel
# and on re-runs.
import builtins

base_time = datetime.now()
data = []
for sensor_id in range(10):
    for i in range(50):
        sensor_data = (
            f"sensor_{sensor_id:03d}",
            base_time + timedelta(minutes=i*5),
            # (latitude, longitude, altitude), jittered around a fixed point
            (40.7128 + random.uniform(-0.1, 0.1), -74.0060 + random.uniform(-0.1, 0.1), random.uniform(0, 100)),
            {
                "temperature": builtins.round(random.uniform(20, 30), 2),
                "humidity": builtins.round(random.uniform(40, 60), 2),
                "pressure": builtins.round(random.uniform(1000, 1020), 2)
            },
            random.choice(["active", "inactive", "maintenance"])
        )
        data.append(sensor_data)

# Build the multi-reading DataFrame from the generated rows and preview it
# with full-width columns.
df = spark.createDataFrame(data=data, schema=schema)
df.show(truncate=False)

+----------+-------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+
|sensor_id |timestamp                |location                                                    |readings                                                      |status     |
+----------+-------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+
|sensor_000|2025-06-10 09:16:38.23774|{40.676752359137545, -73.90856554030503, 19.944268473625925}|{temperature -> 28.87, humidity -> 44.55, pressure -> 1013.33}|maintenance|
|sensor_000|2025-06-10 09:21:38.23774|{40.71272586321469, -74.01885875701497, 65.87936777648402}  |{temperature -> 20.49, humidity -> 51.66, pressure -> 1014.04}|active     |
|sensor_000|2025-06-10 09:26:38.23774|{40.74857491958641, -74.00195301153394, 70.04078332196812}  |{temperature -> 26.27, hum

In [7]:
# Preview the first 25 readings in time order, ties broken by sensor id.
df.sort("timestamp", "sensor_id").show(n=25, truncate=False)

+----------+--------------------------+-----------------------------------------------------------+--------------------------------------------------------------+-----------+
|sensor_id |timestamp                 |location                                                   |readings                                                      |status     |
+----------+--------------------------+-----------------------------------------------------------+--------------------------------------------------------------+-----------+
|sensor_000|2025-06-09 13:04:23.495049|{40.75150213848252, -73.92060981760952, 9.279084177906661} |{temperature -> 22.07, humidity -> 45.59, pressure -> 1012.22}|active     |
|sensor_001|2025-06-09 13:04:23.495049|{40.71220830802617, -73.93228665689061, 48.4100003910702}  |{temperature -> 20.02, humidity -> 55.69, pressure -> 1009.13}|inactive   |
|sensor_002|2025-06-09 13:04:23.495049|{40.6495966668836, -74.04535417348842, 71.74289080255328}  |{temperature -> 22.5, humi

In [None]:
# Label each reading with a region derived from its latitude, rounded to four
# decimals. `between` is inclusive on both ends, so a value that sits exactly on
# a shared boundary (e.g. 40.7) takes the first matching label in the chain.
# NOTE(review): `round` is assumed to be the Spark column function brought in by
# the wildcard `pyspark.sql.functions` import, not the Python builtin.
lat_block = col("latitude_block")
region_label = (
    when(lat_block.between(40.7, 40.8), "East Coast")
    .when(lat_block.between(40.6, 40.7), "Central")
    .when(lat_block.between(40.5, 40.6), "West Coast")
    .otherwise("Out-of-Bounds")
)
df_coast = (
    df
    .withColumn("latitude_block", round(col("location.latitude"), 4))
    .withColumn("region", region_label)
)

df_coast.show(truncate=False)

+----------+-------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+--------------+-------------+
|sensor_id |timestamp                |location                                                    |readings                                                      |status     |latitude_block|region       |
+----------+-------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+--------------+-------------+
|sensor_000|2025-06-10 09:16:38.23774|{40.676752359137545, -73.90856554030503, 19.944268473625925}|{temperature -> 28.87, humidity -> 44.55, pressure -> 1013.33}|maintenance|40.6768       |West Coast   |
|sensor_000|2025-06-10 09:21:38.23774|{40.71272586321469, -74.01885875701497, 65.87936777648402}  |{temperature -> 20.49, humidity -> 51.66, pressure -> 1014.04}|active     |40.7127   

In [19]:
# 15-minute intervals
# Floor each reading's epoch seconds down to a multiple of the bucket width and
# convert that number straight back into a timestamp.
#
# The earlier form went through from_unixtime(...).cast("timestamp"), i.e. it
# rendered the instant as a local-time string and parsed it again. A local-time
# string is ambiguous during a DST fall-back hour, so the bucket could land on
# the wrong instant there; staying numeric avoids that and the extra
# format/parse step.
BUCKET_SECONDS = 15 * 60
df = df.withColumn(
    "minute_15_bucket",
    timestamp_seconds(
        floor(unix_timestamp("timestamp") / BUCKET_SECONDS) * BUCKET_SECONDS
    ),
)
df.show(truncate=False)

+----------+--------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+-------------------+
|sensor_id |timestamp                 |location                                                    |readings                                                      |status     |minute_15_bucket   |
+----------+--------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+-------------------+
|sensor_000|2025-06-09 13:04:23.495049|{40.75150213848252, -73.92060981760952, 9.279084177906661}  |{temperature -> 22.07, humidity -> 45.59, pressure -> 1012.22}|active     |2025-06-09 13:00:00|
|sensor_000|2025-06-09 13:09:23.495049|{40.63527953792914, -73.9685280725309, 88.80809613033718}   |{temperature -> 29.82, humidity -> 50.13, pressure -> 1001.28}|maintenance|2025-06-09 13:00:00|
|sensor_000|2025-06-

In [7]:
# Readings of one sensor in time order; shared by the rolling frame and the lag.
sensor_timeline = Window.partitionBy("sensor_id").orderBy("timestamp")

# Current row plus the 11 rows before it. With readings generated five minutes
# apart, 12 rows span one rolling hour.
time_window = sensor_timeline.rowsBetween(-11, Window.currentRow)

# Latitude of the same sensor's previous reading (null for its first reading).
previous_latitude = lag("location.latitude").over(sensor_timeline)

df_lag = (
    df
    .withColumn("rolliing_avg_temp", avg(col("readings.temperature")).over(time_window))
    .withColumn("lat_change", col("location.latitude") - previous_latitude)
)

df_lag.orderBy("sensor_id", "timestamp").show(25, truncate=False)

+----------+-------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+------------------+---------------------+
|sensor_id |timestamp                |location                                                    |readings                                                      |status     |rolliing_avg_temp |lat_change           |
+----------+-------------------------+------------------------------------------------------------+--------------------------------------------------------------+-----------+------------------+---------------------+
|sensor_000|2025-06-10 09:16:38.23774|{40.676752359137545, -73.90856554030503, 19.944268473625925}|{temperature -> 28.87, humidity -> 44.55, pressure -> 1013.33}|maintenance|28.87             |NULL                 |
|sensor_000|2025-06-10 09:21:38.23774|{40.71272586321469, -74.01885875701497, 65.87936777648402}  |{temperature -> 20.49, humidity -> 51

In [12]:

# Rolling statistics per sensor over `time_window`:
#   rolliing_avg_temp - mean temperature over the frame (recomputed here with
#                       the same expression, overwriting the existing column)
#   lat_change_hr     - mean of the step-to-step latitude change over the frame,
#                       rounded to four decimals
rolling_avg_temperature = avg(col("readings.temperature")).over(time_window)
rolling_avg_lat_change = round(avg(col("lat_change")).over(time_window), 4)

df_timestats = (
    df_lag
    .withColumn("rolliing_avg_temp", rolling_avg_temperature)
    .withColumn("lat_change_hr", rolling_avg_lat_change)
)

summary_columns = [
    "sensor_id",
    "timestamp",
    "readings.temperature",
    "rolliing_avg_temp",
    "lat_change",
    "lat_change_hr",
]
df_timestats.select(*summary_columns).orderBy("sensor_id", "timestamp").show(25, truncate=False)

+----------+-------------------------+-----------+------------------+---------------------+-------------+
|sensor_id |timestamp                |temperature|rolliing_avg_temp |lat_change           |lat_change_hr|
+----------+-------------------------+-----------+------------------+---------------------+-------------+
|sensor_000|2025-06-10 09:16:38.23774|28.87      |28.87             |NULL                 |NULL         |
|sensor_000|2025-06-10 09:21:38.23774|20.49      |24.68             |0.03597350407714828  |0.036        |
|sensor_000|2025-06-10 09:26:38.23774|26.27      |25.209999999999997|0.03584905637171687  |0.0359       |
|sensor_000|2025-06-10 09:31:38.23774|28.7       |26.0825           |0.028943110139209693 |0.0336       |
|sensor_000|2025-06-10 09:36:38.23774|29.49      |26.764            |-0.042559986822190865|0.0146       |
|sensor_000|2025-06-10 09:41:38.23774|24.87      |26.448333333333334|-0.09269980548027235 |-0.0069      |
|sensor_000|2025-06-10 09:46:38.23774|20.82   