In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
    .appName("Test") \
    .master("local[*]") \
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.1"
    ) \
    .getOrCreate()

df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")
    .option("subscribe", "smart-meter-data")
    .load()
)


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/19 18:17:55 WARN Utils: Your hostname, yegane, resolves to a loopback address: 127.0.1.1; using 192.168.1.6 instead (on interface wlp0s20f3)
25/11/19 18:17:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/home/yegane/Documents/smart-meter-simulation/.venv/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/yegane/.ivy2.5.2/cache
The jars for the packages stored in: /home/yegane/.ivy2.5.2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-6404ffa7-36d7-4f19-a6df-7add3dbbe59a;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;4.0.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;4.0.1 in central
	found org.apache.kafka#kafka-clients

In [3]:
df.show()

                                                                                

+--------------------+--------------------+----------------+---------+------+--------------------+-------------+
|                 key|               value|           topic|partition|offset|           timestamp|timestampType|
+--------------------+--------------------+----------------+---------+------+--------------------+-------------+
|[62 75 69 6C 64 6...|[7B 22 6D 65 74 6...|smart-meter-data|        0|     0|2025-11-19 18:16:...|            0|
|[62 75 69 6C 64 6...|[7B 22 6D 65 74 6...|smart-meter-data|        0|     1|2025-11-19 18:16:...|            0|
|[62 75 69 6C 64 6...|[7B 22 6D 65 74 6...|smart-meter-data|        0|     2|2025-11-19 18:16:...|            0|
|[62 75 69 6C 64 6...|[7B 22 6D 65 74 6...|smart-meter-data|        0|     3|2025-11-19 18:16:...|            0|
|[62 75 69 6C 64 6...|[7B 22 6D 65 74 6...|smart-meter-data|        0|     4|2025-11-19 18:16:...|            0|
|[62 75 69 6C 64 6...|[7B 22 6D 65 74 6...|smart-meter-data|        0|     5|2025-11-19 18:16:..

### Create Schema

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, TimestampType

meter_schema = StructType([
    StructField("meter_id", IntegerType(), nullable=False),
    StructField("building_id", IntegerType(), nullable=False),
    StructField("timestamp", TimestampType(), nullable=False),
    StructField("power_kw", FloatType(), nullable=False),
    StructField("voltage_v", FloatType(), nullable=False),
    StructField("status", IntegerType(), nullable=False)
])

### Parse Raw DataFrame

In [5]:
from pyspark.sql.functions import col, expr, from_json

raw_df = df.selectExpr("CAST(value AS STRING)", "CAST(key AS STRING)", "topic", "partition", "offset", "timestamp")

In [6]:
df = raw_df.select(from_json(col("value"), meter_schema).alias("data")).select("data.*") 

In [7]:
df = df.filter((df.status >= 0) & (df.status <= 4))

In [8]:
df.show()

+--------+-----------+--------------------+--------+---------+------+
|meter_id|building_id|           timestamp|power_kw|voltage_v|status|
+--------+-----------+--------------------+--------+---------+------+
|       1|          1|2025-11-19 18:16:...|    0.73|    229.8|     0|
|       2|          1|2025-11-19 18:16:...|    0.65|    237.3|     0|
|       3|          1|2025-11-19 18:16:...|    0.72|    231.8|     0|
|       4|          1|2025-11-19 18:16:...|    0.73|    221.3|     0|
|       5|          1|2025-11-19 18:16:...|    0.51|    245.0|     0|
|       6|          1|2025-11-19 18:16:...|    0.69|    230.1|     0|
|       7|          1|2025-11-19 18:16:...|    0.78|    227.4|     0|
|       8|          1|2025-11-19 18:16:...|    0.81|    230.7|     0|
|       9|          1|2025-11-19 18:16:...|    0.61|    231.2|     0|
|      10|          1|2025-11-19 18:16:...|    0.63|    225.5|     0|
|      11|          2|2025-11-19 18:16:...|    0.71|    233.7|     0|
|      12|          

### House Hourly Power Consumption

In [11]:
from pyspark.sql.functions import window, col, max, min, avg, sum

hourly_house_power_consumption = df.withWatermark("timestamp", "1 hour").groupBy(
    "meter_id",
    window("timestamp", "1 hour")
).agg(
    avg("power_kw").alias("avg_power"),
    max("power_kw").alias("max_power"),
    min("power_kw").alias("min_power"), 
    sum("power_kw").alias("total_power")
).withColumn("window_start", col("window.start")).withColumn("window_end", col("window.end")).drop(col("window"))

In [12]:
hourly_house_power_consumption.show()

                                                                                

+--------+------------------+---------+---------+------------------+-------------------+-------------------+
|meter_id|         avg_power|max_power|min_power|       total_power|       window_start|         window_end|
+--------+------------------+---------+---------+------------------+-------------------+-------------------+
|      67|0.7012903228882821|     0.87|     0.53|21.740000009536743|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      34| 0.701249998062849|     0.95|     0.53| 22.43999993801117|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      74|0.7183870990430156|     0.91|     0.47| 22.27000007033348|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      91|0.6641935485024606|     0.87|     0.45| 20.59000000357628|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      35|0.6887500006705523|     0.89|     0.54|22.040000021457672|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      54|0.6900000022724271|     0.88|     0.43|22.080000072717667|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      36|  0.68624

### Peak Hours / Minimum

In [11]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# 24h sliding window
peak_hour_df = (
    df.withWatermark("timestamp", "25 hours")
      .groupBy(
          "meter_id",
          window("timestamp", "24 hours", "1 hour")
      )
      .agg(
          avg("power_kw").alias("avg_power"),
          max("power_kw").alias("max_power"),
          min("power_kw").alias("min_power"),
          sum("power_kw").alias("total_power")
      )
)

# ---- PEAK HOUR (highest avg power) ----
peak_w = Window.orderBy(col("avg_power").desc())

peak_hour = (
    peak_hour_df
    .withColumn("rn", row_number().over(peak_w))
    .filter(col("rn") == 1)
    .select(
        col("window.start").alias("peak_hour_start"),
        col("window.end").alias("peak_hour_end"),
        "total_power",
        "max_power",
        "avg_power"
    )
)

# ---- MINIMUM HOUR (lowest avg power) ----
min_w = Window.orderBy(col("avg_power").asc())

min_hour = (
    peak_hour_df
    .withColumn("rn", row_number().over(min_w))
    .filter(col("rn") == 1)
    .select(
        col("window.start").alias("min_hour_start"),
        col("window.end").alias("min_hour_end"),
        "total_power",
        "min_power",
        "avg_power"
    )
)

In [12]:
peak_hour.show()

min_hour.show()

25/11/18 19:31:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 19:31:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 19:31:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 19:31:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 19:31:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 19:31:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 1

+-------------------+-------------------+------------------+---------+------------------+
|    peak_hour_start|      peak_hour_end|       total_power|max_power|         avg_power|
+-------------------+-------------------+------------------+---------+------------------+
|2025-11-18 19:30:00|2025-11-19 19:30:00|14.979999959468842|     1.32|1.0699999971049172|
+-------------------+-------------------+------------------+---------+------------------+



25/11/18 19:31:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 19:31:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/11/18 19:31:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------------------+-------------------+------------------+---------+------------------+
|     min_hour_start|       min_hour_end|       total_power|min_power|         avg_power|
+-------------------+-------------------+------------------+---------+------------------+
|2025-11-17 17:30:00|2025-11-18 17:30:00|14.260000050067902|     0.32|0.5484615403872269|
+-------------------+-------------------+------------------+---------+------------------+



                                                                                

### Peak Hour / Min Per Meter

In [15]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

# 24h sliding window
peak_hour_meter_df = (
    df.withWatermark("timestamp", "25 hours")
      .groupBy(
          "meter_id",
          window("timestamp", "24 hours", "1 hour")
      )
      .agg(
          avg("power_kw").alias("avg_power"),
          max("power_kw").alias("max_power"),
          min("power_kw").alias("min_power"),
          sum("power_kw").alias("total_power")
      )
)

# ---- PEAK HOUR ----
peak_w = Window.partitionBy("meter_id").orderBy(col("avg_power").desc())

peak_hour_meter = (
    peak_hour_meter_df
    .withColumn("rn", row_number().over(peak_w))
    .filter(col("rn") == 1)
    .select(
        col("window.start").alias("peak_hour_start"),
        col("window.end").alias("peak_hour_end"),
        "total_power",
        "max_power",
        "avg_power"
    )
)

# ---- MIN HOUR ----
min_w = Window.partitionBy("meter_id").orderBy(col("avg_power").asc())

min_hour_meter = (
    peak_hour_meter_df
    .withColumn("rn", row_number().over(min_w))
    .filter(col("rn") == 1)
    .select(
        col("window.start").alias("min_hour_start"),
        col("window.end").alias("min_hour_end"),
        "total_power",
        "min_power",
        "avg_power"
    )
)


In [16]:
peak_hour_meter.show()

min_hour_meter.show()

                                                                                

+-------------------+-------------------+------------------+---------+------------------+
|    peak_hour_start|      peak_hour_end|       total_power|max_power|         avg_power|
+-------------------+-------------------+------------------+---------+------------------+
|2025-11-18 19:30:00|2025-11-19 19:30:00|32.999999821186066|     1.17|0.9705882300348843|
|2025-11-18 19:30:00|2025-11-19 19:30:00| 33.55999964475632|      1.3|0.9870588130810681|
|2025-11-18 19:30:00|2025-11-19 19:30:00| 34.77000004053116|     1.21|1.0226470600156223|
|2025-11-18 19:30:00|2025-11-19 19:30:00|34.309999883174896|      1.2| 1.009117643622791|
|2025-11-18 19:30:00|2025-11-19 19:30:00|34.170000076293945|     1.32|1.0050000022439396|
|2025-11-18 17:30:00|2025-11-19 17:30:00| 34.68999993801117|     1.21|0.9911428553717477|
|2025-11-18 19:30:00|2025-11-19 19:30:00| 34.01999980211258|     1.18|1.0005882294738995|
|2025-11-18 19:30:00|2025-11-19 19:30:00| 34.52999997138977|     1.22|1.0155882344526403|
|2025-11-1

[Stage 23:>                                                         (0 + 1) / 1]

+-------------------+-------------------+------------------+---------+------------------+
|     min_hour_start|       min_hour_end|       total_power|min_power|         avg_power|
+-------------------+-------------------+------------------+---------+------------------+
|2025-11-17 18:30:00|2025-11-18 18:30:00|15.560000151395798|     0.38|0.5984615442844538|
|2025-11-17 17:30:00|2025-11-18 17:30:00|  16.1599999666214|     0.44|0.6215384602546692|
|2025-11-17 17:30:00|2025-11-18 17:30:00|15.620000094175339|     0.38|0.6007692343913592|
|2025-11-17 17:30:00|2025-11-18 17:30:00|15.269999951124191|     0.43|0.5873076904278535|
|2025-11-17 17:30:00|2025-11-18 17:30:00|14.409999877214432|     0.41|0.5542307645082474|
|2025-11-17 18:30:00|2025-11-18 18:30:00|15.709999978542328|     0.42|0.6042307684054742|
|2025-11-17 18:30:00|2025-11-18 18:30:00|14.749999910593033|     0.35|0.5673076888689628|
|2025-11-17 17:30:00|2025-11-18 17:30:00|15.749999970197678|     0.42|0.6057692296229876|
|2025-11-1

                                                                                

### Building Hourly Power Consumption

In [13]:
hourly_building_power_consumption = df.withWatermark("timestamp", "2 hours").groupBy(
    "building_id",
    window("timestamp", "1 hour")
).agg(
    avg("power_kw").alias("avg_power"),
    max("power_kw").alias("max_power"),
    min("power_kw").alias("min_power"),
    sum("power_kw").alias("sum_power")
).withColumn("window_start", col("window.start")).withColumn("window_end", col("window.end")).drop(col("window"))

In [14]:
hourly_building_power_consumption.show()

+-----------+------------------+---------+---------+------------------+-------------------+-------------------+
|building_id|         avg_power|max_power|min_power|         sum_power|       window_start|         window_end|
+-----------+------------------+---------+---------+------------------+-------------------+-------------------+
|          3|0.6983750005252659|     0.99|     0.39| 223.4800001680851|2025-11-19 17:30:00|2025-11-19 18:30:00|
|          1|0.7034375005401671|     1.03|     0.37|225.10000017285347|2025-11-19 17:30:00|2025-11-19 18:30:00|
|          2|0.7035312508232892|      1.0|     0.44|225.13000026345253|2025-11-19 17:30:00|2025-11-19 18:30:00|
|         10| 0.689419353873499|     0.97|     0.37|213.71999970078468|2025-11-19 17:30:00|2025-11-19 18:30:00|
|          8|0.7038387093813189|     0.99|     0.37|218.18999990820885|2025-11-19 17:30:00|2025-11-19 18:30:00|
|          7|0.6989710611737426|     1.01|     0.41|217.38000002503395|2025-11-19 17:30:00|2025-11-19 18

                                                                                

### Power Consumption Trend - Dashboard

In [15]:
from pyspark.sql.functions import window, col, max, min, avg, sum, count

dashboard = (
    df
    .withWatermark("timestamp", "5 minutes")
    .groupBy(
        window("timestamp", "1 minute"),
        col("meter_id")
    )
    .agg(
        avg("power_kw").alias("avg_power_min"),
        avg("voltage_v").alias("avg_voltage_min"),
        count("*").alias("event_count")
    ).withColumn("window_start", col("window.start")).withColumn("window_end", col("window.end")).drop(col("window"))
)

In [16]:
dashboard.show()

[Stage 11:>                                                         (0 + 1) / 1]

+--------+------------------+------------------+-----------+-------------------+-------------------+
|meter_id|     avg_power_min|   avg_voltage_min|event_count|       window_start|         window_end|
+--------+------------------+------------------+-----------+-------------------+-------------------+
|      48|0.7439999938011169| 227.4200012207031|          5|2025-11-19 18:16:00|2025-11-19 18:17:00|
|      53|0.7033333232005438| 230.1333351135254|         12|2025-11-19 18:17:00|2025-11-19 18:18:00|
|      63|0.6150000095367432| 229.5500030517578|          2|2025-11-19 18:19:00|2025-11-19 18:20:00|
|      49|0.7066666682561239| 230.8999989827474|          3|2025-11-19 18:19:00|2025-11-19 18:20:00|
|       5|  0.69500000278155|230.83333460489908|         12|2025-11-19 18:17:00|2025-11-19 18:18:00|
|       7| 0.760833332935969|232.34166463216147|         12|2025-11-19 18:17:00|2025-11-19 18:18:00|
|      59|0.7833333412806193|230.93333435058594|          3|2025-11-19 18:19:00|2025-11-19 

                                                                                

### Power Consumption Anomaly Detection

In [17]:
from pyspark.sql import functions as F
anomaly = (
    df
    .withWatermark("timestamp", "1 minute")
    .groupBy(
        F.window("timestamp", "30 seconds"),
        F.col("meter_id")
    )
    .agg(
        F.avg("power_kw").alias("avg_power"),
        F.max("power_kw").alias("max_power"),
        F.min("power_kw").alias("min_power"),
        F.expr("bit_or(status)").alias("window_status")  
    ).withColumn("window_start", col("window.start")).withColumn("window_end", col("window.end")).drop(col("window"))
)

from pyspark.sql.functions import getbit, col, lit

anomaly = anomaly.withColumn("rare_failure", getbit(col("window_status"), lit(0)) != 0) \
                 .withColumn("spike_anomaly", getbit(col("window_status"), lit(1)) != 0) \
                 .withColumn("low_consumption", getbit(col("window_status"), lit(2)) != 0) \
                 .withColumn("voltage_anomaly", getbit(col("window_status"), lit(3)) != 0)


In [18]:
anomaly.show()

[Stage 14:>                                                         (0 + 1) / 1]

+--------+------------------+---------+---------+-------------+-------------------+-------------------+------------+-------------+---------------+---------------+
|meter_id|         avg_power|max_power|min_power|window_status|       window_start|         window_end|rare_failure|spike_anomaly|low_consumption|voltage_anomaly|
+--------+------------------+---------+---------+-------------+-------------------+-------------------+------------+-------------+---------------+---------------+
|      95| 0.671999990940094|     0.84|     0.57|            0|2025-11-19 18:16:30|2025-11-19 18:17:00|       false|        false|          false|          false|
|      22|0.6599999964237213|     0.77|     0.51|            0|2025-11-19 18:17:30|2025-11-19 18:18:00|       false|        false|          false|          false|
|      61|0.6883333424727122|     0.85|     0.55|            0|2025-11-19 18:18:00|2025-11-19 18:18:30|       false|        false|          false|          false|
|      20|0.6299999952

                                                                                

In [19]:
anomaly.printSchema()

root
 |-- meter_id: integer (nullable = true)
 |-- avg_power: double (nullable = true)
 |-- max_power: float (nullable = true)
 |-- min_power: float (nullable = true)
 |-- window_status: integer (nullable = true)
 |-- window_start: timestamp (nullable = true)
 |-- window_end: timestamp (nullable = true)
 |-- rare_failure: boolean (nullable = true)
 |-- spike_anomaly: boolean (nullable = true)
 |-- low_consumption: boolean (nullable = true)
 |-- voltage_anomaly: boolean (nullable = true)



### Power Consumption - Prediction

In [20]:
from pyspark.sql.functions import stddev

prediction = (
    df
    .withWatermark("timestamp", "10 minutes")
    .groupBy(
        window("timestamp", "5 minutes"),
        col("meter_id")
    )
    .agg(
        avg("power_kw").alias("power_5m_avg"),
        stddev("power_kw").alias("power_5m_std"),
        avg("voltage_v").alias("voltage_5m_avg"),
        stddev("voltage_v").alias("voltage_5m_std"),
        F.expr("bit_or(status)").alias("window_status")  
    ).withColumn("window_start", col("window.start")).withColumn("window_end", col("window.end")).drop(col("window"))
)

prediction = prediction.withColumn("rare_failure", getbit(col("window_status"), lit(0)) != 0) \
                 .withColumn("spike_anomaly", getbit(col("window_status"), lit(1)) != 0) \
                 .withColumn("low_consumption", getbit(col("window_status"), lit(2)) != 0) \
                 .withColumn("voltage_anomaly", getbit(col("window_status"), lit(3)) != 0)

In [21]:
prediction.show()

[Stage 17:>                                                         (0 + 1) / 1]

+--------+------------------+-------------------+------------------+------------------+-------------+-------------------+-------------------+------------+-------------+---------------+---------------+
|meter_id|      power_5m_avg|       power_5m_std|    voltage_5m_avg|    voltage_5m_std|window_status|       window_start|         window_end|rare_failure|spike_anomaly|low_consumption|voltage_anomaly|
+--------+------------------+-------------------+------------------+------------------+-------------+-------------------+-------------------+------------+-------------+---------------+---------------+
|      52|0.6562499972060323|0.09924716850744376|230.58437538146973| 4.385981267329794|            2|2025-11-19 18:15:00|2025-11-19 18:20:00|       false|         true|          false|          false|
|      58|0.7050000000745058|0.10534122962472695|229.61562538146973| 6.083245392335418|            2|2025-11-19 18:15:00|2025-11-19 18:20:00|       false|         true|          false|          fa

                                                                                

### Count Animalies / Spikes / Failures - General

In [27]:
from pyspark.sql.functions import when

anomaly_count = (
    df
    .withWatermark("timestamp", "10 minutes")
    .groupBy(
        window("timestamp", "5 minutes"),
        col("meter_id")
    )
    .agg(
        F.expr("bit_or(status)").alias("window_status"),
        
    )
)

anomaly_count = anomaly_count.agg(
    sum(when(getbit(col("window_status"), lit(0)) == 1, 1).otherwise(0)).alias("rare_failure_count"),
    sum(when(getbit(col("window_status"), lit(1)) == 1, 1).otherwise(0)).alias("spike_anomaly_count"),
    sum(when(getbit(col("window_status"), lit(2)) == 1, 1).otherwise(0)).alias("low_consumption_count"),
    sum(when(getbit(col("window_status"), lit(3)) == 1, 1).otherwise(0)).alias("voltage_anomaly_count")
)


In [28]:
anomaly_count.show()

[Stage 41:>                                                         (0 + 1) / 1]

+------------------+-------------------+---------------------+---------------------+
|rare_failure_count|spike_anomaly_count|low_consumption_count|voltage_anomaly_count|
+------------------+-------------------+---------------------+---------------------+
|                 3|                 92|                    0|                    0|
+------------------+-------------------+---------------------+---------------------+



                                                                                

### Count Animalies / Spikes / Failures - Per Building / Per Hour

In [26]:
from pyspark.sql.functions import when

anomaly_count_building = (
    df
    .withWatermark("timestamp", "2 hour")
    .groupBy(
        window("timestamp", "1 hour"),
        col("building_id")
    )
    .agg(
        F.expr("bit_or(status)").alias("window_status"),
        
    )
)

anomaly_count_building = anomaly_count_building.groupBy("building_id", "window").agg(
    sum(when(getbit(col("window_status"), lit(0)) == 1, 1).otherwise(0)).alias("rare_failure_count"),
    sum(when(getbit(col("window_status"), lit(1)) == 1, 1).otherwise(0)).alias("spike_anomaly_count"),
    sum(when(getbit(col("window_status"), lit(2)) == 1, 1).otherwise(0)).alias("low_consumption_count"),
    sum(when(getbit(col("window_status"), lit(3)) == 1, 1).otherwise(0)).alias("voltage_anomaly_count")
).withColumn("window_start", col("window.start")).withColumn("window_end", col("window.end")).drop(col("window"))


In [27]:
anomaly_count_building.show()

+-----------+------------------+-------------------+---------------------+---------------------+-------------------+-------------------+
|building_id|rare_failure_count|spike_anomaly_count|low_consumption_count|voltage_anomaly_count|       window_start|         window_end|
+-----------+------------------+-------------------+---------------------+---------------------+-------------------+-------------------+
|          3|                 0|                  1|                    0|                    0|2025-11-19 17:30:00|2025-11-19 18:30:00|
|          4|                 0|                  1|                    0|                    0|2025-11-19 17:30:00|2025-11-19 18:30:00|
|          9|                 0|                  1|                    0|                    0|2025-11-19 17:30:00|2025-11-19 18:30:00|
|          2|                 0|                  1|                    0|                    0|2025-11-19 17:30:00|2025-11-19 18:30:00|
|         10|                 0|         

                                                                                

### Hourly House Voltage Monitoring

In [28]:
hourly_house_voltage = house_agg = df.withWatermark("timestamp", "2 hour").groupBy(
    "meter_id",
    window("timestamp", "1 hour")
).agg(
    avg("voltage_v").alias("avg_voltage"),
    max("voltage_v").alias("max_voltage"),
    min("voltage_v").alias("min_voltage"), 
    sum("voltage_v").alias("sum_voltage")
).withColumn("window_start", col("window.start")).withColumn("window_end", col("window.end")).drop(col("window"))

In [29]:
hourly_house_voltage.show()

[Stage 29:>                                                         (0 + 1) / 1]

+--------+------------------+-----------+-----------+------------------+-------------------+-------------------+
|meter_id|       avg_voltage|max_voltage|min_voltage|       sum_voltage|       window_start|         window_end|
+--------+------------------+-----------+-----------+------------------+-------------------+-------------------+
|      67|229.26129051946825|      238.7|      221.2| 7107.100006103516|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      34|229.35000133514404|      239.2|      217.1| 7339.200042724609|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      74| 230.3774197486139|      239.0|      217.2| 7141.700012207031|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      91|230.37096626527847|      242.0|      215.4| 7141.499954223633|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      35|230.43437576293945|      243.0|      218.5|7373.9000244140625|2025-11-19 17:30:00|2025-11-19 18:30:00|
|      54|230.16874980926514|      239.4|      221.1| 7365.399993896484|2025-11-19 17:30:00|2025

                                                                                