In [1]:
#1. Load a large dataset of sensor logs using PySpark
from pyspark.sql import SparkSession

# Initialize Spark session
spark = SparkSession.builder.appName("DeviceAggregation").getOrCreate()

# Load cleaned sensor logs CSV
df = spark.read.option("header", "true").csv("cleaned_energy_logs.csv", inferSchema=True)

# Display sample data
df.show(5)
df.printSchema()


+---------+-------------------+----------+-------+-------+-------+
|device_id|          timestamp|energy_kwh|voltage|current|room_id|
+---------+-------------------+----------+-------+-------+-------+
|        4|2025-05-30 10:12:00|      3.24|  218.5|    4.8|      1|
|        7|2025-05-30 11:15:00|      0.75|  221.3|    2.1|      2|
|        2|2025-05-30 12:20:00|      1.12|  219.7|    3.3|      3|
|        5|2025-05-30 13:25:00|      2.54|  217.9|    5.1|      4|
|        1|2025-05-30 14:30:00|       4.1|  222.0|    7.0|      5|
+---------+-------------------+----------+-------+-------+-------+
only showing top 5 rows

root
 |-- device_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- energy_kwh: double (nullable = true)
 |-- voltage: double (nullable = true)
 |-- current: double (nullable = true)
 |-- room_id: integer (nullable = true)



In [2]:
#2.Group by device and calculate peak vs off-peak usage
from pyspark.sql.functions import hour, when, to_timestamp, sum

# Convert string to timestamp and extract hour
df = df.withColumn("timestamp", to_timestamp("timestamp"))
df = df.withColumn("hour", hour("timestamp"))

# Classify as Peak (6 PM to 6 AM) or Off-Peak (6 AM to 6 PM)
df = df.withColumn("usage_period", when((df["hour"] >= 18) | (df["hour"] < 6), "Peak").otherwise("Off-Peak"))

# Group by device and period, aggregate total energy
peak_offpeak_usage = df.groupBy("device_id", "usage_period").agg(
    sum("energy_kwh").alias("total_energy_kwh")
)

peak_offpeak_usage.show()


+---------+------------+----------------+
|device_id|usage_period|total_energy_kwh|
+---------+------------+----------------+
|        7|        Peak|            3.33|
|        6|    Off-Peak|            3.95|
|        4|        Peak|             2.2|
|        2|    Off-Peak|            1.12|
|        5|    Off-Peak|            2.54|
|        4|    Off-Peak|            3.24|
|        8|    Off-Peak|            0.55|
|        3|    Off-Peak|            1.75|
|        1|    Off-Peak|             4.1|
|        7|    Off-Peak|            0.75|
+---------+------------+----------------+



In [3]:
#3.Identify top energy-consuming devices
# Total usage per device
top_devices = df.groupBy("device_id").agg(
    sum("energy_kwh").alias("total_usage_kwh")
).orderBy("total_usage_kwh", ascending=False)

top_devices.show(10)


+---------+---------------+
|device_id|total_usage_kwh|
+---------+---------------+
|        4|           5.44|
|        1|            4.1|
|        7|           4.08|
|        6|           3.95|
|        5|           2.54|
|        3|           1.75|
|        2|           1.12|
|        8|           0.55|
+---------+---------------+



In [4]:
#Save the output
# Save to output folder
peak_offpeak_usage.write.option("header", "true").mode("overwrite").csv("output/peak_offpeak_usage")
top_devices.write.option("header", "true").mode("overwrite").csv("output/top_energy_devices")
