In [0]:
# Evaluate the `spark` handle on its own so the notebook echoes it; the cells
# below use this object for reading the CSV (`spark.read`) and for SQL (`spark.sql`).
spark

In [0]:
#1. Load the cleaned energy logs CSV from the shared workspace path

# Treat the first row as column names and let Spark infer each column's type
# from the data instead of reading everything as strings.
csv_reader = spark.read.options(header="true", inferSchema=True)
df = csv_reader.csv("file:/Workspace/Shared/cleaned_energy_logs.csv")

# Quick sanity check: preview a few rows and confirm the inferred schema.
df.show(5)
df.printSchema()


+---------+-------------------+----------+-------+-------+-------+
|device_id|          timestamp|energy_kwh|voltage|current|room_id|
+---------+-------------------+----------+-------+-------+-------+
|        4|2025-05-30 10:12:00|      3.24|  218.5|    4.8|      1|
|        7|2025-05-30 11:15:00|      0.75|  221.3|    2.1|      2|
|        2|2025-05-30 12:20:00|      1.12|  219.7|    3.3|      3|
|        5|2025-05-30 13:25:00|      2.54|  217.9|    5.1|      4|
|        1|2025-05-30 14:30:00|       4.1|  222.0|    7.0|      5|
+---------+-------------------+----------+-------+-------+-------+
only showing top 5 rows

root
 |-- device_id: integer (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- energy_kwh: double (nullable = true)
 |-- voltage: double (nullable = true)
 |-- current: double (nullable = true)
 |-- room_id: integer (nullable = true)



In [0]:
#2. Build ETL pipeline to calculate daily/weekly summaries

# Import the functions module under an alias instead of pulling `sum` and
# `round` into the notebook namespace, where they would shadow Python's
# builtins of the same name for every later cell.
from pyspark.sql import functions as F

# Timestamp conversion and derived grouping columns.
# `df` is reassigned (not renamed) because the SQL cell further down registers
# `df` as a view and selects the `date` column added here.
df = (
    df.withColumn("timestamp", F.to_timestamp("timestamp"))
      .withColumn("date", F.to_date("timestamp"))
      .withColumn("week", F.weekofyear("timestamp"))
)

# Daily summary per room: total energy per (date, room), rounded to 2 decimals.
daily_summary = df.groupBy("date", "room_id").agg(
    F.round(F.sum("energy_kwh"), 2).alias("daily_energy_kwh")
)

# Weekly summary per room: total energy per (week number, room), rounded to 2 decimals.
# NOTE(review): the week number carries no year, so if the logs ever span more
# than one year, the same week number from different years would be summed
# together. Adding a year/week-start key would change this table's schema, so
# it is left as-is here — confirm the data range before relying on it.
weekly_summary = df.groupBy("week", "room_id").agg(
    F.round(F.sum("energy_kwh"), 2).alias("weekly_energy_kwh")
)

daily_summary.show()
weekly_summary.show()


+----------+-------+----------------+
|      date|room_id|daily_energy_kwh|
+----------+-------+----------------+
|2025-05-30|      2|             2.5|
|2025-05-30|      1|            7.19|
|2025-05-30|      4|            4.74|
|2025-05-30|      3|            1.67|
|2025-05-30|      5|            7.43|
+----------+-------+----------------+

+----+-------+-----------------+
|week|room_id|weekly_energy_kwh|
+----+-------+-----------------+
|  22|      5|             7.43|
|  22|      2|              2.5|
|  22|      4|             4.74|
|  22|      1|             7.19|
|  22|      3|             1.67|
+----+-------+-----------------+



In [0]:
#3. Save final results in Delta & CSV format

# Delta copies: each summary frame paired with its destination, written in order.
delta_targets = [
    (daily_summary, "file:/Workspace/Shared/delta/daily_summary"),
    (weekly_summary, "file:/Workspace/Shared/delta/weekly_summary"),
]
for summary_df, target_path in delta_targets:
    # "overwrite" replaces whatever a previous run left at the destination.
    summary_df.write.mode("overwrite").format("delta").save(target_path)

# CSV copies of the same two summaries, with a header row.
csv_targets = [
    (daily_summary, "file:/Workspace/Shared/output/daily_summary_csv"),
    (weekly_summary, "file:/Workspace/Shared/output/weekly_summary_csv"),
]
for summary_df, target_path in csv_targets:
    summary_df.write.mode("overwrite").option("header", "true").csv(target_path)


In [0]:
#4. Optional – SQL query for devices exceeding 10 kWh/day
# Register the logs (including the derived `date` column) as a temp view so
# they can be queried with SQL.
df.createOrReplaceTempView("energy_logs")

# SQL: devices using >10 kWh in a day.
# The HAVING clause filters on the unrounded total: comparing the rounded
# `daily_kwh` alias would drop a device whose true daily total is, e.g.,
# 10.004 kWh, because it rounds to 10.00 and is then not "> 10".
spark.sql("""
SELECT device_id, date, ROUND(SUM(energy_kwh), 2) AS daily_kwh
FROM energy_logs
GROUP BY device_id, date
HAVING SUM(energy_kwh) > 10
""").show()


+---------+----+---------+
|device_id|date|daily_kwh|
+---------+----+---------+
+---------+----+---------+

