In [1]:
from pyspark.sql import SparkSession

# Build the SparkSession used by every later cell; getOrCreate() hands back
# an already-active session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Week3_Day5_PySpark_SQL_Time_Analytics")
    .getOrCreate()
)

print("✅ Spark Session Created")

✅ Spark Session Created


In [2]:
from pyspark.sql import Row
from pyspark.sql.functions import col, to_timestamp

# Sample orders, one Row per order. order_date starts out as a plain string.
data = [
    Row(order_id=oid, customer_id=cid, order_date=placed_at, amount=amt)
    for oid, cid, placed_at, amt in (
        (1, 101, "2024-01-01 10:15:00", 500),
        (2, 102, "2024-01-01 11:00:00", 700),
        (3, 101, "2024-01-02 09:45:00", 300),
        (4, 103, "2024-01-03 14:20:00", 900),
        (5, 102, "2024-01-03 17:10:00", 400),
    )
]

# Create the DataFrame and convert order_date from string to a timestamp in
# one chain, so the date/time functions in later cells can operate on it.
df = (
    spark.createDataFrame(data)
    .withColumn("order_date", to_timestamp(col("order_date")))
)
df.show(truncate=False)

+--------+-----------+-------------------+------+
|order_id|customer_id|order_date         |amount|
+--------+-----------+-------------------+------+
|1       |101        |2024-01-01 10:15:00|500   |
|2       |102        |2024-01-01 11:00:00|700   |
|3       |101        |2024-01-02 09:45:00|300   |
|4       |103        |2024-01-03 14:20:00|900   |
|5       |102        |2024-01-03 17:10:00|400   |
+--------+-----------+-------------------+------+



In [3]:
# Expose the orders DataFrame to spark.sql() under the view name "orders";
# a view that already has this name is replaced, so the cell can be re-run.
df.createOrReplaceTempView(name="orders")
print("✅ Temporary SQL View Created")

✅ Temporary SQL View Created


In [4]:
# Total order amount per customer, read from the "orders" temp view.
# ORDER BY is required for a reproducible table: GROUP BY alone gives no
# guarantee about the order in which the aggregated rows come back.
spark.sql("""
SELECT customer_id, SUM(amount) AS total_amount
FROM orders
GROUP BY customer_id
ORDER BY customer_id
""").show()

+-----------+------------+
|customer_id|total_amount|
+-----------+------------+
|        101|         800|
|        102|        1100|
|        103|         900|
+-----------+------------+



In [5]:
# Per-calendar-day order count and sales total, oldest day first.
# DATE(order_date) drops the time-of-day part so all orders placed on the
# same day fall into one group.
daily_sales_sql = """
SELECT DATE(order_date) AS order_day,
       COUNT(order_id) AS total_orders,
       SUM(amount) AS daily_sales
FROM orders
GROUP BY DATE(order_date)
ORDER BY order_day
"""
daily_sales = spark.sql(daily_sales_sql)
daily_sales.show()

+----------+------------+-----------+
| order_day|total_orders|daily_sales|
+----------+------------+-----------+
|2024-01-01|           2|       1200|
|2024-01-02|           1|        300|
|2024-01-03|           2|       1300|
+----------+------------+-----------+



In [6]:
from pyspark.sql.functions import dayofweek, month, year, date_format

# Derive calendar attributes from the order timestamp, one new column each.
df_dates = (
    df
    # dayofweek numbers days 1 (Sunday) through 7 (Saturday), which is why
    # Monday 2024-01-01 shows up as 2 in the output.
    .withColumn("day_of_week", dayofweek("order_date"))
    .withColumn("month", month("order_date"))
    .withColumn("year", year("order_date"))
    # Date-only string rendering of the timestamp.
    .withColumn("formatted_date", date_format("order_date", "yyyy-MM-dd"))
)
df_dates.show()

+--------+-----------+-------------------+------+-----------+-----+----+--------------+
|order_id|customer_id|         order_date|amount|day_of_week|month|year|formatted_date|
+--------+-----------+-------------------+------+-----------+-----+----+--------------+
|       1|        101|2024-01-01 10:15:00|   500|          2|    1|2024|    2024-01-01|
|       2|        102|2024-01-01 11:00:00|   700|          2|    1|2024|    2024-01-01|
|       3|        101|2024-01-02 09:45:00|   300|          3|    1|2024|    2024-01-02|
|       4|        103|2024-01-03 14:20:00|   900|          4|    1|2024|    2024-01-03|
|       5|        102|2024-01-03 17:10:00|   400|          4|    1|2024|    2024-01-03|
+--------+-----------+-------------------+------+-----------+-----+----+--------------+



In [7]:
from pyspark.sql.functions import window

# Bucket orders into consecutive 1-day windows on order_date and total the
# amount in each bucket. The result has a struct column `window` (with
# `start` and `end` fields) and a `sum(amount)` column.
# NOTE(review): the midnight-aligned boundaries in the output presumably
# reflect a UTC session time zone; confirm before relying on day boundaries
# under a different spark.sql.session.timeZone.
df_window = df.groupBy(
    window("order_date", "1 day")
).sum("amount")

# A grouped aggregation is returned in no guaranteed order, so sort by the
# window start to keep the displayed time series chronological on every run.
(
    df_window
    .select("window.start", "window.end", "sum(amount)")
    .orderBy("start")
    .show(truncate=False)
)

+-------------------+-------------------+-----------+
|start              |end                |sum(amount)|
+-------------------+-------------------+-----------+
|2024-01-01 00:00:00|2024-01-02 00:00:00|1200       |
|2024-01-02 00:00:00|2024-01-03 00:00:00|300        |
|2024-01-03 00:00:00|2024-01-04 00:00:00|1300       |
+-------------------+-------------------+-----------+

