In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every later cell in this notebook.
# getOrCreate() hands back an already-running session if there is one.
spark = (
    SparkSession.builder
    .appName("Week2_Day6_WindowFunctions")
    .getOrCreate()
)

# Visible confirmation that the session is ready.
print("✅ Spark Session Created")

✅ Spark Session Created


In [3]:
from pyspark.sql.functions import col, sum, row_number, rank
from pyspark.sql.window import Window

# Load the sales data: the first CSV row supplies the column names and
# Spark infers each column's type from the file contents.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("../data/sales.csv")
)

# Preview the first five rows.
df.show(5)

+--------+-----------+----------+--------+--------+----------+
|order_id|customer_id|order_date| product|quantity|unit_price|
+--------+-----------+----------+--------+--------+----------+
|       1|       1001|2024-01-01|Widget A|       2|      9.99|
|       2|       1002|2024-01-03|Widget B|       1|     19.99|
|       3|       1001|2024-01-07|Widget C|       5|       4.5|
|       4|       1003|2024-02-10|Widget A|       3|      9.99|
|       5|       1004|2024-02-15|Widget B|       2|     19.99|
+--------+-----------+----------+--------+--------+----------+
only showing top 5 rows



In [4]:
# Window over each customer's orders, in chronological order.
# Used unchanged for rank(): orders of one customer that share an order_date
# are peers under this ordering and therefore receive the same rank.
window_spec = Window.partitionBy("customer_id").orderBy("order_date")

# row_number() has to hand out distinct consecutive numbers, so if a customer
# has two orders on the same order_date the numbering under window_spec is
# arbitrary and can differ between runs. Adding order_id as a secondary sort
# key makes the numbering deterministic.
# NOTE(review): assumes order_id is unique per order - confirm against the data.
window_row_num = Window.partitionBy("customer_id").orderBy("order_date", "order_id")

# Row number: 1, 2, 3, ... position of each order within its customer's history.
df.withColumn("row_num", row_number().over(window_row_num)) \
  .show()

# Rank: same idea, but orders with equal order_date share a rank (with a gap after).
df.withColumn("rank", rank().over(window_spec)) \
  .show()

+--------+-----------+----------+--------+--------+----------+-------+
|order_id|customer_id|order_date| product|quantity|unit_price|row_num|
+--------+-----------+----------+--------+--------+----------+-------+
|       1|       1001|2024-01-01|Widget A|       2|      9.99|      1|
|       3|       1001|2024-01-07|Widget C|       5|       4.5|      2|
|       8|       1001|2024-03-20|Widget B|       4|     19.99|      3|
|       2|       1002|2024-01-03|Widget B|       1|     19.99|      1|
|       6|       1002|2024-03-01|Widget D|       1|     29.99|      2|
|       4|       1003|2024-02-10|Widget A|       3|      9.99|      1|
|       5|       1004|2024-02-15|Widget B|       2|     19.99|      1|
|       7|       1005|2024-03-05|Widget A|      10|      9.99|      1|
|       9|       1006|2024-04-02|Widget C|       2|       4.5|      1|
|      10|       1007|2024-04-15|Widget D|       3|     29.99|      1|
+--------+-----------+----------+--------+--------+----------+-------+

+----

In [5]:
from pyspark.sql.functions import sum as spark_sum

# Running-total frame: per customer, every row from the first order up to and
# including the current one.
# The frame is ROWS-based, so the physical row order matters. Ordering only by
# order_date would leave same-day orders of one customer in arbitrary order,
# making the intermediate cumulative values vary between runs; order_id is
# added as a tie-breaker to keep the running total deterministic.
# NOTE(review): assumes order_id is unique per order - confirm against the data.
window_cust = (
    Window.partitionBy("customer_id")
    .orderBy("order_date", "order_id")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# revenue = quantity * unit_price per order line; cumulative_revenue is the
# customer's running sum of revenue over the frame above.
df.withColumn("revenue", col("quantity") * col("unit_price")) \
  .withColumn("cumulative_revenue", spark_sum("revenue").over(window_cust)) \
  .show()

+--------+-----------+----------+--------+--------+----------+-------+------------------+
|order_id|customer_id|order_date| product|quantity|unit_price|revenue|cumulative_revenue|
+--------+-----------+----------+--------+--------+----------+-------+------------------+
|       1|       1001|2024-01-01|Widget A|       2|      9.99|  19.98|             19.98|
|       3|       1001|2024-01-07|Widget C|       5|       4.5|   22.5|42.480000000000004|
|       8|       1001|2024-03-20|Widget B|       4|     19.99|  79.96|            122.44|
|       2|       1002|2024-01-03|Widget B|       1|     19.99|  19.99|             19.99|
|       6|       1002|2024-03-01|Widget D|       1|     29.99|  29.99|             49.98|
|       4|       1003|2024-02-10|Widget A|       3|      9.99|  29.97|             29.97|
|       5|       1004|2024-02-15|Widget B|       2|     19.99|  39.98|             39.98|
|       7|       1005|2024-03-05|Widget A|      10|      9.99|   99.9|              99.9|
|       9|

In [6]:
from pyspark.sql.functions import date_format, avg

# Tag every order with its calendar month ("yyyy-MM") and its line revenue.
df_with_month = df.withColumn("order_month", date_format(col("order_date"), "yyyy-MM")) \
                  .withColumn("revenue", col("quantity") * col("unit_price"))

# Numeric month index (year * 12 + month) parsed from the "yyyy-MM" string, so
# that consecutive calendar months differ by exactly 1.
month_index = (
    col("order_month").substr(1, 4).cast("int") * 12
    + col("order_month").substr(6, 2).cast("int")
)

# Current month plus the two calendar months before it.
# A RANGE frame over the month index is used instead of rowsBetween(-2, 0):
# a ROWS frame takes the previous two *rows*, which silently reaches further
# back in time whenever a month has no orders (and hence no row).
# Months without orders are simply absent from the average; they are not
# counted as zero revenue.
# No partitionBy: Spark processes this window in a single partition, which is
# acceptable here because it runs on one aggregated row per month.
window_month = Window.orderBy(month_index).rangeBetween(-2, 0)

# Total revenue per month, then the moving average of those monthly totals.
df_with_month.groupBy("order_month") \
    .agg(spark_sum("revenue").alias("monthly_revenue")) \
    .withColumn("moving_avg_revenue", avg("monthly_revenue").over(window_month)) \
    .show()

+-----------+------------------+------------------+
|order_month|   monthly_revenue|moving_avg_revenue|
+-----------+------------------+------------------+
|    2024-01|             62.47|             62.47|
|    2024-02| 69.94999999999999|             66.21|
|    2024-03|209.85000000000002|114.08999999999999|
|    2024-04|             98.97|126.25666666666666|
+-----------+------------------+------------------+

