In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that is already
# running (getOrCreate returns the existing session if there is one).
spark = (
    SparkSession.builder
    .appName("Week3_Day4_PySpark_Window_Functions")
    .getOrCreate()
)

print("✅ Spark Session Created")

✅ Spark Session Created


In [2]:
from pyspark.sql import Row

# Sample orders as (customer_id, order_date, amount) tuples.
# order_date is kept as an ISO-formatted string, so sorting it as text
# matches chronological order for these values.
order_records = [
    (1, "2024-01-01", 500),
    (1, "2024-01-05", 300),
    (1, "2024-01-10", 400),
    (2, "2024-01-02", 1000),
    (2, "2024-01-03", 800),
    (3, "2024-01-05", 600),
]

# Wrap each tuple in a Row so Spark picks up the column names from the fields.
data = [
    Row(customer_id=cust, order_date=day, amount=amt)
    for cust, day, amt in order_records
]

df = spark.createDataFrame(data)
df.show()

+-----------+----------+------+
|customer_id|order_date|amount|
+-----------+----------+------+
|          1|2024-01-01|   500|
|          1|2024-01-05|   300|
|          1|2024-01-10|   400|
|          2|2024-01-02|  1000|
|          2|2024-01-03|   800|
|          3|2024-01-05|   600|
+-----------+----------+------+



In [3]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, avg, row_number, rank, dense_rank, lag, lead

In [4]:
# One window per customer, with rows ordered by order date.
# No explicit frame is given, so aggregates evaluated over this window use
# Spark's default for an ordered window (partition start up to the current
# row) and therefore behave cumulatively.
# NOTE(review): the default frame is RANGE-based, so rows that share the same
# order_date within a customer would receive the same aggregate value; the
# sample data has no such ties.
windowSpec = (
    Window
    .partitionBy("customer_id")
    .orderBy("order_date")
)

In [5]:
# Append a per-customer cumulative sum of `amount`, accumulated in
# order_date order as defined by windowSpec.
# `sum` here is the pyspark.sql.functions aggregate imported above, not the
# Python builtin.
df_running = df.select(
    "*",
    sum("amount").over(windowSpec).alias("running_total"),
)
df_running.show()

+-----------+----------+------+-------------+
|customer_id|order_date|amount|running_total|
+-----------+----------+------+-------------+
|          1|2024-01-01|   500|          500|
|          1|2024-01-05|   300|          800|
|          1|2024-01-10|   400|         1200|
|          2|2024-01-02|  1000|         1000|
|          2|2024-01-03|   800|         1800|
|          3|2024-01-05|   600|          600|
+-----------+----------+------+-------------+



In [6]:
# Append the average of `amount` over the same ordered window.
# Because windowSpec is ordered, this is a cumulative (running) average per
# customer: each row averages that customer's orders up to and including the
# current one, not the customer's overall average.
df_avg = df_running.select(
    "*",
    avg("amount").over(windowSpec).alias("average_amount"),
)
df_avg.show()

+-----------+----------+------+-------------+--------------+
|customer_id|order_date|amount|running_total|average_amount|
+-----------+----------+------+-------------+--------------+
|          1|2024-01-01|   500|          500|         500.0|
|          1|2024-01-05|   300|          800|         400.0|
|          1|2024-01-10|   400|         1200|         400.0|
|          2|2024-01-02|  1000|         1000|        1000.0|
|          2|2024-01-03|   800|         1800|         900.0|
|          3|2024-01-05|   600|          600|         600.0|
+-----------+----------+------+-------------+--------------+



In [7]:
# Append three ranking columns, all computed per customer in order_date order.
# With no duplicate order dates inside a customer in the sample data, the
# three columns come out identical; they diverge only when the ordering
# column has ties.
df_ranked = df_avg.select(
    "*",
    row_number().over(windowSpec).alias("row_number"),
    rank().over(windowSpec).alias("rank"),
    dense_rank().over(windowSpec).alias("dense_rank"),
)
df_ranked.show()

+-----------+----------+------+-------------+--------------+----------+----+----------+
|customer_id|order_date|amount|running_total|average_amount|row_number|rank|dense_rank|
+-----------+----------+------+-------------+--------------+----------+----+----------+
|          1|2024-01-01|   500|          500|         500.0|         1|   1|         1|
|          1|2024-01-05|   300|          800|         400.0|         2|   2|         2|
|          1|2024-01-10|   400|         1200|         400.0|         3|   3|         3|
|          2|2024-01-02|  1000|         1000|        1000.0|         1|   1|         1|
|          2|2024-01-03|   800|         1800|         900.0|         2|   2|         2|
|          3|2024-01-05|   600|          600|         600.0|         1|   1|         1|
+-----------+----------+------+-------------+--------------+----------+----+----------+



In [8]:
# Append the amount of the neighbouring orders within each customer:
#   prev_order - amount of the order one row earlier (by order_date)
#   next_order - amount of the order one row later
# The first/last order of a customer has no neighbour on that side, so the
# corresponding column is NULL there.
df_final = df_ranked.select(
    "*",
    lag("amount", 1).over(windowSpec).alias("prev_order"),
    lead("amount", 1).over(windowSpec).alias("next_order"),
)
df_final.show()

+-----------+----------+------+-------------+--------------+----------+----+----------+----------+----------+
|customer_id|order_date|amount|running_total|average_amount|row_number|rank|dense_rank|prev_order|next_order|
+-----------+----------+------+-------------+--------------+----------+----+----------+----------+----------+
|          1|2024-01-01|   500|          500|         500.0|         1|   1|         1|      NULL|       300|
|          1|2024-01-05|   300|          800|         400.0|         2|   2|         2|       500|       400|
|          1|2024-01-10|   400|         1200|         400.0|         3|   3|         3|       300|      NULL|
|          2|2024-01-02|  1000|         1000|        1000.0|         1|   1|         1|      NULL|       800|
|          2|2024-01-03|   800|         1800|         900.0|         2|   2|         2|      1000|      NULL|
|          3|2024-01-05|   600|          600|         600.0|         1|   1|         1|      NULL|      NULL|
+-----------+----------+------+-------------+--------------+----------+----+----------+----------+----------+

In [9]:
# Persist the fully enriched frame as Parquet. "overwrite" replaces any
# existing output at this path, so re-running the notebook does not fail on
# a pre-existing directory.
df_final.write.parquet(
    "output/week3_day4_window_functions",
    mode="overwrite",
)
print("✅ Results saved to output/week3_day4_window_functions")

✅ Results saved to output/week3_day4_window_functions
