In [1]:
import os
os.getcwd()
os.chdir("H:\pyspark_advanced-coding_interview")
print(os.getcwd())


from pyspark.sql import SparkSession

# Create a Spark session with optimized settings
spark = (
    SparkSession.builder 
    .appName("OptimizedLocalSpark") 
    .config("spark.driver.memory", "8g")        
    .config("spark.executor.memory", "8g")    
    .config("spark.executor.cores", "4")       
    .config("spark.cores.max", "12")           
    .config("spark.sql.shuffle.partitions", "28")  
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") 
    .getOrCreate()
)
sc = spark.sparkContext

H:\pyspark_advanced-coding_interview


# How to calculate rolling / moving average ? | Ex - 3 day rolling average

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Initialize Spark session
spark = SparkSession.builder.appName("RollingAverageExample").getOrCreate()

# Sample data
data = [
    (1, '2024-10-01', 100),
    (2, '2024-10-02', 200),
    (3, '2024-10-03', 300),
    (4, '2024-10-04', 400),
    (5, '2024-10-05', 500),
    (6, '2024-10-06', 600)
]

columns = ["id", "date", "value"]
df = spark.createDataFrame(data, columns)

# Create a temporary table
df.createOrReplaceTempView("temp_table")
df.show()


+---+----------+-----+
| id|      date|value|
+---+----------+-----+
|  1|2024-10-01|  100|
|  2|2024-10-02|  200|
|  3|2024-10-03|  300|
|  4|2024-10-04|  400|
|  5|2024-10-05|  500|
|  6|2024-10-06|  600|
+---+----------+-----+



In [3]:
res = spark.sql("""  
    SELECT 
    id,
    date,
    value,
    AVG(value) OVER (ORDER BY date  ROWS BETWEEN 2 PRECEDING AND CURRENT ROW
    ) AS rolling_avg_3day
FROM temp_table
ORDER BY date
            
                """)

res.show()

+---+----------+-----+----------------+
| id|      date|value|rolling_avg_3day|
+---+----------+-----+----------------+
|  1|2024-10-01|  100|           100.0|
|  2|2024-10-02|  200|           150.0|
|  3|2024-10-03|  300|           200.0|
|  4|2024-10-04|  400|           300.0|
|  5|2024-10-05|  500|           400.0|
|  6|2024-10-06|  600|           500.0|
+---+----------+-----+----------------+



In [5]:
result_sql_alternative = spark.sql("""
    SELECT 
        id,
        date,
        value,
        (value + COALESCE(LAG(value, 1) OVER (ORDER BY date), 0) + COALESCE(LAG(value, 2) OVER (ORDER BY date), 0)) / 3 AS rolling_avg_3day
    FROM temp_table
    ORDER BY date
""")

result_sql_alternative.show()


+---+----------+-----+------------------+
| id|      date|value|  rolling_avg_3day|
+---+----------+-----+------------------+
|  1|2024-10-01|  100|33.333333333333336|
|  2|2024-10-02|  200|             100.0|
|  3|2024-10-03|  300|             200.0|
|  4|2024-10-04|  400|             300.0|
|  5|2024-10-05|  500|             400.0|
|  6|2024-10-06|  600|             500.0|
+---+----------+-----+------------------+



In [4]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Define the window
window_spec = Window.orderBy("date").rowsBetween(-2, 0)

# Calculate rolling average
df_with_rolling_avg = df.withColumn("rolling_avg_3day", F.avg("value").over(window_spec))
df_with_rolling_avg.show()


+---+----------+-----+----------------+
| id|      date|value|rolling_avg_3day|
+---+----------+-----+----------------+
|  1|2024-10-01|  100|           100.0|
|  2|2024-10-02|  200|           150.0|
|  3|2024-10-03|  300|           200.0|
|  4|2024-10-04|  400|           300.0|
|  5|2024-10-05|  500|           400.0|
|  6|2024-10-06|  600|           500.0|
+---+----------+-----+----------------+

