### Setup

Make sure you have the files available from previous demos.

In [0]:
# This cell sets all the configuration parameters to connect to Azure Data Lake
# (ADLS Gen2) using OAuth client credentials for a service principal.

# The storage account must match the account in the abfss:// URIs read by this
# notebook; every config key below is scoped to this account host.
storage_account = "warnerdatalake"
account_host = f"{storage_account}.dfs.core.windows.net"

# Fetch the client secret from a Databricks secret scope instead of pasting it
# into the notebook. Replace the placeholders with your own scope and key names.
client_secret = dbutils.secrets.get(scope="<secret_scope>", key="<client_secret_key>")

spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(f"fs.azure.account.oauth.provider.type.{account_host}", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider")
# Application (client) ID of the service principal -- masked placeholder, fill in your own.
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", "****************************")
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
# The masked segment of the token endpoint is a placeholder -- presumably the
# directory (tenant) ID; fill in your own.
spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", "https://login.microsoftonline.com/************************/oauth2/token")

Verify that cloud storage is accessible

In [0]:
# Listing the container root is a quick check that the configured credentials work;
# the call raises if the storage account cannot be reached or access is denied.
container_root = "abfss://pyspark@warnerdatalake.dfs.core.windows.net/"
dbutils.fs.ls(container_root)

Let's load the transaction data

In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Location of the transactions Parquet data in the data lake.
# NOTE(review): the path contains doubled slashes; presumably the ABFS driver
# tolerates them -- confirm before normalizing.
parquet_path = "abfss://pyspark@warnerdatalake.dfs.core.windows.net//imports//transactions_data.parquet"

# Read the Parquet data into a DataFrame; later cells build on df_transactions
df_transactions = spark.read.format("parquet").load(parquet_path)

# Preview five rows to confirm the columns loaded as expected
transactions_preview = df_transactions.limit(5)
transactions_preview.display()


transaction_id,customer_id,transaction_date,amount,category
1,3065,2025-03-17,76.1,Clothes
2,3274,2025-02-18,91.91,Clothes
3,130,2025-01-10,11.81,Accessories
4,320,2025-03-06,20.37,Furniture
5,6480,2025-03-22,12.31,Beauty


Let's do a ranking operation

In [0]:
# Ranking transactions within each customer

# Window for rank() / dense_rank(): ordered by date only, so transactions that
# share a date tie and receive the same rank. Later cells also use window_spec.
window_spec = Window.partitionBy("customer_id").orderBy(F.col("transaction_date").asc())

# Window for row_number(): several transactions can share a date, and with a
# date-only ordering the numbering among those ties is arbitrary and can change
# between runs. transaction_id is added as a tiebreaker to make it reproducible
# (assumes transaction_id is unique per transaction -- confirm).
row_number_window = Window.partitionBy("customer_id").orderBy(
    F.col("transaction_date").asc(),
    F.col("transaction_id").asc(),
)

df_ranked = (
    df_transactions
        # Unique, gap-free sequence number per customer
        .withColumn("row_number", F.row_number().over(row_number_window))
        # Ties share a rank; the following rank skips ahead (1, 1, 3, ...)
        .withColumn("rank", F.rank().over(window_spec))
        # Ties share a rank; no gaps afterwards (1, 1, 2, ...)
        .withColumn("dense_rank", F.dense_rank().over(window_spec))
)

df_ranked.limit(5).display()


transaction_id,customer_id,transaction_date,amount,category,row_number,rank,dense_rank
587479,1,2025-01-01,67.07,Accessories,1,1,1
619033,1,2025-01-01,9.05,Toys,2,1,1
766739,1,2025-01-02,76.63,Sports,3,3,2
910648,1,2025-01-02,3.41,Books,4,3,2
836549,1,2025-01-03,77.07,Toys,5,5,3


Let's calculate a rolling average over each customer's last seven transactions

In [0]:
# Moving average of transaction amount

# Number of transactions (rows) in each rolling frame, including the current one
ROLLING_WINDOW_ROWS = 7

# Rolling window per customer over the current transaction and the six before it.
# rowsBetween counts rows, not calendar days, so this is a 7-transaction window
# rather than a 7-day window. transaction_id breaks ties between transactions on
# the same date so the frame contents are reproducible
# (assumes transaction_id is unique per transaction -- confirm).
rolling_window = (
    Window.partitionBy("customer_id")
        .orderBy(F.col("transaction_date"), F.col("transaction_id"))
        .rowsBetween(-(ROLLING_WINDOW_ROWS - 1), Window.currentRow)
)

df_rolling_avg = (
    df_transactions
        # Fewer than seven rows are averaged at the start of each customer's history
        .withColumn("rolling_avg_amount", F.avg("amount").over(rolling_window))
)

df_rolling_avg.limit(5).display()


transaction_id,customer_id,transaction_date,amount,category,rolling_avg_amount
587479,1,2025-01-01,67.07,Accessories,67.07
619033,1,2025-01-01,9.05,Toys,38.06
766739,1,2025-01-02,76.63,Sports,50.916667
910648,1,2025-01-02,3.41,Books,39.04
836549,1,2025-01-03,77.07,Toys,46.646


Let's calculate a running total

In [0]:
# Running total of transaction amounts per customer

# Window for a cumulative sum per customer: every row from the start of the
# partition up to and including the current row. The frame is row-based, so
# transactions on the same date need a stable order; transaction_id is used as
# a tiebreaker to keep each row's running total reproducible
# (assumes transaction_id is unique per transaction -- confirm).
cumulative_window = (
    Window.partitionBy("customer_id")
        .orderBy(F.col("transaction_date"), F.col("transaction_id"))
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

df_running_total = (
    df_transactions
        .withColumn("cumulative_total", F.sum("amount").over(cumulative_window))
)

df_running_total.limit(5).display()


transaction_id,customer_id,transaction_date,amount,category,cumulative_total
587479,1,2025-01-01,67.07,Accessories,67.07
619033,1,2025-01-01,9.05,Toys,76.12
766739,1,2025-01-02,76.63,Sports,152.75
910648,1,2025-01-02,3.41,Books,156.16
836549,1,2025-01-03,77.07,Toys,233.23


Let's use lag and lead to look at each customer's previous and next transaction

In [0]:
# Compare each transaction to the previous one (lag) and the next one (lead)

# lag/lead depend on the exact row order. Ordering by date alone leaves
# same-day transactions in an arbitrary order, so "previous" and "next" could
# change between runs. transaction_id is added as a tiebreaker
# (assumes transaction_id is unique per transaction -- confirm).
lag_lead_window = Window.partitionBy("customer_id").orderBy(
    F.col("transaction_date").asc(),
    F.col("transaction_id").asc(),
)

df_lag_lead = (
    df_transactions
        # Null for each customer's first transaction
        .withColumn("previous_transaction_amount", F.lag("amount", 1).over(lag_lead_window))
        # Null for each customer's last transaction
        .withColumn("next_transaction_amount", F.lead("amount", 1).over(lag_lead_window))
)

df_lag_lead.limit(5).display()


transaction_id,customer_id,transaction_date,amount,category,previous_transaction_amount,next_transaction_amount
587479,1,2025-01-01,67.07,Accessories,,9.05
619033,1,2025-01-01,9.05,Toys,67.07,76.63
766739,1,2025-01-02,76.63,Sports,9.05,3.41
910648,1,2025-01-02,3.41,Books,76.63,77.07
836549,1,2025-01-03,77.07,Toys,3.41,93.73


And we can combine these approaches as well

In [0]:
# Combining Multiple Window Functions for Trend Analysis

# One deterministic ordering shared by every column in this cell: date first,
# then transaction_id to break ties between same-day transactions
# (assumes transaction_id is unique per transaction -- confirm).
# Defining the windows here keeps the cell independent of windows built in
# earlier cells, and ensures all columns agree on the sequence of rows.
trend_order_window = Window.partitionBy("customer_id").orderBy(
    F.col("transaction_date").asc(),
    F.col("transaction_id").asc(),
)
# Current transaction plus the six preceding ones (a 7-row frame, not 7 days)
trend_rolling_window = trend_order_window.rowsBetween(-6, Window.currentRow)
# Everything from the customer's first transaction up to the current one
trend_cumulative_window = trend_order_window.rowsBetween(Window.unboundedPreceding, Window.currentRow)

df_trend_analysis = (
    df_transactions
        .withColumn("row_number", F.row_number().over(trend_order_window))
        .withColumn("rolling_avg_amount", F.avg("amount").over(trend_rolling_window))
        .withColumn("cumulative_total", F.sum("amount").over(trend_cumulative_window))
        .withColumn("previous_transaction_amount", F.lag("amount", 1).over(trend_order_window))
        .withColumn("next_transaction_amount", F.lead("amount", 1).over(trend_order_window))
)

df_trend_analysis.limit(5).display()


transaction_id,customer_id,transaction_date,amount,category,row_number,rolling_avg_amount,cumulative_total,previous_transaction_amount,next_transaction_amount
587479,1,2025-01-01,67.07,Accessories,1,67.07,67.07,,9.05
619033,1,2025-01-01,9.05,Toys,2,38.06,76.12,67.07,76.63
766739,1,2025-01-02,76.63,Sports,3,50.916667,152.75,9.05,3.41
910648,1,2025-01-02,3.41,Books,4,39.04,156.16,76.63,77.07
836549,1,2025-01-03,77.07,Toys,5,46.646,233.23,3.41,93.73
