### Setup

Make sure the files from the previous demos (in particular `transactions_data.parquet`) are available in the storage container before running this notebook.

In [0]:
# Configure OAuth (client-credentials) access to Azure Data Lake Storage.
# Every setting key is suffixed with the storage account's DFS host name.
# NOTE(review): the client id, client secret and tenant id are masked here;
# presumably real values should come from a secret store rather than being
# pasted into the notebook - confirm how this demo is meant to be run.
storage_host = "<account_name>.dfs.core.windows.net"  # <account_name> is a placeholder

oauth_settings = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": "****************************",
    "fs.azure.account.oauth2.client.secret": "*******************************",
    "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/************************/oauth2/token",
}

# Apply the settings in the order listed above.
for setting_prefix, setting_value in oauth_settings.items():
    spark.conf.set(f"{setting_prefix}.{storage_host}", setting_value)

Verify that cloud storage is accessible

In [0]:
# Smoke test: list the root of the `etl1` container to confirm it can be read.
container_root = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/"
root_listing = dbutils.fs.ls(container_root)
display(root_listing)

Let's load the transaction data

In [0]:
from pyspark.sql import Window
from pyspark.sql import functions as F

# Location of the transactions dataset (Parquet) in the storage container
parquet_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/transactions_data.parquet"

# Read the Parquet data into a Spark DataFrame
df_transactions = spark.read.format("parquet").load(parquet_path)

# Preview a handful of rows to sanity-check the load
df_transactions.limit(5).display()


Let's do a ranking operation

In [0]:
# Ranking transactions within each customer using window functions.
# Three ranking functions are computed side by side for comparison:
#   1. row_number - consecutive numbers 1, 2, 3, ... with no repeats
#   2. rank       - tied rows share a rank and the following ranks are skipped
#   3. dense_rank - tied rows share a rank and no ranks are skipped

# One partition per customer, ordered from oldest to newest transaction.
# NOTE(review): if a customer can have several transactions with the same
# transaction_date, the order row_number assigns among those ties is not
# deterministic; add a tie-breaker column to orderBy if stable numbering matters.
window_spec = Window.partitionBy(F.col('customer_id')).orderBy(F.col('transaction_date').asc())

df_ranked = (
    df_transactions
        .withColumn('row_number', F.row_number().over(window_spec))
        .withColumn('rank', F.rank().over(window_spec))
        .withColumn('dense_rank', F.dense_rank().over(window_spec))
)

# Show a sample of the ranked rows
df_ranked.limit(100).display()


Let's calculate a rolling average

In [0]:
# Moving average of transaction amount

# Rolling window per customer, ordered by transaction_date: the current
# transaction plus the 6 preceding ones (7 rows in total).
# rowsBetween counts rows, not calendar days, so this is a 7-transaction
# window rather than a 7-day window.
rolling_window = (
    Window.partitionBy('customer_id')
        .orderBy('transaction_date')
        .rowsBetween(-6, Window.currentRow)
)

# Average of `amount` over the rolling window, added as a new column
df_rolling_avg = (
    df_transactions
        .withColumn('rolling_avg_amount', F.avg('amount').over(rolling_window))
)

df_rolling_avg.limit(500).display()


Let's calculate a running total

In [0]:
# Running total of transaction amounts per customer

# Cumulative window per customer, ordered by transaction_date: every row from
# the start of the partition up to and including the current row.
cumulative_window = (
    Window.partitionBy('customer_id')
        .orderBy('transaction_date')
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Sum of `amount` over the cumulative window, added as a new column.
# NOTE(review): the combined trend-analysis cell names this same aggregate
# `cumulative_total`; consider aligning the two column names.
df_running_total = (
    df_transactions
        .withColumn('cumulative_amount', F.sum('amount').over(cumulative_window))
)

df_running_total.limit(500).display()


Let's use the `lag` and `lead` window functions to compare each transaction with the previous and the next one

In [0]:
# Lag / lead: put the previous and the next transaction amount alongside each
# row, evaluated over `window_spec` (defined in the ranking cell).
previous_amount = F.lag("amount", 1).over(window_spec)   # value one row back
next_amount = F.lead("amount", 1).over(window_spec)      # value one row ahead

df_lag_lead = (
    df_transactions
        .withColumn("previous_transaction_amount", previous_amount)
        .withColumn("next_transaction_amount", next_amount)
)

df_lag_lead.limit(5).display()


We can also combine these window functions in a single DataFrame

In [0]:
# Trend analysis: apply several window functions to the same DataFrame.
# Relies on `window_spec`, `rolling_window` and `cumulative_window` from the
# earlier cells. Columns are added in the order listed.
trend_columns = {
    "row_number": F.row_number().over(window_spec),
    "rolling_avg_amount": F.avg("amount").over(rolling_window),
    "cumulative_total": F.sum("amount").over(cumulative_window),
    "previous_transaction_amount": F.lag("amount", 1).over(window_spec),
    "next_transaction_amount": F.lead("amount", 1).over(window_spec),
}

# Start from the raw transactions each time so re-running the cell is idempotent.
df_trend_analysis = df_transactions
for column_name, column_expr in trend_columns.items():
    df_trend_analysis = df_trend_analysis.withColumn(column_name, column_expr)

df_trend_analysis.limit(5).display()
