### Setup

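Make sure the Parquet files from the previous demos (`transactions_data.parquet` and `customers_data.parquet`) are available in the `imports` folder of the data lake container.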

In [0]:
# Configure OAuth client-credentials access to Azure Data Lake.
# NOTE(review): the client id, client secret and tenant appear redacted here; confirm
# the real values are read from a secret store rather than pasted into the notebook.
for conf_key, conf_value in [
    ("fs.azure.account.auth.type", "OAuth"),
    ("fs.azure.account.oauth.provider.type", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id", "****************************"),
    ("fs.azure.account.oauth2.client.secret", "*******************************"),
    ("fs.azure.account.oauth2.client.endpoint", "https://login.microsoftonline.com/************************/oauth2/token"),
]:
    # Each setting name is suffixed with the storage account's DFS host
    spark.conf.set(f"{conf_key}.<account_name>.dfs.core.windows.net", conf_value)

Verify that cloud storage is accessible

In [0]:
# List the root of the storage location as a connectivity check; an error here
# suggests the storage is not reachable with the current Spark configuration.
dbutils.fs.ls("abfss://pyspark@warnerdatalake.dfs.core.windows.net/")

Let's load the transactions data into a DataFrame

In [0]:
# Location of the transactions Parquet data in the data lake
parquet_path = "abfss://pyspark@warnerdatalake.dfs.core.windows.net//imports//transactions_data.parquet"

# Read the Parquet data into a DataFrame and print its column names and types
df_transactions = spark.read.parquet(parquet_path)
df_transactions.printSchema()

# Preview five rows
display(df_transactions.limit(5))

root
 |-- transaction_id: long (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- transaction_date: date (nullable = true)
 |-- amount: decimal(10,2) (nullable = true)
 |-- category: string (nullable = true)



transaction_id,customer_id,transaction_date,amount,category
1,3065,2025-03-17,76.1,Clothes
2,3274,2025-02-18,91.91,Clothes
3,130,2025-01-10,11.81,Accessories
4,320,2025-03-06,20.37,Furniture
5,6480,2025-03-22,12.31,Beauty


Let's do some grouping and aggregation

In [0]:
from pyspark.sql import functions as F

# Sum of `amount` within each category, largest total first
df_category_total = (
    df_transactions.groupBy("category")
    .agg(F.sum("amount").alias("total_amount"))
    .orderBy(F.desc("total_amount"))
)

# Show the five categories with the highest totals
display(df_category_total.limit(5))


category,total_amount
Food,5026910.32
Electronics,5022730.15
Beauty,5003682.15
Furniture,5000355.12
Sports,4998734.93


In [0]:
# Per-category mean, maximum and minimum of `amount`, sorted by the mean (highest first)
df_category_stats = (
    df_transactions.groupBy("category")
    .agg(
        F.avg("amount").alias("avg_amount"),
        F.max("amount").alias("max_amount"),
        F.min("amount").alias("min_amount"),
    )
    .orderBy(F.desc("avg_amount"))
)

# Show the five categories with the highest average amount
display(df_category_stats.limit(5))


category,avg_amount,max_amount,min_amount
Accessories,50.1049,100.0,0.01
Electronics,50.065091,100.0,0.0
Toys,50.035075,100.0,0.0
Food,50.03494,100.0,0.0
Gadgets,50.014188,100.0,0.0


Let's do some Exploratory Data Analysis (EDA)

In [0]:
# Summary statistics (count, mean, stddev, min, quartiles, max) for the `amount` column only
display(df_transactions.select("amount").summary())


summary,amount
count,1000000.0
mean,49.999384
stddev,28.88635459684146
min,0.0
25%,24.96
50%,50.03
75%,75.03
max,100.0


In [0]:
# Number of transactions in each category, most frequent category first
df_category_distribution = (
    df_transactions.groupBy("category")
    .count()
    .orderBy(F.desc("count"))
)

# Show the five most frequent categories
display(df_category_distribution.limit(5))


category,count
Food,100468
Electronics,100324
Furniture,100174
Beauty,100124
Sports,100054


In [0]:
# Several aggregations in one pass: total, average and row count per category.
# Rows are ranked by total amount, with average amount as the tie-breaker (both descending).
df_top_categories = (
    df_transactions.groupBy("category")
    .agg(
        F.sum("amount").alias("total_amount"),
        F.avg("amount").alias("avg_amount"),
        F.count("*").alias("transaction_count"),
    )
    .orderBy(F.desc("total_amount"), F.desc("avg_amount"))
)

# Show the top five categories
display(df_top_categories.limit(5))


category,total_amount,avg_amount,transaction_count
Food,5026910.32,50.03494,100468
Electronics,5022730.15,50.065091,100324
Beauty,5003682.15,49.974853,100124
Furniture,5000355.12,49.916696,100174
Sports,4998734.93,49.960371,100054


In [0]:
# Calculate the average transaction amount by customer age group

# Load the customers data
customers_path = "abfss://pyspark@warnerdatalake.dfs.core.windows.net//imports//customers_data.parquet"
df_customers = spark.read.parquet(customers_path)

# Join transactions with customers; the inner join keeps only transactions
# whose customer_id has a matching row in the customers data
df_joined = df_transactions.join(df_customers, on="customer_id", how="inner")

# Add age group column and calculate average spending by age group.
# Assumes the customers data provides an `age` column — TODO confirm against its schema.
df_age_group_spending = (
    df_joined
        .withColumn(
            "age_group",
            # A null age makes every comparison below evaluate to null, so without this
            # first branch such rows would fall through to otherwise() and be reported
            # as "50 and Above". Label them explicitly instead.
            F.when(F.col("age").isNull(), "Unknown")
             .when(F.col("age") < 30, "Under 30")
             # Branches are tested in order, so reaching this one already implies age >= 30
             .when(F.col("age") < 50, "30-49")
             .otherwise("50 and Above")
        )
        .groupBy("age_group")
        .agg(F.avg("amount").alias("avg_spending"))
        .orderBy(F.col("avg_spending").desc())
)

df_age_group_spending.limit(5).display()


age_group,avg_spending
Under 30,50.140706
30-49,49.97632
50 and Above,49.969583
