### Setup

Make sure you have the files available from previous demos.

In [0]:
# Register the OAuth client-credentials settings Spark uses to connect to Azure Data Lake.
# Every entry is a (configuration key, value) pair; the pairs are applied in the order listed.
# NOTE(review): the client id, client secret and tenant segment appear masked in this notebook;
# consider reading the real values from a secret store instead of embedding literals - confirm.
adls_oauth_settings = [
    ("fs.azure.account.auth.type.<account_name>.dfs.core.windows.net", "OAuth"),
    ("fs.azure.account.oauth.provider.type.<account_name>.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id.<account_name>.dfs.core.windows.net", "****************************"),
    ("fs.azure.account.oauth2.client.secret.<account_name>.dfs.core.windows.net", "*******************************"),
    ("fs.azure.account.oauth2.client.endpoint.<account_name>.dfs.core.windows.net", "https://login.microsoftonline.com/************************/oauth2/token"),
]

for conf_key, conf_value in adls_oauth_settings:
    spark.conf.set(conf_key, conf_value)

Verify that cloud storage is accessible

In [0]:
# Listing the container root is the accessibility check: the entries returned are shown as the cell output.
container_root = "abfss://pyspark@warnerdatalake.dfs.core.windows.net/"
dbutils.fs.ls(container_root)

Let's load the transactions data

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Location of the transactions Parquet data in Azure Data Lake
parquet_path = "abfss://pyspark@warnerdatalake.dfs.core.windows.net//imports//transactions_data.parquet"

# Read the Parquet data into a DataFrame that the following cells build on
df_transactions = spark.read.format("parquet").load(parquet_path)

# Preview the first five rows
display(df_transactions.limit(5))


transaction_id,customer_id,transaction_date,amount,category
1,3065,2025-03-17,76.1,Clothes
2,3274,2025-02-18,91.91,Clothes
3,130,2025-01-10,11.81,Accessories
4,320,2025-03-06,20.37,Furniture
5,6480,2025-03-22,12.31,Beauty


Let's try some pivoting to reshape the data

In [0]:
# Turn each distinct category into its own column holding that customer's summed amount.

# One group per customer ...
transactions_by_customer = df_transactions.groupBy("customer_id")

# ... spread across one column per category value
category_columns = transactions_by_customer.pivot("category")

# Sum of `amount` per (customer, category) cell
amount_total = F.sum("amount").alias("total_spent")
df_pivot = category_columns.agg(amount_total)

# Preview the first five customers
display(df_pivot.limit(5))


customer_id,Accessories,Beauty,Books,Clothes,Electronics,Food,Furniture,Gadgets,Sports,Toys
6336,237.93,265.67,388.71,543.6,317.88,501.65,256.61,500.88,361.65,1017.56
1645,522.79,346.71,449.05,524.83,525.35,366.4,304.46,639.11,216.31,754.29
8638,548.8,313.37,473.96,760.0,317.45,351.61,479.45,702.97,473.0,550.14
5518,318.13,218.53,579.58,441.95,918.84,278.58,413.52,331.55,595.63,441.42
9852,304.21,335.45,532.39,428.12,704.74,476.28,250.78,663.14,219.1,377.75


We can also unpivot the wide table back into one row per customer and category

In [0]:
# Reshape the pivoted frame back into rows of (customer_id, category, total_spent).
# Every column except the grouping key is treated as a category column.
unpivot_columns = [name for name in df_pivot.columns if name != "customer_id"]

if not unpivot_columns:
    # stack(0, ...) is not a valid expression; fail with a clear message instead of a SQL parse error.
    raise ValueError("df_pivot has no category columns to unpivot")

# Build the argument list for Spark SQL's stack(): alternating 'label', `column` pairs.
# The label is a SQL string literal, so backslashes and single quotes are escaped.
# The column reference is backtick-quoted (with embedded backticks doubled) so category
# names containing spaces, punctuation or leading digits still resolve as identifiers.
stack_pairs = ", ".join(
    "'{label}', `{ident}`".format(
        label=name.replace("\\", "\\\\").replace("'", "\\'"),
        ident=name.replace("`", "``"),
    )
    for name in unpivot_columns
)

stack_expr = f"stack({len(unpivot_columns)}, {stack_pairs}) as (category, total_spent)"

df_unpivot = df_pivot.selectExpr("customer_id", stack_expr)

df_unpivot.limit(5).display()


customer_id,category,total_spent
6336,Accessories,237.93
6336,Beauty,265.67
6336,Books,388.71
6336,Clothes,543.6
6336,Electronics,317.88


On large datasets we can use approximate aggregations, such as an approximate distinct count, instead of exact ones

In [0]:
# HyperLogLog for Approximate Distinct Counting
# Estimated number of distinct customer_id values, reported once per category.
approx_customers = F.approx_count_distinct("customer_id").alias("approx_unique_customers")

df_hyperloglog = df_transactions.groupBy("category").agg(approx_customers)

# Preview five of the per-category estimates
display(df_hyperloglog.limit(5))


category,approx_unique_customers
Food,9823
Sports,9823
Electronics,9823
Books,9823
Accessories,9823


In [0]:
# Bloom filters for membership testing
# Output path for the copy of the transactions data written with a Bloom filter
bloom_parquet_path = "abfss://pyspark@warnerdatalake.dfs.core.windows.net//exports//transactions_bloom"

# Column that should receive a Bloom filter
bloom_column = "customer_id"

# Step 1: Specify the Bloom filter options.
# Parquet writer properties are scoped to a column with a "#<column>" suffix. There is no
# "parquet.bloom.filter.column" property, and un-suffixed NDV/FPP keys are not picked up,
# so the target column has to be part of each key.
bloom_filter_options = {
    f"parquet.bloom.filter.enabled#{bloom_column}": "true",  # Enable Bloom filter for this column only
    f"parquet.bloom.filter.expected.ndv#{bloom_column}": "10000",  # Expected unique customer IDs
    f"parquet.bloom.filter.fpp#{bloom_column}": "0.01",  # 1% false positive probability
}

# Step 2: Write DataFrame to Parquet with the Bloom filter options applied.
# mode("overwrite") replaces any previous output at the same path, so the cell can be re-run.
(
    df_transactions.write
        .options(**bloom_filter_options)
        .mode("overwrite")
        .parquet(bloom_parquet_path)
)

print("✅ Transactions data written to Parquet with Bloom Filter enabled.")


✅ Transactions data written to Parquet with Bloom Filter enabled.


In [0]:
# Read back the Parquet output written in the previous cell, with Bloom-filter-based
# filtering requested on the reader.
bloom_reader = spark.read.option("parquet.filter.bloom.enabled", "true")
df_bloom = bloom_reader.parquet(bloom_parquet_path)

# Apply an equality filter on customer_id to check performance
customer_match = F.col("customer_id") == 12345
df_filtered = df_bloom.where(customer_match)

# Show up to five matching rows
display(df_filtered.limit(5))


transaction_id,customer_id,transaction_date,amount,category
