### Setup

Make sure you have the files available from previous demos.

In [0]:
# Configure OAuth (service principal) access to Azure Data Lake Storage Gen2.
#
# The settings are keyed by storage-account host name, so the account used here
# must be the same one that appears in the abfss:// paths read and written by
# the rest of this notebook.
storage_account = "dbstoragebbpbs73u57xmm"
account_host = f"{storage_account}.dfs.core.windows.net"

# Credentials come from a Databricks secret scope instead of being pasted into
# the notebook, so they do not end up in source control or notebook exports.
# NOTE(review): the scope and key names below are placeholders - replace them
# with the ones provisioned in your workspace.
secret_scope = "etl-demo"
client_id = dbutils.secrets.get(scope=secret_scope, key="sp-client-id")
client_secret = dbutils.secrets.get(scope=secret_scope, key="sp-client-secret")
tenant_id = dbutils.secrets.get(scope=secret_scope, key="sp-tenant-id")

spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
spark.conf.set(
    f"fs.azure.account.oauth.provider.type.{account_host}",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
)
spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", client_id)
spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
spark.conf.set(
    f"fs.azure.account.oauth2.client.endpoint.{account_host}",
    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
)

Verify that cloud storage is accessible

In [0]:
# Smoke test: list the root of the container to confirm the storage account is reachable.
container_root = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/"
root_listing = dbutils.fs.ls(container_root)
display(root_listing)

Let's load the transactions data

In [0]:
from pyspark.sql import functions as F
from pyspark.sql import Window

# Location of the transactions dataset (Parquet) in Azure Data Lake
parquet_path = (
    "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net"
    "/transactions_data.parquet"
)

# Read the dataset into a Spark DataFrame
df_transactions = spark.read.format("parquet").load(parquet_path)

# Preview at most 100 rows rather than rendering the whole dataset
transactions_preview = df_transactions.limit(100)
transactions_preview.display()


Let's try some pivoting to reshape the data

In [0]:
# Reshape long -> wide: one row per customer, one column per distinct value of
# `category`, each cell holding the summed `amount` for that customer/category.
spend_per_category = F.sum("amount").alias("total_spent")
transactions_by_customer = df_transactions.groupBy("customer_id")
df_pivot = transactions_by_customer.pivot("category").agg(spend_per_category)

# Preview at most 100 customers
pivot_preview = df_pivot.limit(100)
pivot_preview.display()


# # Pivot categories into columns

# df_pivot = (
#     df_transactions
#         .groupBy("customer_id")
#         .pivot("category")
#         .agg(F.sum("amount").alias("total_spent"))
# )

# df_pivot.limit(5).display()


We can unpivot as well

In [0]:
# Unpivot: turn the per-category columns of df_pivot back into
# (customer_id, category, total_spent) rows using the SQL stack() generator.

# Every column after the leading customer_id is a pivoted category column.
unpivot_columns = df_pivot.columns[1:]


def _stack_pair(column_name):
    """Build the "'label', `column`" fragment of a stack() call for one column.

    Pivoted column names are data values, so they may contain spaces,
    punctuation, or collide with SQL keywords. The label is emitted as an
    escaped SQL string literal and the column reference is backtick-quoted so
    that such names still parse and resolve to the intended column.
    """
    label = column_name.replace("\\", "\\\\").replace("'", "\\'")
    identifier = column_name.replace("`", "``")
    return f"'{label}', `{identifier}`"


# stack(n, label1, value1, ..., labeln, valuen) emits n rows per input row.
stack_expr = (
    f"stack({len(unpivot_columns)}, "
    + ", ".join(_stack_pair(name) for name in unpivot_columns)
    + ") as (category, total_spent)"
)
df_unpivot = df_pivot.selectExpr("customer_id", stack_expr)

df_unpivot.limit(100).display()


# # Reshaping data back into rows
# unpivot_columns = df_pivot.columns[1:]  # Exclude 'customer_id'

# df_unpivot = df_pivot.selectExpr(
#     "customer_id",
#     "stack(" + str(len(unpivot_columns)) + ", " +
#     ", ".join([f"'{col}', {col}" for col in unpivot_columns]) +
#     ") as (category, total_spent)"
# )

# df_unpivot.limit(5).display()


On large datasets we can trade a small amount of accuracy for speed by using approximate distinct counts instead of exact ones

In [0]:

# Exact number of distinct customers per category.
# NOTE(review): the alias 'total_amount' actually holds a distinct-customer
# count, not an amount; the name is kept so existing consumers of df_grouped
# keep working.
df_grouped = (
    df_transactions.groupBy('category')
        .agg(F.countDistinct('customer_id').alias('total_amount'))
)
df_grouped.limit(100).display()

# Approximate number of distinct customers per category.
# Uses approx_count_distinct, the supported name for the deprecated
# approxCountDistinct alias. It also accepts an optional `rsd` argument
# (maximum relative standard deviation) to tune accuracy.
# The output alias (including its spelling) is kept unchanged for compatibility.
df_grouped_aprox = (
    df_transactions.groupBy('category')
        .agg(F.approx_count_distinct('customer_id').alias('aprox_total_amount'))
)
df_grouped_aprox.limit(100).display()

# # HyperLogLog for Approximate Distinct Counting

# df_hyperloglog = (
#     df_transactions
#         .groupBy("category")
#         .agg(F.approx_count_distinct("customer_id").alias("approx_unique_customers"))
# )

# df_hyperloglog.limit(5).display()


In [0]:
# Bloom filters for membership testing
# Output location for the copy of the transactions data written with a Bloom filter.
# (Path segments are separated by single slashes; doubled slashes only add
# empty segments.)
bloom_parquet_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/exports/transactions_bloom"

# Column the Bloom filter is built for.
bloom_column = "customer_id"

# Step 1: Parquet writer options, keyed by the option names the writer reads.
# Column scoping is expressed with a "#<column>" suffix on the option name;
# there is no separate "column" option, and an unsuffixed "enabled" flag
# applies to every column instead of just this one.
bloom_filter_options = {
    f"parquet.bloom.filter.enabled#{bloom_column}": "true",        # Bloom filter on this column only
    f"parquet.bloom.filter.expected.ndv#{bloom_column}": "10000",  # Expected number of distinct values
    "parquet.bloom.filter.fpp": "0.01",                            # 1% false positive probability
}

# Step 2: Write the DataFrame to Parquet with the options applied
bloom_writer = df_transactions.write.mode("overwrite")
for option_name, option_value in bloom_filter_options.items():
    bloom_writer = bloom_writer.option(option_name, option_value)
bloom_writer.parquet(bloom_parquet_path)

print("✅ Transactions data written to Parquet with Bloom Filter enabled.")


In [0]:
# Read back the dataset that was written with the Bloom filter, with
# Bloom-filter-based row-group filtering requested on the reader.
bloom_reader = spark.read.option("parquet.filter.bloom.enabled", "true")
df_bloom = bloom_reader.parquet(bloom_parquet_path)

# Equality lookup on customer_id - the kind of predicate a Bloom filter can help with
is_target_customer = F.col("customer_id") == 12345
df_filtered = df_bloom.where(is_target_customer)

# Show at most 5 matching rows
df_filtered.limit(5).display()
