### Setup

Make sure you have the files available from previous demos.

In [0]:
# Configure the Spark session to reach Azure Data Lake using OAuth client credentials.
# NOTE(review): "<account_name>" and the masked client id / secret / tenant values
# look like redacted placeholders -- presumably the real values should be read from a
# secret store (e.g. a Databricks secret scope) rather than typed into the notebook.
adls_oauth_settings = [
    ("fs.azure.account.auth.type.<account_name>.dfs.core.windows.net", "OAuth"),
    ("fs.azure.account.oauth.provider.type.<account_name>.dfs.core.windows.net", "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider"),
    ("fs.azure.account.oauth2.client.id.<account_name>.dfs.core.windows.net", "****************************"),
    ("fs.azure.account.oauth2.client.secret.<account_name>.dfs.core.windows.net", "*******************************"),
    ("fs.azure.account.oauth2.client.endpoint.<account_name>.dfs.core.windows.net", "https://login.microsoftonline.com/************************/oauth2/token"),
]

# Apply each setting in the order listed above
for conf_key, conf_value in adls_oauth_settings:
    spark.conf.set(conf_key, conf_value)

Verify that cloud storage is accessible

In [0]:
# List the root of the storage container and render the listing
storage_root_listing = dbutils.fs.ls("abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/")
display(storage_root_listing)

Let's load the transactions data with some optimizations

In [0]:
from pyspark.sql import functions as F

# Location of the transactions dataset in the data lake
parquet_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net/transactions_data.parquet"

# Column pruning: keep only the two columns the aggregation needs
transactions_pruned = spark.read.parquet(parquet_path).select("category", "amount")

# Predicate pushdown: restrict to amounts above 50 before aggregating
transactions_over_50 = transactions_pruned.filter(F.col("amount") > 50)

# Total amount per category
df_optimized = transactions_over_50.groupBy("category").agg(
    F.sum("amount").alias("total_amount")
)

# Preview up to five aggregated rows
df_optimized.limit(5).display()


Verify the predicate pushdown in the plan

In [0]:

# Print the extended query plans for the aggregation.
# In the output, look for a PushedFilters entry containing GreaterThan(amount,50),
# which confirms predicate pushdown is happening.
df_optimized.explain(extended=True)

Partitioning the data to match a common query pattern can speed up retrieval

In [0]:
# Destination folder for the partitioned copy of the dataset
# NOTE(review): the doubled slashes in this path look unintentional -- confirm
partitioned_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net//exports//transactions_partitioned"

# Re-read the full transactions dataset (all columns, no filter)
df_transactions = spark.read.parquet(parquet_path)

# Write it out partitioned by category and transaction_date,
# replacing whatever a previous run left at the destination
(
    df_transactions.write
        .partitionBy("category", "transaction_date")
        .mode("overwrite")
        .parquet(partitioned_path)
)


Then we can read it back with the right filter

In [0]:
# Filter on the partition column so the read can benefit from partition pruning
electronics_only = F.col("category") == "Electronics"
df_partitioned = spark.read.parquet(partitioned_path).filter(electronics_only)

# Preview up to 50 matching rows
df_partitioned.limit(50).display()


Verify the partition pruning in the plan

In [0]:
# Print the extended query plans for the partitioned read.
# Look for PartitionFilters: [isnotnull(category), (category = Electronics)],
# which confirms partition pruning.
df_partitioned.explain(extended=True)

Some file formats, such as Delta, offer even more optimizations

In [0]:
# Destination folder for the Delta copy of the dataset
# NOTE(review): the doubled slashes in this path look unintentional -- confirm
delta_path = "abfss://etl1@dbstoragebbpbs73u57xmm.dfs.core.windows.net//exports//transactions_delta"

# Save the transactions as a Delta table with the same partition columns
# as the Parquet export, replacing any existing data at the destination
(
    df_transactions.write
        .format("delta")
        .partitionBy("category", "transaction_date")
        .mode("overwrite")
        .save(delta_path)
)


Delta supports Z-Ordering, which improves range-based queries (e.g., amount > X)

In [0]:
from delta.tables import DeltaTable

# Get a handle on the Delta table written to delta_path
delta_table = DeltaTable.forPath(spark, delta_path)

# Build the optimize operation, then run it with Z-ordering on the amount column
optimize_builder = delta_table.optimize()
optimize_builder.executeZOrderBy("amount")

To confirm all optimizations, let's run .explain() on the Delta table.

In [0]:
# The two conditions: one on a regular column, one on a partition column
amount_over_50 = F.col("amount") > 50
category_is_electronics = F.col("category") == "Electronics"

# Read the Delta table with both conditions combined into a single filter
df_delta = spark.read.format("delta").load(delta_path).filter(
    amount_over_50 & category_is_electronics
)

# Print the extended query plans
df_delta.explain(extended=True)

# What to look for in the plan output:
# - PushedFilters mentioning GreaterThan(amount,50)  -> predicate pushdown
# - PartitionFilters mentioning category = Electronics -> partition pruning
# NOTE(review): Z-ordering by amount is presumably not listed as its own entry in
# the plan; its file-skipping benefit likely has to be checked another way
# (e.g. number of files read) -- confirm.
