In [0]:
from pyspark.sql.functions import expr, rand, monotonically_increasing_id
import time
# Build a 10M-row synthetic sales table and persist it as a Delta table.
#
# Fixed seeds make the generated rows repeatable across Restart & Run All
# (for the same partitioning of spark.range). The two columns use different
# seeds so that customer_id and sales_amount are not drawn from the same
# random sequence.
CUSTOMER_ID_SEED = 42
SALES_AMOUNT_SEED = 43

df = (
    spark.range(0, 10_000_000)
    # customer_id: integer in [0, 9999]
    .withColumn("customer_id", (rand(seed=CUSTOMER_ID_SEED) * 10000).cast("int"))
    # sales_amount: double in [0, 1000)
    .withColumn("sales_amount", (rand(seed=SALES_AMOUNT_SEED) * 1000).cast("double"))
    # region is derived deterministically from customer_id (mod 4)
    .withColumn("region", expr("CASE WHEN customer_id % 4 = 0 THEN 'North' " +
                               "WHEN customer_id % 4 = 1 THEN 'South' " +
                               "WHEN customer_id % 4 = 2 THEN 'East' ELSE 'West' END"))
)

# mode("overwrite") replaces whatever was previously stored at this path.
df.write.format("delta").mode("overwrite").save("/tmp/sales_data")


In [0]:
# Baseline measurement: time a point lookup on customer_id. In notebook order
# this cell runs before the OPTIMIZE ... ZORDER step.
#
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

df_unoptimized = spark.read.format("delta").load("/tmp/sales_data")
filtered_df = df_unoptimized.filter("customer_id = 1234")
# count() is the action that actually triggers the scan; its value is discarded.
filtered_df.count()

print(f"Unoptimized Read Time: {time.perf_counter() - start_time:.2f} seconds")


Unoptimized Read Time: 11.30 seconds


In [0]:
# Run Delta's OPTIMIZE on the table at /tmp/sales_data, Z-ordering by the
# column that the timing cells filter on. The returned DataFrame is left as the
# cell's last expression so the notebook still displays it.
zorder_statement = "OPTIMIZE delta.`/tmp/sales_data` ZORDER BY (customer_id)"
spark.sql(zorder_statement)


Out[3]: DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bi

In [0]:
# Repeat the same point lookup on customer_id; in notebook order this cell runs
# after the OPTIMIZE ... ZORDER step.
#
# perf_counter() is a monotonic, high-resolution clock, so the measured
# interval cannot be skewed by wall-clock adjustments the way time.time() can.
#
# NOTE(review): this run follows the baseline read in the same session, so part
# of the speed-up may come from warm caches rather than the new file layout --
# confirm by repeating both measurements.
start_time = time.perf_counter()

df_optimized = spark.read.format("delta").load("/tmp/sales_data")
filtered_df = df_optimized.filter("customer_id = 1234")
# count() is the action that actually triggers the scan; its value is discarded.
filtered_df.count()

print(f"Optimized Read Time: {time.perf_counter() - start_time:.2f} seconds")


Optimized Read Time: 2.57 seconds
