# Delta Optimize Compaction

In [1]:
import pyspark
from delta import configure_spark_with_delta_pip

# Local Spark session ("local[4]" = run in-process with 4 worker threads)
# with the Delta Lake SQL extension and catalog enabled.
builder = (
    pyspark.sql.SparkSession.builder
    .master("local[4]")
    .appName("compaction")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip wires the pip-installed delta-spark package
# into the builder before the session is created.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [3]:
df = df.repartition(1000)

In [5]:
from delta.tables import *

deltaTable = DeltaTable.forPath(spark, "test/delta_table")

In [4]:
# write to delta
df.write.format("delta").partitionBy("education").save("test/delta_table")

In [2]:
# read in 2M rows dataset
df = spark.read.csv("data/census_2M.csv", header=True)

In [6]:
# get n files per partition
!ls test/delta_table/education\=10th/*.parquet | wc -l

    1000


In [7]:
%%time
# query
df = spark.read.format("delta").load("test/delta_table")
res = df.where(df.education == "10th").collect()

CPU times: user 160 ms, sys: 19.4 ms, total: 179 ms
Wall time: 14.6 s


In [8]:
%%time
%%capture
# run compaction on the Delta table (Delta OPTIMIZE); %%time reports how long
# it takes and %%capture silences the output this cell would otherwise show
deltaTable.optimize().executeCompaction()

CPU times: user 6.87 ms, sys: 3.81 ms, total: 10.7 ms
Wall time: 24.7 s


In [9]:
%%time
# query again
df = spark.read.format("delta").load("test/delta_table")
res = df.where(df.education == "10th").collect()

CPU times: user 156 ms, sys: 16 ms, total: 172 ms
Wall time: 4.62 s


In [None]:
# get n files per partition
!ls test/delta_table/education\=10th/*.parquet | wc -l

In [10]:
# get n files
!ls test/delta_table/education\=10th/*.parquet | wc -l

    1001
