In [None]:
import warnings
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window
from delta.tables import DeltaTable

# Hide FutureWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning)

# Options for a Delta-enabled session: executor/driver memory, the Delta Lake
# package to load, and the SQL extension + catalog classes for Delta.
session_conf = [
    ("spark.executor.memory", "16G"),
    ("spark.driver.memory", "8G"),
    # ("spark.driver.maxResultSize", "2G"),  # intentionally left disabled
    ("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0"),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
]

# Create SparkSession (or reuse the active one), applying each option in order.
session_builder = SparkSession.builder.appName("DeltaSession")
for conf_key, conf_value in session_conf:
    session_builder = session_builder.config(conf_key, conf_value)
spark = session_builder.getOrCreate()

In [None]:
# Define the schema for a DataFrame: four nullable columns (two ints, two strings).
schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("city", StringType(), True),
])

# Sample rows, in the same column order as the schema.
data = [(1, "Alice", 25, "New York"),
        (2, "Bob", 30, "San Francisco"),
        (3, "Charlie", 35, "Chicago")]

# Create a DataFrame using the schema
df = spark.createDataFrame(data, schema)

# Show the DataFrame
df.show()

# Persist the rows as the Delta table `optimization22`.
# mode("overwrite") keeps this cell re-runnable: with the default save mode the
# write raises as soon as the table already exists from a previous run.
df.write.format("delta").mode("overwrite").saveAsTable("optimization22")

In [None]:
# Show the column names and types of the table written above.
describe_query = "DESCRIBE optimization22"
spark.sql(describe_query).toPandas()

In [None]:
# List the commit history recorded for the Delta table.
history_query = "DESCRIBE HISTORY optimization22"
spark.sql(history_query).toPandas()

In [None]:
# List the tables visible in the current database.
tables_query = "SHOW TABLES"
spark.sql(tables_query).toPandas()

In [None]:
%%time

# Time a row count of `df` (count() is an action, so Spark actually executes a job here).
df.count()

In [None]:
# Open the table by its storage path as a DeltaTable handle; later cells use
# this handle for optimize/vacuum.
warehouse_table_path = 'spark-warehouse/optimization22'
df_dt = DeltaTable.forPath(spark, warehouse_table_path)

# Display the table's current contents.
table_snapshot = df_dt.toDF()
table_snapshot.toPandas()

## Compaction

In [None]:
# Run compaction on the table; `opt` holds the DataFrame returned by the operation.
compaction_builder = df_dt.optimize()
opt = compaction_builder.executeCompaction()

In [None]:
# Render the optimize result as a pandas frame for display.
compaction_result_pdf = opt.toPandas()
compaction_result_pdf

In [None]:
# Expand the nested `metrics` struct into one column per field and display it.
compaction_metrics = opt.select("metrics.*")
compaction_metrics.toPandas()

## Vacuum to see the compaction more clearly

In [None]:
# Turn off Delta's retention-duration safety check so a vacuum with a very short
# retention window is accepted. Demo only: this makes it possible to delete files
# that older table versions still need.
retention_check_key = "spark.databricks.delta.retentionDurationCheck.enabled"
spark.conf.set(retention_check_key, "false")

In [None]:
# Vacuum with a zero-hour retention window, removing files no longer referenced
# by the current table version.
df_dt.vacuum(retentionHours=0)

In [None]:
# If you have a large amount of data and only want to optimize a subset of it, you can specify an optional partition predicate using `where`.
# Uses the `df_dt` handle; the name `deltaTable` is never defined in this notebook and raised NameError.
# NOTE(review): the sample table defined in this notebook has no `date` column, so this predicate
# only applies to a table partitioned by `date` — confirm the intended target table.
df_dt.optimize().where("date='2021-11-18'").executeCompaction()

In [None]:
# mini test

In [None]:
# Small scratch DataFrame holding the ids 0 through 4.
# NOTE(review): this rebinds `df`, which earlier held the sample people rows.
df = spark.range(start=0, end=5)

## Z-Order

In [None]:
# Open the Delta table stored under the extract directory.
extract_table_path = 'extract/09delta'
dt = DeltaTable.forPath(spark, extract_table_path)

# Display its current contents.
extract_snapshot = dt.toDF()
extract_snapshot.toPandas()

In [None]:
# Z-order the table by `id` through SQL; `opt` holds the DataFrame returned by the statement.
# NOTE(review): this targets a table named `optimization`, while this notebook creates
# `optimization22` — confirm which table is intended.
zorder_query = "OPTIMIZE optimization ZORDER BY (id)"
opt = spark.sql(zorder_query)

In [None]:
# Z-order by `id` using the Python DeltaTable API.
zorder_builder = dt.optimize()
zorder_builder.executeZOrderBy('id')

In [None]:
# Render the latest optimize result as a pandas frame for display.
zorder_result_pdf = opt.toPandas()
zorder_result_pdf

In [None]:
# Pull the optimize result to the driver as a list of Row objects.
zorder_result_rows = opt.collect()
zorder_result_rows

In [None]:
# Expand the nested `metrics` struct into one column per field and display it.
zorder_metrics = opt.select("metrics.*")
zorder_metrics.toPandas()

In [None]:
# If you have a large amount of data and only want to optimize a subset of it, you can specify an optional partition predicate using `where`.
# The Z-order column is passed as a string: the bare name `eventType` is not defined
# anywhere in this notebook and raised NameError.
# NOTE(review): the sample table defined in this notebook has neither a `date` nor an
# `eventType` column — confirm the intended target table.
df_dt.optimize().where("date='2021-11-18'").executeZOrderBy("eventType")

In [None]:
# Query a Delta table directly by its storage path and print the rows.
extract_query = "SELECT * FROM delta.`extract/delta`"
spark.sql(extract_query).show()

## Deletion Vector Support

In [None]:
# Enable deletion vectors on the Delta table created earlier in this notebook.
# The statement previously contained the literal placeholder `<table>`, which is not
# valid SQL and fails to parse; the table name is now substituted from a variable.
# NOTE(review): the session pins delta-core 2.3.0 — confirm that this version supports
# enabling deletion vectors before relying on this cell.
deletion_vector_table = "optimization22"
spark.sql(f"""
ALTER TABLE {deletion_vector_table}
SET TBLPROPERTIES (delta.enableDeletionVectors = true)
""")