### Optimize - compact many small data files into fewer, larger files

In [0]:
# Three sample (id, salary) rows used to populate the Delta table in the cells below.
data = [
    (1, 100),
    (2, 200),
    (3, 300),
]
schema = "id INT, salary INT"  # DDL-style schema string
df = spark.createDataFrame(data=data, schema=schema)
display(df)


In [0]:
# Append the same DataFrame NUM_APPENDS times in a single run, so the Delta table
# accumulates several small files for OPTIMIZE to compact. Looping here (instead of
# re-running the cell by hand) keeps the notebook reproducible under "Run All":
# on a previously empty path the appends become table versions 0-4, which is what
# the RESTORE cells further down assume.
NUM_APPENDS = 5
for _ in range(NUM_APPENDS):
    (
        df.write.format("delta")
        .mode("append")
        .save("/Volumes/inceptez_catalog/inputdb/employee/deltaopt/")
    )
print("Data written")

In [0]:
# Run OPTIMIZE on the path-based Delta table written above.
spark.sql(
    "OPTIMIZE delta.`/Volumes/inceptez_catalog/inputdb/employee/deltaopt`"
)


In [0]:
# Show the table contents after OPTIMIZE.
display(
    spark.sql(
        "select * from delta.`/Volumes/inceptez_catalog/inputdb/employee/deltaopt`"
    )
)

In [0]:
# List the table's commit history (appends, OPTIMIZE, ...) with their version numbers.
display(
    spark.sql(
        "DESCRIBE HISTORY delta.`/Volumes/inceptez_catalog/inputdb/employee/deltaopt`"
    )
)

In [0]:
# Roll the table back to version 4.
# NOTE(review): presumably the last version before OPTIMIZE (five appends -> versions 0-4
# on a fresh path) - confirm against the DESCRIBE HISTORY output before running.
spark.sql(
    "RESTORE delta.`/Volumes/inceptez_catalog/inputdb/employee/deltaopt` TO VERSION AS OF 4"
)

In [0]:
# Show the table contents after restoring to version 4.
display(
    spark.sql(
        "select * from delta.`/Volumes/inceptez_catalog/inputdb/employee/deltaopt`"
    )
)

In [0]:
# Move the table to version 5.
# NOTE(review): presumably the OPTIMIZE commit, assuming versions 0-4 were the appends -
# confirm against the DESCRIBE HISTORY output before running.
spark.sql(
    "RESTORE delta.`/Volumes/inceptez_catalog/inputdb/employee/deltaopt` TO VERSION AS OF 5"
)

In [0]:
# Show the table contents after restoring to version 5.
display(
    spark.sql(
        "select * from delta.`/Volumes/inceptez_catalog/inputdb/employee/deltaopt`"
    )
)

### ZOrder - During OPTIMIZE, co-locate (sort) the records by the columns listed in ZORDER BY

In [0]:
# Target location of the Delta table used for the ZORDER demo.
path = "/Volumes/inceptez_catalog/inputdb/employee/emp_optimize"

# Four sample employees: (emp_id, emp_name, dept).
data = [
    (1, "John", "Sales"),
    (2, "Mary", "HR"),
    (3, "Raj", "IT"),
    (4, "Anita", "Finance"),
]
df = spark.createDataFrame(data, schema=["emp_id", "emp_name", "dept"])

# Repartition to four partitions before the overwrite so the table is written
# as several small files, giving OPTIMIZE something to compact.
(
    df.repartition(4)
    .write.format("delta")
    .mode("overwrite")
    .save(path)
)
print("Table created")


In [0]:
# Run OPTIMIZE with ZORDER BY (emp_id) on the emp_optimize table.
spark.sql(
    "OPTIMIZE delta.`/Volumes/inceptez_catalog/inputdb/employee/emp_optimize` ZORDER BY (emp_id)"
)

**Liquid Clustering** - Automatic, ongoing, table-level clustering
- Physically groups the data in a table by one or more columns to improve the performance of queries that filter or join on those columns

In [0]:
%sql
-- Create the liquid-clustered demo table. IF NOT EXISTS makes the cell safe to re-run:
-- a plain CREATE TABLE fails once the table already exists.
-- CLUSTER BY AUTO asks Databricks to choose the clustering columns itself.
CREATE TABLE IF NOT EXISTS inceptez_catalog.inputdb.salesdata (
  id INT,
  product STRING,
  amount LONG
)
CLUSTER BY AUTO

In [0]:
%sql
-- Add one sample row: (id, product, amount).
INSERT INTO inceptez_catalog.inputdb.salesdata
VALUES (1, 'Mobile', 5000);

In [0]:
# List the commit history of the salesdata table.
display(
    spark.sql("DESCRIBE HISTORY inceptez_catalog.inputdb.salesdata")
)

In [0]:
%sql
-- Create (or replace) a path-based Delta table from two literal rows.
-- NOTE(review): CLUSTER BY AUTO is documented for Unity Catalog managed tables;
-- confirm it is accepted for a path-based table like this one.
CREATE OR REPLACE TABLE delta.`/Volumes/inceptez_catalog/inputdb/moviesdata/movies_delta`
CLUSTER BY AUTO
AS
  SELECT 'Inception' AS Title, 2010 AS Release_Year, 8.8 AS Rating
  UNION ALL
  SELECT 'Interstellar', 2014, 8.6;

### Vacuum - delete old data files that the table no longer uses


In [0]:

# Show the current contents of the employee_delta table.
display(
    spark.sql(
        "select * from delta.`/Volumes/inceptez_catalog/inputdb/employee/employee_delta`"
    )
)

In [0]:
# List the commit history of the employee_delta table.
display(
    spark.sql(
        "DESCRIBE HISTORY delta.`/Volumes/inceptez_catalog/inputdb/employee/employee_delta`"
    )
)

In [0]:
# List the files and folders stored under the table's directory.
display(
    dbutils.fs.ls(
        "/Volumes/inceptez_catalog/inputdb/employee/employee_delta"
    )
)

In [0]:
# Insert one more row (7, 'Ganesh') into the path-based employee_delta table.
spark.sql(
    "insert into delta.`/Volumes/inceptez_catalog/inputdb/employee/employee_delta` select 7,'Ganesh'"
)

In [0]:

from delta.tables import DeltaTable

# VACUUM with a 1-hour retention is below Delta's safety threshold, so the retention
# duration check has to be switched off for the call. The previous setting is saved
# and restored afterwards so the safety check is not left disabled for the rest of
# the session.
RETENTION_CHECK_KEY = "spark.databricks.delta.retentionDurationCheck.enabled"
previous_retention_check = spark.conf.get(RETENTION_CHECK_KEY, "true")

spark.conf.set(RETENTION_CHECK_KEY, "false")
try:
    delta_tbl = DeltaTable.forPath(
        spark, "/Volumes/inceptez_catalog/inputdb/employee/employee_delta"
    )
    # Remove data files no longer referenced by the table that are older than 1 hour.
    delta_tbl.vacuum(retentionHours=1)
finally:
    spark.conf.set(RETENTION_CHECK_KEY, previous_retention_check)

In [0]:
%sql
-- Create the Delta table used for the liquid-clustering walkthrough below.
-- IF NOT EXISTS makes the cell safe to re-run: a plain CREATE TABLE fails once
-- the table already exists.
-- CLUSTER BY AUTO asks Databricks to choose the clustering columns itself.
CREATE TABLE IF NOT EXISTS inceptez_catalog.inputdb.sales_data (
    sales_id INT,
    product_id INT,
    region STRING,
    sales_amount DOUBLE,
    sales_date DATE
)
USING DELTA
CLUSTER BY AUTO;

In [0]:
%sql
-- First batch: four sales rows dated 2025-10-16.
INSERT INTO inceptez_catalog.inputdb.sales_data
  (sales_id, product_id, region, sales_amount, sales_date)
VALUES
  (1, 101, 'North', 1000.50, '2025-10-16'),
  (2, 102, 'South', 500.75, '2025-10-16'),
  (3, 103, 'East', 700.20, '2025-10-16'),
  (4, 101, 'West', 1200.00, '2025-10-16');

In [0]:
%sql
-- Second batch: four sales rows dated 2025-10-17.
INSERT INTO inceptez_catalog.inputdb.sales_data
  (sales_id, product_id, region, sales_amount, sales_date)
VALUES
  (5, 102, 'North', 300.00, '2025-10-17'),
  (6, 103, 'South', 450.00, '2025-10-17'),
  (7, 101, 'East', 800.00, '2025-10-17'),
  (8, 104, 'West', 950.00, '2025-10-17');

In [0]:
%sql
-- Show table-level details for sales_data; use the output to inspect
-- its data files and clustering information.
DESCRIBE DETAIL inceptez_catalog.inputdb.sales_data;

In [0]:
%sql
-- Read only the rows for the 'North' region.
SELECT *
FROM inceptez_catalog.inputdb.sales_data
WHERE region = 'North';

In [0]:
%sql
-- Show the table details again, to compare with the earlier DESCRIBE DETAIL output.
DESCRIBE DETAIL inceptez_catalog.inputdb.sales_data;

In [0]:
%sql
-- Remove the row with sales_id 8.
DELETE FROM inceptez_catalog.inputdb.sales_data
WHERE sales_id = 8;

In [0]:
%sql
-- List the commit history of sales_data (the operations run against the table so far).
DESCRIBE HISTORY inceptez_catalog.inputdb.sales_data;

In [0]:
%sql
-- Set sales_amount to 1000 for sales_id 1, then show the whole table.
UPDATE inceptez_catalog.inputdb.sales_data
SET sales_amount = 1000
WHERE sales_id = 1;

SELECT *
FROM inceptez_catalog.inputdb.sales_data;