## Write to and read from a Delta Lake table

### Write a Spark DataFrame to a Delta Lake table

In [1]:
from pyspark.sql import SparkSession
import subprocess

spark = SparkSession.builder \
    .appName("DeltaLakeDemo") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

delta_path = "/opt/spark/delta/"

:: loading settings :: url = jar:file:/opt/spark-4.0.1-bin-hadoop3/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/spark/.ivy2.5.2/cache
The jars for the packages stored in: /home/spark/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
org.apache.spark#spark-connect_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4ad4336d-30f6-4876-a9c3-e72ea8f5b17f;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.1 in central
	found io.delta#delta-storage;4.0.1 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
	found org.apache.spark#spark-connect_2.13;4.1.1 in central
	found org.apache.spark#spark-pipelines_2.13;4.1.1 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.2.0 in central
	found jakarta.servlet#jakarta.servlet-api;5.0.0 in central
	found javax.servlet#javax.servlet-api;4.0.1 in central
	found com.google.guava#guava;33.4.8-jre in central
	fo

In [2]:
import os,shutil

if os.path.exists(delta_path+"table1"):
    shutil.rmtree(delta_path+"table1")

In [3]:
data = spark.range(0, 5)

(data
  .write
  .format("delta")
  .save(delta_path + "table1")
)

                                                                                

### Read the above Delta Lake table to a Spark DataFrame and display the DataFrame

In [4]:
df = (spark
        .read
        .format("delta")
        .load(delta_path + "table1")
        .orderBy("id")
      )

df.show()

26/02/18 09:21:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



### Overwrite a Delta Lake table

In [5]:
data = spark.range(5, 10)

# Create a temporary view from the DataFrame to use in SQL queries
# This allows us to reference the data in the MERGE statement using the alias 'data_view'
view_data = data.createOrReplaceTempView("data_view")
table2 = delta_path + "table1"
spark.sql(f"""
  MERGE INTO delta.`{table2}` AS target
  USING data_view AS source
  ON target.id = source.id
  WHEN NOT MATCHED THEN
    INSERT *
""")

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [6]:
from delta.tables import DeltaTable
from pyspark.sql.functions import col

delta_table = DeltaTable.forPath(spark, delta_path + "table1")

delta_table.alias("target") \
    .merge(
        data.alias("source"),
        "target.id = source.id"
    ) \
    .whenNotMatchedInsert(values={"id": col("source.id")}) \
    .execute()

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [7]:
df = (spark
        .read
        .format("delta")
        .load(delta_path + "table1")
        .orderBy("id")
      )

df.show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



### Update Delta Lake Table (Delta Lake Transactions)

In [8]:
from delta.tables import *
from pyspark.sql.functions import *


delta_table = DeltaTable.forPath(spark, delta_path + "table1")

spark.sql(f"""
  UPDATE delta.`{delta_path}table1`
  SET id = id + 100
  WHERE id % 2 == 0
""")


26/02/18 09:22:35 WARN UpdateCommand: Could not validate number of records due to missing statistics.


DataFrame[num_affected_rows: bigint]

In [9]:
# Create multiple modifications to demonstrate vacuum effects
def modify_delta_table(show_history=False):
# Modification 1: Update some records
    spark.sql(f"""
        UPDATE delta.`{delta_path}table1`
        SET id = id + 10
        WHERE id < 105
    """)

    # Modification 2: Delete some records
    spark.sql(f"""
        DELETE FROM delta.`{delta_path}table1`
        WHERE id > 110
    """)

    # Modification 3: Insert new records
    spark.sql(f"""
        INSERT INTO delta.`{delta_path}table1`
        VALUES (200), (201), (202)
    """)

    # Modification 4: Another update
    spark.sql(f"""
        UPDATE delta.`{delta_path}table1`
        SET id = id - 5
        WHERE id >= 200
    """)

    # Show current state
    print("Current table state:")
    delta_table.toDF().orderBy("id").show()

    if show_history:
        # Show table history to see all modifications
        print("\nTable history (showing version accumulation):")
        delta_table.history().select("version", "timestamp", "operation").show(truncate=False)

In [12]:
modify_delta_table(False)

# default retention is 7 days, but we can set it to 0 hours for testing
# RETAIN 0 HOURS option will fail if the following setting is not disabled:
# spark.databricks.delta.retentionDurationCheck.enabled = false

spark.sql(f"""
    VACUUM delta.`{delta_path}table1` 
""")

26/02/18 09:28:36 WARN UpdateCommand: Could not validate number of records due to missing statistics.
26/02/18 09:28:40 WARN DeleteCommand: Could not validate number of records due to missing statistics.
26/02/18 09:28:46 WARN UpdateCommand: Could not validate number of records due to missing statistics.


Current table state:
+---+
| id|
+---+
| 31|
| 33|
| 35|
| 37|
| 39|
|106|
|108|
|110|
|195|
|196|
|197|
+---+



                                                                                

Deleted 0 files and directories in a total of 1 directories.


DataFrame[path: string]

In [13]:
modify_delta_table(True)
result = spark.sql(f"""
        OPTIMIZE delta.`{delta_path}table1`
""")


26/02/18 09:32:57 WARN UpdateCommand: Could not validate number of records due to missing statistics.
26/02/18 09:33:02 WARN DeleteCommand: Could not validate number of records due to missing statistics.
26/02/18 09:33:13 WARN UpdateCommand: Could not validate number of records due to missing statistics.


Current table state:
+---+
| id|
+---+
| 41|
| 43|
| 45|
| 47|
| 49|
|106|
|108|
|110|
|195|
|196|
|197|
+---+


Table history (showing version accumulation):
+-------+-----------------------+------------+
|version|timestamp              |operation   |
+-------+-----------------------+------------+
|22     |2026-02-18 09:33:13.975|UPDATE      |
|21     |2026-02-18 09:33:09.282|WRITE       |
|20     |2026-02-18 09:33:02.564|DELETE      |
|19     |2026-02-18 09:32:57.746|UPDATE      |
|18     |2026-02-18 09:29:08.46 |VACUUM END  |
|17     |2026-02-18 09:29:06.181|VACUUM START|
|16     |2026-02-18 09:28:46.875|UPDATE      |
|15     |2026-02-18 09:28:43.024|WRITE       |
|14     |2026-02-18 09:28:40.24 |DELETE      |
|13     |2026-02-18 09:28:36.468|UPDATE      |
|12     |2026-02-18 09:25:24.526|UPDATE      |
|11     |2026-02-18 09:25:20.559|WRITE       |
|10     |2026-02-18 09:25:15.062|DELETE      |
|9      |2026-02-18 09:25:11.783|UPDATE      |
|8      |2026-02-18 09:23:30.841|VACUUM EN

                                                                                

In [14]:
result.show(truncate=False)

+----------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|path                        |metrics                                                                                                                                            |
+----------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+
|file:/opt/spark/delta/table1|{1, 11, {547, 547, 547.0, 1, 547}, {494, 494, 494.0, 11, 5434}, 1, NULL, NULL, 1, 1, 11, 0, false, 0, 0, 1771407205716, 0, 32, 0, NULL, NULL, 1, 1}|
+----------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------+



In [15]:
# Extract and display the optimization metrics in a readable format
metrics = result.select("metrics").collect()[0][0]

print("=" * 80)
print("OPTIMIZE Command Metrics Explanation")
print("=" * 80)

print("\n1. FILES ADDED:")
print(f"   - Number of files added: {metrics['numFilesAdded']}")
print(f"   - Min size: {metrics['filesAdded']['min']} bytes")
print(f"   - Max size: {metrics['filesAdded']['max']} bytes")
print(f"   - Avg size: {metrics['filesAdded']['avg']:.2f} bytes")
print(f"   - Total files: {metrics['filesAdded']['totalFiles']}")
print(f"   - Total size: {metrics['filesAdded']['totalSize']} bytes")

print("\n2. FILES REMOVED:")
print(f"   - Number of files removed: {metrics['numFilesRemoved']}")
print(f"   - Min size: {metrics['filesRemoved']['min']} bytes")
print(f"   - Max size: {metrics['filesRemoved']['max']} bytes")
print(f"   - Avg size: {metrics['filesRemoved']['avg']:.2f} bytes")
print(f"   - Total files: {metrics['filesRemoved']['totalFiles']}")
print(f"   - Total size: {metrics['filesRemoved']['totalSize']} bytes")

print("\n3. PARTITIONS & COLUMNS:")
print(f"   - Partitions optimized: {metrics['partitionsOptimized']}")
print(f"   - Number of table columns: {metrics['numTableColumns']}")
print(f"   - Columns with stats: {metrics['numTableColumnsWithStats']}")

print("\n4. FILES & DATA:")
print(f"   - Total files considered: {metrics['totalConsideredFiles']}")
print(f"   - Total files skipped: {metrics['totalFilesSkipped']}")
print(f"   - Files skipped to reduce write amplification: {metrics['numFilesSkippedToReduceWriteAmplification']}")
print(f"   - Bytes skipped to reduce write amplification: {metrics['numBytesSkippedToReduceWriteAmplification']}")
print(f"   - Number of bins: {metrics['numBins']}")
print(f"   - Number of batches: {metrics['numBatches']}")
print(f"   - Preserve insertion order: {metrics['preserveInsertionOrder']}")

print("\n5. TIMING:")
print(f"   - Start time (ms): {metrics['startTimeMs']}")
print(f"   - End time (ms): {metrics['endTimeMs']}")
print(f"   - Duration (ms): {metrics['endTimeMs'] - metrics['startTimeMs']}")

print("\n6. PARALLELISM:")
print(f"   - Total cluster parallelism: {metrics['totalClusterParallelism']}")
print(f"   - Total scheduled tasks: {metrics['totalScheduledTasks']}")

OPTIMIZE Command Metrics Explanation

1. FILES ADDED:
   - Number of files added: 1
   - Min size: 547 bytes
   - Max size: 547 bytes
   - Avg size: 547.00 bytes
   - Total files: 1
   - Total size: 547 bytes

2. FILES REMOVED:
   - Number of files removed: 11
   - Min size: 494 bytes
   - Max size: 494 bytes
   - Avg size: 494.00 bytes
   - Total files: 11
   - Total size: 5434 bytes

3. PARTITIONS & COLUMNS:
   - Partitions optimized: 1
   - Number of table columns: 1
   - Columns with stats: 1

4. FILES & DATA:
   - Total files considered: 11
   - Total files skipped: 0
   - Files skipped to reduce write amplification: 0
   - Bytes skipped to reduce write amplification: 0
   - Number of bins: 1
   - Number of batches: 1
   - Preserve insertion order: False

5. TIMING:
   - Start time (ms): 1771407205716
   - End time (ms): 0
   - Duration (ms): -1771407205716

6. PARALLELISM:
   - Total cluster parallelism: 32
   - Total scheduled tasks: 0


###  `delete`

In [16]:
# Delete every even value
(delta_table
  .delete(
    condition = expr("id % 2 == 0")
  )
)

(delta_table
  .toDF()
  .orderBy("id")
  .show()
)

26/02/18 09:38:16 WARN DeleteCommand: Could not validate number of records due to missing statistics.


+---+
| id|
+---+
| 41|
| 43|
| 45|
| 47|
| 49|
|195|
|197|
+---+



### `merge` Delta Lake Table

In [17]:
# Upsert (merge) new data
new_data = spark.range(0, 20)

(delta_table.alias("old_data")
  .merge(
      new_data.alias("new_data"),
      "old_data.id = new_data.id"
      )
  .whenMatchedUpdate(set = { "id": col("new_data.id") })
  .whenNotMatchedInsert(values = { "id": col("new_data.id") })
  .execute()
)

(delta_table
  .toDF()
  .orderBy("id")
  .show(5)
)

26/02/18 09:38:30 WARN MapPartitionsRDD: RDD 984 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows


### Time Travel: Display the entire history of the above Delta Lake table

In [18]:
# get the full history of the table
delta_table_history = (DeltaTable
                        .forPath(spark, f"{delta_path}table1")
                        .history()
                      )

(delta_table_history
   .select("version", "timestamp", "operation", "operationParameters", "operationMetrics", "engineInfo")
   .show()
)

+-------+--------------------+------------+--------------------+--------------------+--------------------+
|version|           timestamp|   operation| operationParameters|    operationMetrics|          engineInfo|
+-------+--------------------+------------+--------------------+--------------------+--------------------+
|     25|2026-02-18 09:38:...|       MERGE|{predicate -> ["(...|{numTargetRowsCop...|Apache-Spark/4.0....|
|     24|2026-02-18 09:38:...|      DELETE|{predicate -> ["(...|{numRemovedFiles ...|Apache-Spark/4.0....|
|     23|2026-02-18 09:33:...|    OPTIMIZE|{predicate -> [],...|{numRemovedFiles ...|Apache-Spark/4.0....|
|     22|2026-02-18 09:33:...|      UPDATE|{predicate -> ["(...|{numRemovedFiles ...|Apache-Spark/4.0....|
|     21|2026-02-18 09:33:...|       WRITE|{mode -> Append, ...|{numFiles -> 3, n...|Apache-Spark/4.0....|
|     20|2026-02-18 09:33:...|      DELETE|{predicate -> ["(...|{numRemovedFiles ...|Apache-Spark/4.0....|
|     19|2026-02-18 09:32:...|      U

### Latest version of the Delta Lake table

In [47]:
# get the full history of the table
delta_table_history = (DeltaTable
                        .forPath(spark, f"{delta_path}table1")
                        .history()
                      )

(delta_table_history
   .select("version", "timestamp", "operation", "operationParameters", "operationMetrics", "engineInfo")
   .show()
)

+-------+--------------------+-----------------+--------------------+--------------------+--------------------+
|version|           timestamp|        operation| operationParameters|    operationMetrics|          engineInfo|
+-------+--------------------+-----------------+--------------------+--------------------+--------------------+
|     49|2026-02-18 06:44:...|            MERGE|{predicate -> ["(...|{numTargetRowsCop...|Apache-Spark/4.0....|
|     48|2026-02-18 06:44:...|            MERGE|{predicate -> ["(...|{numTargetRowsCop...|Apache-Spark/4.0....|
|     47|2026-02-18 06:44:...|           DELETE|{predicate -> ["(...|{numRemovedFiles ...|Apache-Spark/4.0....|
|     46|2026-02-18 06:43:...|SET TBLPROPERTIES|{properties -> {"...|                  {}|Apache-Spark/4.0....|
|     45|2026-02-18 06:41:...|         OPTIMIZE|{predicate -> [],...|{numRemovedFiles ...|Apache-Spark/4.0....|
|     44|2026-02-18 06:41:...|           UPDATE|{predicate -> ["(...|{numRemovedFiles ...|Apache-Spark/4

### Latest version of the Delta Lake table

In [49]:
df = (spark
        .read
        .format("delta")
        .load(f"{delta_path}table1")
        .orderBy("id")
      )

df.show(5)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+
only showing top 5 rows


### Time travel to the version `0` of the Delta Lake table using Delta Lake's history feature

In [19]:
df = (spark
        .read
        .format("delta")
        .option("versionAsOf", 0) # we pass an option `versionAsOf` with the required version number we are interested in
        .load(f"{delta_path}table1")
        .orderBy("id")
      )

df.show(5)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



### Time travel to the version `3` of the Delta Lake table using Delta Lake's  history feature

In [20]:
df = (spark
        .read
        .format("delta")
        .option("versionAsOf", 3) # we pass an option `versionAsOf` with the required version number we are interested in
        .load(f"{delta_path}table1")
        .orderBy("id")
      )

df.show(5)

+---+
| id|
+---+
| 11|
| 13|
| 15|
| 17|
| 19|
+---+
only showing top 5 rows
