In [1]:
import os
import shutil

from delta import DeltaTable
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit

# Delta Lake Change Data Capture (CDC) Demonstration
# This notebook demonstrates how to use Delta Lake's Change Data Feed feature
# to track and query changes over time.

# Create (or reuse) a SparkSession with Delta Lake support and CDC enabled.
# - spark.sql.extensions / spark_catalog register Delta's SQL extension and catalog.
# - The "properties.defaults" setting makes Change Data Feed the default table
#   property for Delta tables created by this session, so the change feed is
#   recorded even if a later write forgets to request it explicitly.
spark = (
    SparkSession.builder
    .appName("Delta Lake CDC Demo")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.databricks.delta.properties.defaults.enableChangeDataFeed", "true")
    .getOrCreate()
)

print("✓ Spark session created with Delta Lake support")

:: loading settings :: url = jar:file:/opt/spark-4.0.1-bin-hadoop3/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/spark/.ivy2.5.2/cache
The jars for the packages stored in: /home/spark/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
org.apache.spark#spark-connect_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-98cd512a-b31d-498f-b90f-72d477defaa0;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.1 in central
	found io.delta#delta-storage;4.0.1 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
	found org.apache.spark#spark-connect_2.13;4.1.1 in central
	found org.apache.spark#spark-pipelines_2.13;4.1.1 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.2.0 in central
	found jakarta.servlet#jakarta.servlet-api;5.0.0 in central
	found javax.servlet#javax.servlet-api;4.0.1 in central
	found com.google.guava#guava;33.4.8-jre in central
	fo

✓ Spark session created with Delta Lake support


In [2]:
# Set up a Delta table path
delta_path = os.path.join("/opt/spark/delta", "delta_cdc_demo")

# Clean up any existing data so every run of the notebook starts from an
# empty location and rebuilds the table (and its commit history) from scratch.
# NOTE(review): os.path/shutil operate on the local filesystem — this assumes
# delta_path is a local directory visible to the driver; confirm for clusters.
if os.path.exists(delta_path):
    shutil.rmtree(delta_path)

# Three seed rows: (id, name, dept)
df_cdc = spark.createDataFrame(
    [(1, "Alice", "HR"),
     (2, "Bob", "IT"),
     (3, "Carol", "Finance")],
    ["id", "name", "dept"]
)

# Write Delta table with Change Data Feed enabled.
# The "delta."-prefixed writer option requests the enableChangeDataFeed table
# property on the table created by this first write.
(df_cdc.write
 .format("delta")
 .option("delta.enableChangeDataFeed", "true")
 .mode("overwrite")
 .save(delta_path))

print("✓ Delta table created with CDC enabled")

                                                                                

✓ Delta table created with CDC enabled


In [3]:
# Apply one UPDATE and one DELETE to the table so the change feed has
# something to report beyond the initial inserts.
delta_table = DeltaTable.forPath(spark, delta_path)

# Update: the row with id 2 gets its dept replaced by "Security".
is_row_to_update = col("id") == 2
new_values = {"dept": lit("Security")}
delta_table.update(condition=is_row_to_update, set=new_values)

# Delete: the row with id 3 is removed from the table.
is_row_to_delete = col("id") == 3
delta_table.delete(condition=is_row_to_delete)

print("✓ Update and delete operations complete")

26/02/18 06:49:29 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
26/02/18 06:49:34 WARN UpdateCommand: Could not validate number of records due to missing statistics.
26/02/18 06:49:36 WARN DeleteCommand: Could not validate number of records due to missing statistics.


✓ Update and delete operations complete


In [4]:
# Read the table's change data feed instead of its current snapshot.
# startingVersion=0 asks for every recorded change from the first commit on.
change_feed_reader = (
    spark.read.format("delta")
    .option("readChangeFeed", "true")
    .option("startingVersion", 0)
)
cdc_df = change_feed_reader.load(delta_path)

# Print all change rows without truncating column values.
cdc_df.show(truncate=False)

26/02/18 06:49:37 WARN CaseInsensitiveStringMap: Converting duplicated key startingVersion into CaseInsensitiveStringMap.
26/02/18 06:49:37 WARN CaseInsensitiveStringMap: Converting duplicated key readchangefeed into CaseInsensitiveStringMap.


+---+-----+--------+----------------+---------------+-----------------------+
|id |name |dept    |_change_type    |_commit_version|_commit_timestamp      |
+---+-----+--------+----------------+---------------+-----------------------+
|2  |Bob  |IT      |update_preimage |1              |2026-02-18 06:49:34.33 |
|2  |Bob  |Security|update_postimage|1              |2026-02-18 06:49:34.33 |
|3  |Carol|Finance |delete          |2              |2026-02-18 06:49:36.583|
|3  |Carol|Finance |insert          |0              |2026-02-18 06:49:28.077|
|1  |Alice|HR      |insert          |0              |2026-02-18 06:49:28.077|
|2  |Bob  |IT      |insert          |0              |2026-02-18 06:49:28.077|
+---+-----+--------+----------------+---------------+-----------------------+



In [5]:
# Export the change feed rows as CSV under the Spark data directory.
spark_data = "/opt/spark/data"
cdc_output_path = os.path.join(spark_data, "cdc_csv")

# Configure the writer first (replace any previous export, include a header
# row), then write to the target path.
cdc_csv_writer = cdc_df.write.mode("overwrite").option("header", "true")
cdc_csv_writer.csv(cdc_output_path)

print(f"✓ CDC values written to CSV at: {cdc_output_path}")

✓ CDC values written to CSV at: /opt/spark/data/cdc_csv
