In [6]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_timestamp, lit
from delta import DeltaTable
import os

spark = SparkSession.builder \
    .appName("Delta Lake CDC Demo") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

print("✓ Spark session created with Delta Lake support")

✓ Spark session created with Delta Lake support


In [7]:
import shutil

# Set up a Delta table path
delta_path = os.path.join("/opt/spark/delta", "delta_cdc_demo")

# Clean up any existing data
if os.path.exists(delta_path):
    shutil.rmtree(delta_path)

df_cdc = spark.createDataFrame(
    [(1, "Alice", "HR"),
     (2, "Bob", "IT"),
     (3, "Carol", "Finance")],
    ["id", "name", "dept"]
)

# Write Delta table with Change Data Feed enabled
(df_cdc.write
 .format("delta")
 .option("delta.enableChangeDataFeed", "true")
 .mode("append")
 .save(delta_path))

print("✓ Delta table created with CDC enabled")

                                                                                

✓ Delta table created with CDC enabled


In [8]:
# Perform updates and deletes
delta_table = DeltaTable.forPath(spark, delta_path)

# Update
delta_table.update(
    condition=col("id") == 2,
    set={"dept": lit("Security")}
)

# Delete
delta_table.delete(condition=col("id") == 3)

print("✓ Update and delete operations complete")

26/02/18 09:41:33 WARN UpdateCommand: Could not validate number of records due to missing statistics.
26/02/18 09:41:35 WARN DeleteCommand: Could not validate number of records due to missing statistics.


✓ Update and delete operations complete


In [9]:
# Read change data feed
cdc_df = (spark.read
          .format("delta")
          .option("readChangeFeed", "true")
          .option("startingVersion", 0)
          .load(delta_path))

cdc_df.show(truncate=False)

26/02/18 09:41:58 WARN CaseInsensitiveStringMap: Converting duplicated key startingVersion into CaseInsensitiveStringMap.
26/02/18 09:41:58 WARN CaseInsensitiveStringMap: Converting duplicated key readchangefeed into CaseInsensitiveStringMap.


+---+-----+--------+----------------+---------------+-----------------------+
|id |name |dept    |_change_type    |_commit_version|_commit_timestamp      |
+---+-----+--------+----------------+---------------+-----------------------+
|2  |Bob  |IT      |update_preimage |1              |2026-02-18 09:41:33.216|
|2  |Bob  |Security|update_postimage|1              |2026-02-18 09:41:33.216|
|3  |Carol|Finance |delete          |2              |2026-02-18 09:41:35.427|
|3  |Carol|Finance |insert          |0              |2026-02-18 09:41:01.469|
|1  |Alice|HR      |insert          |0              |2026-02-18 09:41:01.469|
|2  |Bob  |IT      |insert          |0              |2026-02-18 09:41:01.469|
+---+-----+--------+----------------+---------------+-----------------------+



In [10]:
# Write CDC values to CSV
spark_data = "/opt/spark/data"
cdc_output_path = os.path.join(spark_data, "cdc_csv")

(cdc_df
 .write
 .mode("overwrite")
 .option("header", "true")
 .csv(cdc_output_path))

print(f"✓ CDC values written to CSV at: {cdc_output_path}")

✓ CDC values written to CSV at: /opt/spark/data/cdc_csv
