In [None]:
import warnings
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql import Window
from delta.tables import DeltaTable

# Suppress FutureWarning messages so they do not clutter cell output.
warnings.filterwarnings("ignore", category=FutureWarning)

# Build (or reuse) a Spark session configured for Delta Lake: the Delta
# package coordinate, the Delta SQL extension and the Delta catalog are each
# passed as a builder config entry.
_delta_settings = (
    ("spark.jars.packages", "io.delta:delta-core_2.12:2.3.0"),
    ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
    ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
)
_session_builder = SparkSession.builder.appName("DeltaSession")
for _conf_key, _conf_value in _delta_settings:
    _session_builder = _session_builder.config(_conf_key, _conf_value)
spark = _session_builder.getOrCreate()

## New Table

In [None]:
# DDL for the first demo table. Change Data Feed is enabled at creation time
# through the delta.enableChangeDataFeed table property.
create_cdf_table_sql = """
CREATE TABLE IF NOT EXISTS 05changedatafeed (
  id INT,
  name STRING,
  age INT,
  city STRING
  )
  USING DELTA
  TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(create_cdf_table_sql)

In [None]:
# Insert a single row into the CDF-enabled table.
insert_row_sql = """
INSERT INTO 05changedatafeed (id, name, age, city)
VALUES (5, 'Mateus', 5, 'Sao Paulo')
"""
spark.sql(insert_row_sql)

In [None]:
# Show the current contents of the table as a pandas frame.
current_rows = spark.sql("SELECT * FROM 05changedatafeed")
current_rows.toPandas()

In [None]:
# Show the commit history of the table as a pandas frame.
table_history = spark.sql("DESCRIBE HISTORY 05changedatafeed")
table_history.toPandas()

## Existing Table

In [None]:
# Column layout for the sample rows: four nullable fields.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("city", StringType(), True)
)

# Sample rows, one tuple per record in schema order.
data = [
    (1, "Alice", 25, "New York"),
    (2, "Bob", 30, "San Francisco"),
    (3, "Charlie", 35, "Chicago"),
]

df = spark.createDataFrame(data, schema)

# Render the DataFrame as a pandas frame.
df.toPandas()

In [None]:
# Persist the sample rows as a managed Delta table.
# saveAsTable defaults to the "error if exists" save mode, so without an
# explicit mode this cell fails on any re-run against a warehouse that
# already contains the table. Overwrite mode keeps the cell re-runnable.
df.write.format("delta").mode("overwrite").saveAsTable("05changedatafeed2")

In [None]:
# Enable Change Data Feed on an already existing table by setting the
# delta.enableChangeDataFeed table property.
enable_cdf_sql = """
ALTER TABLE 05changedatafeed2 
SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(enable_cdf_sql)

In [None]:
# Show the commit history of the second table as a pandas frame.
table2_history = spark.sql("DESCRIBE HISTORY 05changedatafeed2")
table2_history.toPandas()

## All new tables

In [None]:
# Session-level setting (Python API): sets the enableChangeDataFeed default
# table property to "true" for this Spark session.
cdf_default_key = "spark.databricks.delta.properties.defaults.enableChangeDataFeed"
spark.conf.set(cdf_default_key, "true")

In [None]:
# Column layout for the sample rows: four nullable fields.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("city", StringType(), True)
)

# Sample rows, one tuple per record in schema order.
data = [
    (1, "Alice", 25, "New York"),
    (2, "Bob", 30, "San Francisco"),
    (3, "Charlie", 35, "Chicago"),
]

df = spark.createDataFrame(data, schema)

# Render the DataFrame as a pandas frame.
df.toPandas()

In [None]:
# Persist the sample rows as a managed Delta table.
# saveAsTable defaults to the "error if exists" save mode, so without an
# explicit mode this cell fails on any re-run against a warehouse that
# already contains the table. Overwrite mode keeps the cell re-runnable.
df.write.format("delta").mode("overwrite").saveAsTable("05changedatafeed3")

In [None]:
# Look up the third table through the DeltaTable API and display its rows.
dt = DeltaTable.forName(spark, "05changedatafeed3")

table3_rows = dt.toDF()
table3_rows.toPandas()

In [None]:
# Commit history of the table held in `dt`, rendered as a pandas frame.
table3_history = dt.history()
table3_history.toPandas()

## readChangeFeed

In [None]:
# List the tables visible in the current database.
known_tables = spark.sql("SHOW TABLES")
known_tables.toPandas()

In [None]:
# Column layout for the rows to append: four nullable fields.
schema = (
    StructType()
    .add("id", IntegerType(), True)
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("city", StringType(), True)
)

# Rows to append; the second record has no name (None).
data = [
    (5, "Anderson", 32, "Tokio"),
    (6, None, 35, "Chicago"),
]

df = spark.createDataFrame(data, schema)

df.toPandas()

In [None]:
# Append the rows in `df` to the first table.
append_writer = df.write.mode("append").format("delta")
append_writer.saveAsTable("05changedatafeed")

In [None]:
# Batch-read the table's change feed starting at version 0.
cdf_reader = spark.read.format("delta")
cdf_reader = cdf_reader.option("readChangeFeed", "true")
cdf_reader = cdf_reader.option("startingVersion", 0)
cdf_reader.table("05changedatafeed").toPandas()

In [None]:
# Append the same rows in `df` to the first table once more.
append_writer = df.write.mode("append").format("delta")
append_writer.saveAsTable("05changedatafeed")

In [None]:
# Batch-read the change feed starting at version 1, sorted by commit version.
cdf_reader = spark.read.format("delta")
cdf_reader = cdf_reader.option("readChangeFeed", "true")
cdf_reader = cdf_reader.option("startingVersion", 1)
changes_from_v1 = cdf_reader.table("05changedatafeed")
changes_from_v1.orderBy("_commit_version").toPandas()

In [None]:
# SQL form of the change feed read: table_changes with a starting version of 2.
changes_from_v2 = spark.sql("SELECT * FROM table_changes('05changedatafeed', 2)")
changes_from_v2.toPandas()

In [None]:
# Batch-read the change feed bounded on both sides: versions 2 through 3.
cdf_reader = spark.read.format("delta")
cdf_reader = cdf_reader.option("readChangeFeed", "true")
cdf_reader = cdf_reader.option("startingVersion", 2)
cdf_reader = cdf_reader.option("endingVersion", 3)
cdf_reader.table("05changedatafeed").toPandas()

In [None]:
# SQL form with both bounds: table_changes from version 1 to version 2.
changes_v1_to_v2 = spark.sql("SELECT * FROM table_changes('05changedatafeed', 1, 2)")
changes_v1_to_v2.toPandas()

In [None]:
# Delete every row whose age equals 5 from the first table.
dt = DeltaTable.forName(spark, "05changedatafeed")

age_is_five = F.col("age") == 5
dt.delete(age_is_five)

In [None]:
# Change feed reads can be bounded by commit timestamps instead of versions.
# The previous hard-coded wall-clock values only matched the commits of one
# particular run; on any other run they fall outside the table's commit range
# and the read does not reproduce. The bounds are therefore taken from the
# table's own history (first and last commit), formatted by Spark so they are
# rendered in the session time zone.
# NOTE(review): this covers the whole history; narrow the window if a
# specific commit range is the point of the demo.
ts_format = "yyyy-MM-dd HH:mm:ss.SSS"
commit_window = (
    spark.sql("DESCRIBE HISTORY 05changedatafeed")
    .agg(
        F.date_format(F.min("timestamp"), ts_format).alias("start_ts"),
        F.date_format(F.max("timestamp"), ts_format).alias("end_ts"),
    )
    .first()
)

(spark.read
    .format("delta")
    .option("readChangeFeed", "true")
    .option("startingTimestamp", commit_window["start_ts"])
    .option("endingTimestamp", commit_window["end_ts"])
    .table("05changedatafeed")
).toPandas()

In [None]:
# Append the rows in `df` to the second table.
append_writer = df.write.mode("append").format("delta")
append_writer.saveAsTable("05changedatafeed2")

In [None]:
# Change feed of the second table, starting at version 1.
table2_changes = spark.sql("SELECT * FROM table_changes('05changedatafeed2', 1)")
table2_changes.toPandas()

In [None]:
# Append the rows in `df` to the third table.
append_writer = df.write.mode("append").format("delta")
append_writer.saveAsTable("05changedatafeed3")

In [None]:
# Change feed of the third table, starting at version 1.
table3_changes = spark.sql("SELECT * FROM table_changes('05changedatafeed3', 1)")
table3_changes.toPandas()

In [None]:
# Path-based access: read the change feed by loading the table's directory
# instead of resolving it by name. The path is relative, so it is resolved
# against the working directory of the Spark driver.
path_reader = spark.read.format("delta")
path_reader = path_reader.option("readChangeFeed", "true")
path_reader = path_reader.option("startingVersion", 0)
path_reader.load("spark-warehouse/05changedatafeed3").toPandas()

In [None]:
# SQL form of the path-based read: table_changes_by_path from version 0.
path_changes = spark.sql("SELECT * FROM table_changes_by_path('spark-warehouse/05changedatafeed3', 0)")
path_changes.toPandas()