In [2]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

builder = SparkSession.builder \
    .master("local[*]") \
    .appName("DeltaLakeExample") \
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
        .config("spark.jars.packages", "io.delta:delta-spark_2.13:4.0.1") 

spark = configure_spark_with_delta_pip(builder).getOrCreate()

print("Spark session created successfully!")
print(f"Spark version: {spark.version}")

Spark session created successfully!
Spark version: 4.0.0


In [3]:
df = spark.createDataFrame([(1, "Starter"), (2, "Pro")], ["id", "plan"])
df.write.format("delta").mode("overwrite").save("/tmp/delta-tables/table2")
print("Delta table created successfully!")

                                                                                

Delta table created successfully!


In [4]:
# 1. Read the Delta table we created
print("=== Reading Delta Table ===")
delta_df = spark.read.format("delta").load("/tmp/delta-tables/table2")
delta_df.show()

=== Reading Delta Table ===


26/02/03 20:24:51 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+---+-------+
| id|   plan|
+---+-------+
|  1|Starter|
|  2|    Pro|
+---+-------+



In [5]:
# 2. Check table schema and metadata
print("\n=== Table Schema ===")
delta_df.printSchema()
print("\n=== Table Count ===")
print(f"Number of records: {delta_df.count()}")


=== Table Schema ===
root
 |-- id: long (nullable = true)
 |-- plan: string (nullable = true)


=== Table Count ===
Number of records: 2


In [6]:
# 4. Add more data (Append mode)
print("\n=== Appending New Data ===")
new_data = spark.createDataFrame([(3, "Enterprise"), (4, "Basic")], ["id", "plan"])
new_data.write.format("delta").mode("append").save("/opt/spark/delta-tables/table2")

# Read updated table
updated_df = spark.read.format("delta").load("/opt/spark/delta-tables/table2")
print("Updated table:")
updated_df.show()


=== Appending New Data ===
Updated table:
+---+----------+
| id|      plan|
+---+----------+
|  3|Enterprise|
|  4|     Basic|
+---+----------+



In [9]:
from delta.tables import DeltaTable

deltaTable = DeltaTable.forPath(spark, "/opt/spark/delta-tables/table2")

# 5. Update existing records
print("\n=== Updating Records ===")
deltaTable.update(
    condition="id = 1",
    set={"plan": "'Starter Pro'"}
)

# Read after update
print("After update:")
spark.read.format("delta").load("/opt/spark/delta-tables/table2").show()


=== Updating Records ===
After update:
+---+----------+
| id|      plan|
+---+----------+
|  3|Enterprise|
|  4|     Basic|
+---+----------+



In [10]:
# 6. Delete records
print("\n=== Deleting Records ===")
deltaTable.delete("id = 4")

# Read after delete
print("After delete:")
spark.read.format("delta").load("/opt/spark/delta-tables/table2").show()


=== Deleting Records ===


26/02/03 20:26:02 WARN DeleteCommand: Could not validate number of records due to missing statistics.


After delete:
+---+----------+
| id|      plan|
+---+----------+
|  3|Enterprise|
+---+----------+



In [12]:
# 7. Upsert (Merge) operation
print("\n=== Upsert/Merge Operation ===")
merge_data = spark.createDataFrame([(2, "Pro Max"), (5, "Premium")], ["id", "plan"])

deltaTable.alias("target").merge(
    merge_data.alias("source"),
    "target.id = source.id"
).whenMatchedUpdate(set={"plan": "source.plan"}).whenNotMatchedInsert(values={"id": "source.id", "plan": "source.plan"}).execute()

print("After merge:")
spark.read.format("delta").load("/opt/spark/delta-tables/table2").show()

# 8. Time travel queries
print("\n=== Time Travel Queries ===")
# Read table at version 0
print("Version 0:")
spark.read.format("delta").option("versionAsOf", 0).load("/opt/spark/delta-tables/table2").show()


print("\n=== Final Table State ===")
spark.read.format("delta").load("/opt/spark/delta-tables/table2").show()
deltaTable.history().select("version", "timestamp", "operation").show()


=== Upsert/Merge Operation ===


26/02/03 20:26:54 WARN MapPartitionsRDD: RDD 215 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


After merge:
+---+----------+
| id|      plan|
+---+----------+
|  3|Enterprise|
|  2|   Pro Max|
|  5|   Premium|
+---+----------+


=== Time Travel Queries ===
Version 0:
+---+----------+
| id|      plan|
+---+----------+
|  3|Enterprise|
|  4|     Basic|
+---+----------+


=== Final Table State ===
+---+----------+
| id|      plan|
+---+----------+
|  3|Enterprise|
|  2|   Pro Max|
|  5|   Premium|
+---+----------+

+-------+--------------------+---------+
|version|           timestamp|operation|
+-------+--------------------+---------+
|      3|2026-02-03 20:26:...|    MERGE|
|      2|2026-02-03 20:25:...|    MERGE|
|      1|2026-02-03 20:25:...|   DELETE|
|      0|2026-02-03 20:24:...|    WRITE|
+-------+--------------------+---------+

