In [None]:
pip install delta-spark==4.0.0


In [1]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Delta Lake settings the session is created with: the SQL extension and the
# Delta-aware implementation of the default catalog.
delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Single-threaded local session named "DeltaDemo".
builder = SparkSession.builder.appName("DeltaDemo").master("local[1]")
for setting_name, setting_value in delta_settings.items():
    builder = builder.config(setting_name, setting_value)

# Let the delta-spark helper finish configuring the builder, then start
# (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()



In [3]:
from pyspark.sql import Row

# Seed rows for the employee table, written as plain tuples and wrapped in
# Row objects so the DataFrame columns are named id / name / salary.
employee_fields = [
    (1, "Ravi", 50000.0),
    (2, "Priya", 60000.0),
    (3, "John", 45000.0),
]
data = [
    Row(id=emp_id, name=emp_name, salary=emp_salary)
    for emp_id, emp_name, emp_salary in employee_fields
]

df = spark.createDataFrame(data)

# Location of the Delta table that the other cells read from and write to.
delta_path = "file:///c:/data/delta/emp"

# Overwrite mode: store these rows as the Delta table at delta_path.
delta_writer = df.write.format("delta").mode("overwrite")
delta_writer.save(delta_path)


In [4]:
# Load the Delta table stored at delta_path and print its rows.
df2 = (
    spark.read
    .format("delta")
    .load(delta_path)
)
df2.show()


+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  1| Ravi|50000.0|
|  2|Priya|60000.0|
|  3| John|45000.0|
+---+-----+-------+



In [5]:
# Two more employees, given as tuples in the column order listed in `cols`.
cols = ["id", "name", "salary"]
new_data = [
    (4, "Anita", 52000.0),
    (5, "Kumar", 62000.0),
]
new_df = spark.createDataFrame(new_data, schema=cols)

# Append mode: add these rows to the Delta table at delta_path.
append_writer = new_df.write.format("delta").mode("append")
append_writer.save(delta_path)


In [6]:
# Re-read the Delta table and print its current rows.
df2 = spark.read.load(delta_path, format="delta")
df2.show()


+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  1| Ravi|50000.0|
|  2|Priya|60000.0|
|  3| John|45000.0|
|  4|Anita|52000.0|
|  5|Kumar|62000.0|
+---+-----+-------+



In [31]:
# Time travel: read the table as of commit version 3.
# (The variable is called df_v1, but the version requested is 3.)
versioned_reader = spark.read.format("delta").option("versionAsOf", 3)
df_v1 = versioned_reader.load(delta_path)
df_v1.show()

+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  4|Anita|52000.0|
|  5|Kumar|62000.0|
|  2|Priya|60000.0|
|  3| John|45000.0|
+---+-----+-------+



In [25]:
from delta.tables import DeltaTable

# Open the Delta table at delta_path and print selected columns of its
# commit history without truncating long values.
dt = DeltaTable.forPath(spark, delta_path)

history_columns = ["version", "timestamp", "operation", "operationParameters"]
dt.history().select(*history_columns).show(truncate=False)

+-------+-----------------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|version|timestamp              |operation|operationParameters                                                                                                                                                            |
+-------+-----------------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|4      |2025-12-05 12:33:06.947|MERGE    |{predicate -> ["(id#3887L = id#8464L)"], matchedPredicates -> [{"actionType":"update"}], notMatchedPredicates -> [{"actionType":"insert"}], notMatchedBySourcePredicates -> []}|
|3      |2025-12-05 12:25:35.677|DELETE   |{predicate -> ["(id#3887L = 1)"]}                                            

In [32]:
from delta.tables import DeltaTable
from pyspark.sql.functions import *
# Handle on the Delta table; later cells reuse it for delete and merge.
delta_table = DeltaTable.forPath(spark, delta_path)

# For rows matching the condition, set salary to the expression "salary +1000".
salary_assignment = {"salary": "salary +1000"}
delta_table.update(condition="id = 1 ", set=salary_assignment)

{"ts": "2025-12-05 12:44:40.691", "level": "ERROR", "logger": "SQLQueryContextLogger", "msg": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `abc` cannot be resolved. Did you mean one of the following? [`id`, `name`, `salary`]. SQLSTATE: 42703", "context": {"errorClass": "UNRESOLVED_COLUMN.WITH_SUGGESTION"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o313.update.\n: org.apache.spark.sql.AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `abc` cannot be resolved. Did you mean one of the following? [`id`, `name`, `salary`]. SQLSTATE: 42703; line 1 pos 0;\n'DeltaUpdateTable [salary#12625], ['abc], (id#12623L = cast(1 as bigint))\n+- Relation [id#12623L,name#12624,salary#12625] parquet\n\r\n\tat org.apache.spark.sql.errors.QueryCompilationErrors$.unresolvedAttributeError(QueryCompilationErrors.scala:401)\r\n\tat org.apache.spark.sql.catalyst.analysis.CheckA

AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column, variable, or function parameter with name `abc` cannot be resolved. Did you mean one of the following? [`id`, `name`, `salary`]. SQLSTATE: 42703; line 1 pos 0;
'DeltaUpdateTable [salary#12625], ['abc], (id#12623L = cast(1 as bigint))
+- Relation [id#12623L,name#12624,salary#12625] parquet


In [21]:
# Reload the Delta table and print its current rows.
delta_reader = spark.read.format("delta")
df2 = delta_reader.load(delta_path)
df2.show()

+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  4|Anita|52000.0|
|  5|Kumar|62000.0|
|  2|Priya|60000.0|
|  3| John|45000.0|
+---+-----+-------+



In [16]:
# Delete the rows of the Delta table that satisfy the predicate.
delta_table.delete(condition="id = 1")



In [19]:
# Read the table again and print what it currently holds.
df2 = (
    spark.read
    .format("delta")
    .load(delta_path)
)
df2.show()

+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  4|Anita|52000.0|
|  5|Kumar|62000.0|
|  2|Priya|60000.0|
|  3| John|45000.0|
+---+-----+-------+



In [27]:
# Source rows for the merge below, as tuples in the column order of `cols`.
cols = ["id", "name", "salary"]
changes_data = [
    (1, "Ravi", 55000.0),
    (3, "John", 58000.0),
    (7, "Meena", 63000.0),
]
changes_df = spark.createDataFrame(changes_data, schema=cols)
changes_df.show()

+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  1| Ravi|55000.0|
|  3| John|58000.0|
|  7|Meena|63000.0|
+---+-----+-------+



In [23]:
# Upsert changes_df (alias "s") into the Delta table (alias "t"), joining on id.
matched_assignments = {
    "name": "s.name",
    "salary": "s.salary",
}
inserted_values = {
    "id": "s.id",
    "name": "s.name",
    "salary": "s.salary",
}

upsert = delta_table.alias("t").merge(
    source=changes_df.alias("s"),
    condition="t.id = s.id",
)
# Matched target rows take name and salary from the source row.
upsert = upsert.whenMatchedUpdate(set=matched_assignments)
# Source rows with no match in the target are inserted.
upsert = upsert.whenNotMatchedInsert(values=inserted_values)

# Kept as the cell's final expression so the notebook shows what execute() returns.
upsert.execute()


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [24]:
# Read the table after the merge and print its rows.
df2 = spark.read.load(delta_path, format="delta")
df2.show()

+---+-----+-------+
| id| name| salary|
+---+-----+-------+
|  1| Ravi|55000.0|
|  2|Priya|60000.0|
|  3| John|58000.0|
|  7|Meena|63000.0|
|  4|Anita|52000.0|
|  5|Kumar|62000.0|
+---+-----+-------+



In [36]:
from delta.tables import DeltaTable

# Re-open the Delta table and list its commit history with untruncated
# operation parameters.
dt = DeltaTable.forPath(spark, delta_path)

history_df = dt.history()
history_view = history_df.select(
    "version", "timestamp", "operation", "operationParameters"
)
history_view.show(truncate=False)


+-------+-----------------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|version|timestamp              |operation|operationParameters                                                                                                                                                            |
+-------+-----------------------+---------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|5      |2025-12-04 19:51:05.324|MERGE    |{predicate -> ["(id#3192L = id#5308L)"], matchedPredicates -> [{"actionType":"update"}], notMatchedPredicates -> [{"actionType":"insert"}], notMatchedBySourcePredicates -> []}|
|4      |2025-12-04 19:41:00.409|DELETE   |{predicate -> ["(id#3192L = 1)"]}                                            

In [31]:
# Time travel: load the snapshot of the table at commit version 3
# (the df_v1 name notwithstanding) and print it.
df_v1 = (
    spark.read
    .format("delta")
    .option("versionAsOf", 3)
    .load(delta_path)
)
df_v1.show()


+---+-----+------------------+
| id| name|            salary|
+---+-----+------------------+
|  1| Ravi|60500.000000000015|
|  2|Priya|           60000.0|
|  3| John|           45000.0|
|  4|Anita|           52000.0|
|  5|Kumar|           62000.0|
+---+-----+------------------+

