In [1]:
import sys, os
os.environ["PATH"] = os.path.dirname(sys.executable) + os.pathsep + os.environ["PATH"]

In [2]:
import os
os.environ["PYSPARK_PYTHON"] = r"C:\Users\krishna\anaconda3\envs\spark39\python.exe"
os.environ["PYSPARK_DRIVER_PYTHON"] = r"C:\Users\krishna\anaconda3\envs\spark39\python.exe"

In [3]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

In [4]:
builder = (
    SparkSession.builder.appName("DeltaApp")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

spark = configure_spark_with_delta_pip(builder).getOrCreate()

In [5]:
from delta.tables import *

In [6]:
DeltaTable.create(spark)\
.tableName("employee_demo")\
.addColumn("emp_id","LONG")\
.addColumn("ename","STRING")\
.addColumn("gender","STRING")\
.addColumn("salary","LONG")\
.addColumn("dept","LONG")\
.property("description","table created")\
.location(r"C:\Users\Krishna\data\empdata")\
.execute()

<delta.tables.DeltaTable at 0x2276dbdcd90>

In [7]:
data = [(100,'Ram','M',5000,10),(200,'Sia','F',4500,20)]

In [8]:
sch = ["emp_id","ename","gender","salary","dept"]

In [9]:
df= spark.createDataFrame(data=data,schema=sch)

In [10]:
df.show()

+------+-----+------+------+----+
|emp_id|ename|gender|salary|dept|
+------+-----+------+------+----+
|   100|  Ram|     M|  5000|  10|
|   200|  Sia|     F|  4500|  20|
+------+-----+------+------+----+



In [11]:
df.write.format('delta').mode("append").saveAsTable("employee_demo")

In [12]:
tb = DeltaTable.forName(spark,'employee_Demo')

In [14]:
tb1 = DeltaTable.forPath(spark,r'C:\Users\Krishna\data\empdata')

In [19]:
tb.toDF().show()

+------+-----+------+------+----+
|emp_id|ename|gender|salary|dept|
+------+-----+------+------+----+
|   200|  Sia|     F|  4500|  20|
|   100|  Ram|     M|  5000|  10|
+------+-----+------+------+----+



In [22]:
tb.delete('emp_id =100')

In [23]:
tb.toDF().show()

+------+-----+------+------+----+
|emp_id|ename|gender|salary|dept|
+------+-----+------+------+----+
|   200|  Sia|     F|  4500|  20|
+------+-----+------+------+----+



In [26]:
tb.history().show()

+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|   operation| operationParameters| job|notebook|clusterId|readVersion|isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+--------------+-------------+--------------------+------------+--------------------+
|      2|2025-08-25 16:24:...|  null|    null|      DELETE|{predicate -> ["(...|null|    null|     null|          1|  Serializable|        false|{numRemovedFiles ...|        null|Apache-Spark/3.3....|
|      1|2025-08-25 15:54:...|  null|    null|       WRITE|{mode -> Append, ...|null|    null|     null|          0|  Serializable|         true|{numFiles -> 3, n...|        null|Apache-Spark/3.3.

In [27]:
data = [(300,'Ravan','M',5000,10),(400,'Hanuman','m',4500,20)]

In [28]:
ch = ["emp_id","ename","gender","salary","dept"]

In [29]:
newdf= spark.createDataFrame(data=data,schema=ch)

In [31]:
newdf.write.insertInto('employee_demo',overwrite=False)

In [41]:
tb = DeltaTable.forName(spark,'employee_Demo')

In [53]:
tb.toDF().show()

+------+-------+------+------+----+
|emp_id|  ename|gender|salary|dept|
+------+-------+------+------+----+
|   400|Hanuman|     m|  4500|  20|
|   300|  Ravan|     M|  5000|  10|
|   900|   John|     M|  8000|  10|
|   900|   John|     M|  8000|  10|
|   200|    Sia|     F|  4500|  20|
+------+-------+------+------+----+



In [62]:
tb.history().select('version','timestamp','operation').show(truncate=100)

+-------+-----------------------+------------+
|version|              timestamp|   operation|
+-------+-----------------------+------------+
|      9|2025-08-25 17:04:07.198|      UPDATE|
|      8|2025-08-25 17:03:55.784|      UPDATE|
|      7|2025-08-25 17:03:43.674|      UPDATE|
|      6|2025-08-25 16:56:33.272|      UPDATE|
|      5| 2025-08-25 16:50:51.63|       WRITE|
|      4| 2025-08-25 16:50:13.44|       WRITE|
|      3|2025-08-25 16:40:13.149|       WRITE|
|      2|2025-08-25 16:24:28.854|      DELETE|
|      1| 2025-08-25 15:54:28.54|       WRITE|
|      0|2025-08-25 15:54:06.561|CREATE TABLE|
+-------+-----------------------+------------+



In [35]:
tb.toDF().createTempView('emp')

In [36]:
spark.catalog.listTables()

[Table(name='employee_demo', database='default', description=None, tableType='EXTERNAL', isTemporary=False),
 Table(name='emp', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]

In [40]:
spark.sql("insert into employee_demo(emp_id,ename,gender,salary,dept) values(900,'SMith','M',4900,10)").show()

++
||
++
++



In [44]:
spark.sql("update employee_demo set ename='John' where ename='SMith'").show()

+-----------------+
|num_affected_rows|
+-----------------+
|                2|
+-----------------+



In [52]:
tb.update(condition = "ename = 'John' ",set={"salary": "8000"})

In [58]:
#time travel
spark.sql('select * from employee_demo  version as of 2').show()

+------+-----+------+------+----+
|emp_id|ename|gender|salary|dept|
+------+-----+------+------+----+
|   200|  Sia|     F|  4500|  20|
+------+-----+------+------+----+



In [64]:
spark.sql("select * from employee_demo  timestamp as of '2025-08-25 16:50:51.6'").show()

+------+-------+------+------+----+
|emp_id|  ename|gender|salary|dept|
+------+-------+------+------+----+
|   400|Hanuman|     m|  4500|  20|
|   300|  Ravan|     M|  5000|  10|
|   900|  SMith|     M|  4900|  10|
|   200|    Sia|     F|  4500|  20|
+------+-------+------+------+----+



In [66]:
tb.restoreToVersion(5).show()

+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+
|table_size_after_restore|num_of_files_after_restore|num_removed_files|num_restored_files|removed_files_size|restored_files_size|
+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+
|                    7275|                         5|                0|                 0|                 0|                  0|
+------------------------+--------------------------+-----------------+------------------+------------------+-------------------+



In [67]:
tb.toDF().show()

+------+-------+------+------+----+
|emp_id|  ename|gender|salary|dept|
+------+-------+------+------+----+
|   400|Hanuman|     m|  4500|  20|
|   900|  SMith|     M|  4900|  10|
|   300|  Ravan|     M|  5000|  10|
|   900|  SMith|     M|  4900|  10|
|   200|    Sia|     F|  4500|  20|
+------+-------+------+------+----+



In [68]:
#schema Evolution
from pyspark.sql.types import *

In [71]:
sch = StructType([\
StructField("emp_id",LongType(),True),\
StructField("ename",StringType(),True),\
StructField("gender",StringType(),True),\
StructField("salary",LongType(),True),\
StructField("dept",LongType(),True),\
StructField("addcol1",StringType(),True)])


In [72]:
newdata =[(1000,'Cup','N',4567,30,'good')]

In [73]:
newdf = spark.createDataFrame(data=newdata,schema=sch)

In [74]:
newdf.show()

+------+-----+------+------+----+-------+
|emp_id|ename|gender|salary|dept|addcol1|
+------+-----+------+------+----+-------+
|  1000|  Cup|     N|  4567|  30|   good|
+------+-----+------+------+----+-------+



In [75]:
# will fail
newdf.write.format('delta').mode('append')\
.saveAsTable('employee_demo')

AnalysisException: A schema mismatch detected when writing to the Delta table (Table ID: 13c07e7b-f59c-40c9-a3b1-910e645c5a65).
To enable schema migration using DataFrameWriter or DataStreamWriter, please set:
'.option("mergeSchema", "true")'.
For other operations, set the session configuration
spark.databricks.delta.schema.autoMerge.enabled to "true". See the documentation
specific to the operation for details.

Table schema:
root
-- emp_id: long (nullable = true)
-- ename: string (nullable = true)
-- gender: string (nullable = true)
-- salary: long (nullable = true)
-- dept: long (nullable = true)


Data schema:
root
-- emp_id: long (nullable = true)
-- ename: string (nullable = true)
-- gender: string (nullable = true)
-- salary: long (nullable = true)
-- dept: long (nullable = true)
-- addcol1: string (nullable = true)

         

In [80]:
newdf.write.format('delta').mode('append')\
.option('mergeSchema',"true").saveAsTable('employee_demo')

In [81]:
tbnew = DeltaTable.forName(spark,'employee_demo')
tbnew.toDF().show()

+------+-------+------+------+----+-------+
|emp_id|  ename|gender|salary|dept|addcol1|
+------+-------+------+------+----+-------+
|  1000|    Cup|     N|  4567|  30|   good|
|  1000|    Cup|     N|  4567|  30|   good|
|   400|Hanuman|     m|  4500|  20|   null|
|   900|  SMith|     M|  4900|  10|   null|
|   300|  Ravan|     M|  5000|  10|   null|
|   900|  SMith|     M|  4900|  10|   null|
|   200|    Sia|     F|  4500|  20|   null|
+------+-------+------+------+----+-------+



In [82]:
sch = StructType([\
StructField("emp_id",LongType(),True),\
StructField("ename",StringType(),True),\
StructField("gender",StringType(),True),\
StructField("salary",LongType(),True),\
StructField("dept",LongType(),True),\
StructField("addcol2",StringType(),True)])

In [83]:
newdata =[(2000,'pen','N',4567,30,'good')]

In [84]:
newdf = spark.createDataFrame(data=newdata,schema=sch)

In [85]:
newdf.show()

+------+-----+------+------+----+-------+
|emp_id|ename|gender|salary|dept|addcol2|
+------+-----+------+------+----+-------+
|  2000|  pen|     N|  4567|  30|   good|
+------+-----+------+------+----+-------+



In [86]:
newdf.write.format('delta').mode('append')\
.option('mergeSchema',"true").saveAsTable('employee_demo')

In [87]:
tbnew = DeltaTable.forName(spark,'employee_demo')
tbnew.toDF().show()

+------+-------+------+------+----+-------+-------+
|emp_id|  ename|gender|salary|dept|addcol1|addcol2|
+------+-------+------+------+----+-------+-------+
|  2000|    pen|     N|  4567|  30|   null|   good|
|  1000|    Cup|     N|  4567|  30|   good|   null|
|  1000|    Cup|     N|  4567|  30|   good|   null|
|   400|Hanuman|     m|  4500|  20|   null|   null|
|   900|  SMith|     M|  4900|  10|   null|   null|
|   300|  Ravan|     M|  5000|  10|   null|   null|
|   900|  SMith|     M|  4900|  10|   null|   null|
|   200|    Sia|     F|  4500|  20|   null|   null|
+------+-------+------+------+----+-------+-------+



In [89]:
tb1.optimize().executeCompaction()

DataFrame[path: string, metrics: struct<numFilesAdded:bigint,numFilesRemoved:bigint,filesAdded:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,filesRemoved:struct<min:bigint,max:bigint,avg:double,totalFiles:bigint,totalSize:bigint>,partitionsOptimized:bigint,zOrderStats:struct<strategyName:string,inputCubeFiles:struct<num:bigint,size:bigint>,inputOtherFiles:struct<num:bigint,size:bigint>,inputNumCubes:bigint,mergedFiles:struct<num:bigint,size:bigint>,numOutputCubes:bigint,mergedNumCubes:bigint>,numBatches:bigint,totalConsideredFiles:bigint,totalFilesSkipped:bigint,preserveInsertionOrder:boolean,numFilesSkippedToReduceWriteAmplification:bigint,numBytesSkippedToReduceWriteAmplification:bigint,startTimeMs:bigint,endTimeMs:bigint,totalClusterParallelism:bigint,totalScheduledTasks:bigint,autoCompactParallelismStats:struct<maxClusterActiveParallelism:bigint,minClusterActiveParallelism:bigint,maxSessionActiveParallelism:bigint,minSessionActiveParallelism:bigint>>]

In [91]:
tb1.history().show()

+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|userId|userName|   operation| operationParameters| job|notebook|clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+------+--------+------------+--------------------+----+--------+---------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|     15|2025-08-26 12:06:...|  null|    null|    OPTIMIZE|{predicate -> [],...|null|    null|     null|         14|SnapshotIsolation|        false|{numRemovedFiles ...|        null|Apache-Spark/3.3....|
|     14|2025-08-26 11:08:...|  null|    null|       WRITE|{mode -> Append, ...|null|    null|     null|         13|     Serializable|         true|{numFiles -> 2, n...|        null|Ap

In [97]:
# Dry run: Just see what would be deleted without actually deleting anything
spark.sql("VACUUM delta.'C:\Users\Krishna\data\empdata' DRY RUN")

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 16-17: truncated \UXXXXXXXX escape (854032216.py, line 2)

In [95]:
tb1.vacuum(retentionHours=0)

DataFrame[]

In [94]:
#To change the retention period, you need to modify the Spark configuration. However, Delta Lake enforces a minimum 168 hours 
#(7 days) retention period unless you explicitly override it.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

In [None]:
#spark.sql("VACUUM delta.`dbfs:/mnt/delta/my_table` RETAIN 2 HOURS")