In [0]:
from delta.tables import DeltaTable

In [0]:
# Inputs for the incremental MERGE:
#   deltaTable - handle on the existing Delta table (the merge target)
#   updates    - the CSV batch to merge in (header row used for column names,
#                column types inferred by Spark)
deltaTable = DeltaTable.forPath(
    spark,
    "/Volumes/workspace/ecommerce/ecommerce_data/delta/events",
)
updates = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv")
)

In [0]:
# MERGE the CSV batch into the events table without losing or duplicating events.
#
# (user_session, event_time) is not a unique event key: the table holds several
# rows per pair. Deduplicating and matching on just those two columns collapses
# distinct source events into one row and lets whenMatchedUpdateAll overwrite
# every target row sharing the pair with that single source row. Matching on
# the full row keeps every distinct event and makes re-running the cell a no-op.
# NOTE(review): assumes the table has the same columns as the CSV - confirm.
merge_columns = updates.columns

# Remove only exact duplicate rows from the batch.
dedup_updates = updates.dropDuplicates(merge_columns)

# `<=>` is null-safe equality. With plain `=` a row holding NULL in any compared
# column never matches and would be inserted again on every run.
merge_condition = " AND ".join(
    f"t.`{column}` <=> s.`{column}`" for column in merge_columns
)

# A match on every column means the row is already present and identical, so
# there is nothing to update; only rows missing from the table are inserted.
(
    deltaTable.alias("t")
    .merge(dedup_updates.alias("s"), merge_condition)
    .whenNotMatchedInsertAll()
    .execute()
)


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:139)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:139)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:136)
	at scala.collection.immutable.Range.foreach(Range.scala:192)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:721)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:441)
	at scala.Option.getOrElse(Option.scala:201)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:441)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# Time travel: load historical snapshots of the events Delta table.
# Each read starts from a fresh `spark.read` so the two snapshot options
# never end up on the same reader.
events_delta_path = "/Volumes/workspace/ecommerce/ecommerce_data/delta/events"

# Snapshot pinned by commit version.
v0 = spark.read.load(events_delta_path, format="delta", versionAsOf=0)

# Snapshot pinned by timestamp.
yesterday = spark.read.load(
    events_delta_path, format="delta", timestampAsOf="2024-01-01"
)

In [0]:
# Table maintenance: compact/Z-order the data files, then vacuum with a
# 168-hour (7-day) retention window.
optimize_statement = "OPTIMIZE events_table ZORDER BY (event_type, user_id)"
vacuum_statement = "VACUUM events_table RETAIN 168 HOURS"

spark.sql(optimize_statement)
spark.sql(vacuum_statement)

DataFrame[path: string]

In [0]:
# Check that the events path holds a valid Delta table (evaluates to a boolean).
# DeltaTable comes from the import cell at the top of the notebook; it is not
# re-imported here so all imports stay in one place.
DeltaTable.isDeltaTable(
    spark,
    "/Volumes/workspace/ecommerce/ecommerce_data/delta/events"
)

np.True_

In [0]:
# Show the table's metadata row (format, location, file count, size,
# properties, ...) without truncating wide columns.
detail_query = """
DESCRIBE DETAIL events_table
"""
table_detail = spark.sql(detail_query)
table_detail.show(truncate=False)

+------+------------------------------------+------------------------------+-----------+--------+-----------------------+-------------------+----------------+-----------------+--------+-----------+-------------------------------------+----------------+----------------+-----------------------------------------+---------------------------------------------------------------+-------------+
|format|id                                  |name                          |description|location|createdAt              |lastModified       |partitionColumns|clusteringColumns|numFiles|sizeInBytes|properties                           |minReaderVersion|minWriterVersion|tableFeatures                            |statistics                                                     |clusterByAuto|
+------+------------------------------------+------------------------------+-----------+--------+-----------------------+-------------------+----------------+-----------------+--------+-----------+-----------------------

In [0]:
# Total row count of the events table. On its own this does not prove a MERGE
# ran; compare it with the count of an earlier version to see whether rows
# were added.
row_count_query = """
SELECT COUNT(*) AS total_rows
FROM events_table
"""
row_count = spark.sql(row_count_query)
row_count.show()

+----------+
|total_rows|
+----------+
|  67501979|
+----------+



In [0]:
# Spot check: the 10 most recent events by event_time, shown untruncated.
# This samples the newest rows in the table; it does not single out rows
# that were touched by the MERGE.
latest_events_query = """
SELECT user_session, event_time, event_type, user_id
FROM events_table
ORDER BY event_time DESC
LIMIT 10
"""
latest_events = spark.sql(latest_events_query)
latest_events.show(truncate=False)

+------------------------------------+-------------------+----------+---------+
|user_session                        |event_time         |event_type|user_id  |
+------------------------------------+-------------------+----------+---------+
|90aca71c-ed8a-4670-866a-761ebacb732d|2019-11-30 23:59:59|view      |579969851|
|6fecf566-ebb0-4e70-a243-cdc13ce044cb|2019-11-30 23:59:59|view      |557794415|
|368ddc8b-5db9-40fb-b7ff-b6582a1192c0|2019-11-30 23:59:59|view      |531607492|
|02b4131c-0112-4231-aafa-ceaa08e77c1b|2019-11-30 23:59:58|view      |532714000|
|734c5eef-0742-4f8b-9d22-48f75b0bc359|2019-11-30 23:59:58|view      |545223467|
|e4bfecbc-8a99-437d-b6b8-b5dfbd87bf78|2019-11-30 23:59:57|view      |579968742|
|7165dc28-9cba-40a0-9ed8-76fd909ccb3b|2019-11-30 23:59:57|view      |562661595|
|941ff09f-c5da-49dd-8501-70c950202a4a|2019-11-30 23:59:57|view      |514007176|
|b8ff34a9-41be-4497-b373-408c6e02b952|2019-11-30 23:59:55|view      |579175262|
|a7836fb7-1dfd-4e50-a678-3725887f4408|20

In [0]:
# Duplicate check: list every (user_session, event_time) pair that occurs more
# than once. An empty result would mean the pair is unique across the table.
duplicate_pairs_query = """
SELECT user_session, event_time, COUNT(*) AS cnt
FROM events_table
GROUP BY user_session, event_time
HAVING cnt > 1
"""
duplicate_pairs = spark.sql(duplicate_pairs_query)
duplicate_pairs.show()

+--------------------+-------------------+---+
|        user_session|         event_time|cnt|
+--------------------+-------------------+---+
|ebb54b87-b736-4a3...|2019-11-17 11:07:13|  2|
|98b56db6-7dbc-4d0...|2019-11-16 06:10:41|  2|
|7ee9676f-82db-49d...|2019-11-14 09:11:58|  2|
|c1668e02-a4ae-4fa...|2019-11-16 15:17:44|  2|
|7f24707a-28e5-48e...|2019-11-17 16:57:49|  2|
|5ef1ffff-6488-44a...|2019-11-15 17:55:58|  2|
|b2d60eda-c237-45b...|2019-11-17 11:13:07|  2|
|f26599ac-ccae-456...|2019-11-16 13:07:05|  2|
|3bb41ffe-0f9b-453...|2019-11-17 15:37:58|  2|
|59994963-c1b5-477...|2019-11-16 09:27:50|  4|
|b8c74aa7-1268-49c...|2019-11-18 16:13:13|  2|
|7b887936-e17a-4c1...|2019-11-19 15:02:52|  2|
|27274d57-9efd-405...|2019-11-15 11:25:37|  2|
|7eceb278-577b-4e4...|2019-11-16 21:07:25|  2|
|4165ca86-7912-433...|2019-11-17 11:42:18|  2|
|ff45504d-a2ee-414...|2019-11-18 07:48:11|  2|
|78f38bf5-572e-4a1...|2019-11-16 18:20:54|  2|
|ec0c529d-11cf-4a2...|2019-11-17 15:10:51|  4|
|54f9bedb-a5b

In [0]:
# Delta transaction history of the table; a completed MERGE would be listed
# in the `operation` column.
table_history = deltaTable.history()
table_history.show(truncate=False)

+-------+-------------------+--------------+--------------------------+---------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+-----------------+------------------------+-----------+-----------------+-------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Time travel check: row count of the version-0 snapshot vs. the current table.
v0_row_count = v0.count()
print("Version 0 count:", v0_row_count)

current_events = spark.table("events_table")
print("Current count:", current_events.count())

Version 0 count: 67501979
Current count: 67501979


In [0]:
# Time travel check (timestamp-based): load the snapshot as of the given
# timestamp and show its five earliest event_time values.
yesterday = spark.read.load(
    "/Volumes/workspace/ecommerce/ecommerce_data/delta/events",
    format="delta",
    timestampAsOf="2026-01-13 11:18:54",
)
earliest_event_times = (
    yesterday
    .select("event_time")
    .orderBy("event_time")
    .limit(5)
)
earliest_event_times.show()

+-------------------+
|         event_time|
+-------------------+
|2019-11-01 00:00:00|
|2019-11-01 00:00:00|
|2019-11-01 00:00:01|
|2019-11-01 00:00:01|
|2019-11-01 00:00:01|
+-------------------+



In [0]:
# File-compaction check: number of data files currently backing the table.
# Compare against the value from before OPTIMIZE to see the effect.
detail_query = """
DESCRIBE DETAIL events_table
"""
file_count = spark.sql(detail_query).select("numFiles")
file_count.show()

+--------+
|numFiles|
+--------+
|      26|
+--------+



In [0]:
# Table history via SQL, shown untruncated. Look for the VACUUM entries in the
# `operation` / `operationParameters` columns to confirm the retention used.
history_query = "DESCRIBE HISTORY events_table"
spark.sql(history_query).show(truncate=False)


+-------+-------------------+--------------+--------------------------+---------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------+----+-----------------+------------------------+-----------+-----------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+--------------------------------------------------+
|version|timestamp          |userId        |userName                  |operation                        |operationParameters                                                                                                                                    |job |notebook         |clusterId               |readVersion|isolati