In [0]:
# Sample product catalogue as CSV text: one header line followed by five rows.
csv_lines = [
    "id,name,category,price",
    "1,Amit,Electronics,50000",
    "2,Priya,Furniture,3000",
    "3,Rahul,Stationery,200",
    "4,Neha,Books,800",
    "5,Karthik,Electronics,45000",
]
# Every line, including the last one, is newline-terminated.
csv_data = "\n".join(csv_lines) + "\n"

# Store the text on DBFS; overwrite=True replaces a file left by an earlier run.
dbutils.fs.put("dbfs:/temp/products.csv", csv_data, overwrite=True)

Wrote 139 bytes.


True

In [0]:
# Load the CSV written above. The first line supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    "dbfs:/temp/products.csv",
    header=True,
    inferSchema=True,
)
df.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
+---+-------+-----------+-----+



### Write the data to a Delta table

In [0]:
# Persist the DataFrame as a Delta table at a fixed path.
# mode("overwrite") replaces a table that already exists at this path. Without
# an explicit mode the writer's default raises an error when the path exists,
# so this cell would fail on every run after the first (e.g. Restart & Run All).
# NOTE: with Delta, an overwrite is recorded as a new table version; earlier
# versions remain readable via time travel.
df.write.format("delta").mode("overwrite").save("/tmp/delta/products")

In [0]:
# Read the Delta table back from its storage path and display its rows.
df_delta = spark.read.load("/tmp/delta/products", format="delta")
df_delta.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
+---+-------+-----------+-----+



### Update table

In [0]:
from delta.tables import DeltaTable

# Handle to the Delta table stored at this path; the cells below reuse it.
deltaTable = DeltaTable.forPath(spark, "/tmp/delta/products")

# Change the price of the product whose id is 2.
# Values in `set` are SQL expression strings, so "3500" is evaluated as the
# numeric literal 3500, not stored as text.
deltaTable.update(condition="id = 2", set={"price": "3500"})

# Show the table after the update.
deltaTable.toDF().show()


+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
|  2|  Priya|  Furniture| 3500|
+---+-------+-----------+-----+



### Upsert (merge)

In [0]:
# Incoming rows: id 2 is already in the table (gets updated), id 6 is not
# (gets inserted).
new_data = [
    (2, "Priya", "Furniture", 4000),
    (6, "Sneha", "Kitchen", 1200),
]
update_df = spark.createDataFrame(new_data, ["id", "name", "category", "price"])

# Upsert: match source rows to target rows on id, overwrite all columns of the
# matched rows, and insert the source rows that found no match.
(
    deltaTable.alias("target")
    .merge(update_df.alias("source"), "target.id = source.id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Show the table after the merge.
deltaTable.toDF().show()


+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
|  2|  Priya|  Furniture| 4000|
|  6|  Sneha|    Kitchen| 1200|
+---+-------+-----------+-----+



### Versioning (time travel)

In [0]:
# Latest state of the table, as seen through the DeltaTable handle.
deltaTable.toDF().show()

# Time travel: read the table as of version 0, the earliest version of the
# table (its contents before the update and merge performed above).
previous_df = spark.read.load(
    "/tmp/delta/products",
    format="delta",
    versionAsOf=0,
)
previous_df.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
|  2|  Priya|  Furniture| 4000|
|  6|  Sneha|    Kitchen| 1200|
+---+-------+-----------+-----+

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  3|  Rahul| Stationery|  200|
|  4|   Neha|      Books|  800|
|  5|Karthik|Electronics|45000|
+---+-------+-----------+-----+



### Partitioning

In [0]:
# Write the original CSV-derived DataFrame as a second Delta table, partitioned
# by the category column. The overwrite mode replaces a table left at this path
# by an earlier run.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("category")
    .save("/tmp/delta/products_partitioned")
)

In [0]:
# Read the partitioned Delta table back and display its rows.
products_partitioned_df = spark.read.format("delta").load("/tmp/delta/products_partitioned")
products_partitioned_df.show()

+---+-------+-----------+-----+
| id|   name|   category|price|
+---+-------+-----------+-----+
|  5|Karthik|Electronics|45000|
|  1|   Amit|Electronics|50000|
|  2|  Priya|  Furniture| 3000|
|  4|   Neha|      Books|  800|
|  3|  Rahul| Stationery|  200|
+---+-------+-----------+-----+

