In [None]:
import random

# Pick a random integer suffix for this run of the notebook.
session_id = random.randint(0, 1000000)

# Path of the Delta table that the following cells write to and read from.
delta_table_path = "/delta/delta-table-{0}".format(session_id)

# Bare expression so the notebook displays the generated path as cell output.
delta_table_path

In [None]:
## Create a table

In [None]:
# Build a DataFrame from spark.range(0, 5), print it, then save it
# in Delta format at delta_table_path.
data = spark.range(0, 5)
data.show()
(
    data.write
    .format("delta")
    .save(delta_table_path)
)

In [None]:
## Read data

In [None]:
# Load the Delta table stored at delta_table_path into a DataFrame and print it.
df = (
    spark.read
    .format("delta")
    .load(delta_table_path)
)
df.show()

In [None]:
## Update table data

In [None]:
# Replace the contents of the Delta table with a new DataFrame built from
# spark.range(5, 10), using overwrite mode on the same path.
data = spark.range(5, 10)
(
    data.write
    .format("delta")
    .mode("overwrite")
    .save(delta_table_path)
)

# `df` was defined in an earlier cell as a read of delta_table_path; it is shown
# again here without being re-created.
# NOTE(review): presumably this is meant to display the overwritten rows - confirm.
df.show()

In [None]:
## Save as catalog tables

In [None]:
# Register the current `data` DataFrame in the catalog as a managed Delta table.
# The table name is fixed while delta_table_path changes with every session_id,
# so overwrite mode keeps this cell re-runnable instead of failing because the
# table already exists from a previous run.
data.write.format("delta").mode("overwrite").saveAsTable("ManagedDeltaTable")

# Register a second catalog table on top of the files already stored at
# delta_table_path. Any definition left over from an earlier run is dropped
# first; otherwise CREATE TABLE would fail on the existing name.
# NOTE(review): per the Spark SQL docs, dropping an external table removes only
# the catalog entry, not the underlying files - confirm for your metastore.
spark.sql("DROP TABLE IF EXISTS ExternalDeltaTable")
spark.sql("CREATE TABLE ExternalDeltaTable USING DELTA LOCATION '{0}'".format(delta_table_path))

# List the tables known to the catalog.
spark.sql("SHOW TABLES").show()

- With this code, you created a new table in the catalog from an existing dataframe, referred to as a managed table
- Then you defined a second table in the catalog that points at an existing storage location, referred to as an external table
- In the output you can see both tables, no matter how they were created, are listed in the catalog.

- Now you can look at the extended properties of both of these tables

In [None]:
# Print the extended catalog description of the managed table without
# truncating column values.
(
    spark.sql("DESCRIBE EXTENDED ManagedDeltaTable")
    .show(truncate=False)
)

In [None]:
# Print the extended catalog description of the external table without
# truncating column values.
(
    spark.sql("DESCRIBE EXTENDED ExternalDeltaTable")
    .show(truncate=False)
)

## Conditional update without overwrite
- Delta Lake provides programmatic APIs to conditionally update, delete, and merge data into tables
    - A merge that updates matching rows and inserts the rest is commonly referred to as an upsert

In [None]:
# NOTE(review): both wildcard imports are kept as-is because other cells use
# `col` and `lit` from them. Be aware that `pyspark.sql.functions` exports names
# such as `sum`, `min` and `max` that shadow the Python builtins after this cell.
from delta.tables import *
from pyspark.sql.functions import *

# Bind a DeltaTable handle to the Delta files stored at delta_table_path.
delta_table = DeltaTable.forPath(spark, delta_table_path)

# Add 100 to the id of every row whose id is even, then print the table.
delta_table.update(
    condition=expr("id % 2 == 0"),
    set={"id": expr("id + 100")},
)
delta_table.toDF().show()

In [None]:
# Delete every row whose id is even, then print what is left in the table.
delta_table.delete(condition="id % 2 == 0")
delta_table.toDF().show()

In [None]:
# Notice in the previous output that every even row has been deleted.

# Source rows for the merge: ids 0 through 19.
new_data = spark.range(0, 20)

# Upsert new_data into the Delta table, matching rows on id:
#   - when the id already exists in the table, set that row's id to -1
#   - when the id is not in the table yet, insert it
# The replacement value is an integer literal so it matches the numeric id
# column instead of relying on an implicit string-to-number cast.
# The source is aliased once, here, where the join condition refers to it.
(
    delta_table.alias("oldData")
    .merge(new_data.alias("newData"), "oldData.id = newData.id")
    .whenMatchedUpdate(set={"id": lit(-1)})
    .whenNotMatchedInsert(values={"id": col("newData.id")})
    .execute()
)

# Print up to 100 rows of the merged table.
delta_table.toDF().show(100)

## History
- Delta Lake lets you look into the history of a table
- That is, the changes that were made to the underlying Delta Table
- The cell below shows how simple it is to inspect the history.

In [None]:
# Print up to 20 history entries of the Delta table, truncating each value
# at 1000 characters and using the regular (non-vertical) layout.
delta_table.history().show(n=20, truncate=1000, vertical=False)

In [None]:
## Read older versions of data using Time Travel

In [None]:
# Read the table as of version 0 via the "versionAsOf" reader option
# and print that snapshot.
df = (
    spark.read
    .format("delta")
    .option("versionAsOf", 0)
    .load(delta_table_path)
)
df.show()

In [None]:
## Write a stream of data to a table

In [None]:
# Open a streaming DataFrame on Spark's "rate" source.
streaming_df = spark.readStream.format("rate").load()

# Expose the source's `value` column as `id` and start a streaming write into
# the Delta table at delta_table_path. The checkpoint directory is suffixed
# with session_id, like the table path itself.
stream = (
    streaming_df
    .selectExpr("value as id")
    .writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoint-{0}".format(session_id))
    .start(delta_table_path)
)

In [None]:
## Read a stream of changes from a table

In [None]:
# Print up to 100 rows of the table, highest id first.
(
    delta_table.toDF()
    .sort(col("id").desc())
    .show(100)
)

In [None]:
# Print up to 20 history entries with the listed columns removed to keep the
# output narrow; values are truncated at 1000 characters.
(
    delta_table.history()
    .drop("userId", "userName", "job", "notebook", "clusterId", "isolationLevel", "isBlindAppend")
    .show(n=20, truncate=1000, vertical=False)
)

In [None]:
# Stop the streaming write started earlier in the notebook.
stream.stop()

# Print up to 100 history entries with the listed columns removed;
# values are truncated at 1000 characters.
(
    delta_table.history()
    .drop("userId", "userName", "job", "notebook", "clusterId", "isolationLevel", "isBlindAppend")
    .show(n=100, truncate=1000, vertical=False)
)

In [None]:
# Query the table history through SQL, addressing the table by its path.
spark.sql(
    "DESCRIBE HISTORY delta.`{0}`".format(delta_table_path)
).show()