In [1]:
import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

In [None]:
data = spark.range(0, 5)
data.write.format("delta").save("/tmp/delta-table")

## Read data

In [None]:
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

## Update table data

### Overwrite

In [None]:
data = spark.range(5, 10)
data.write.format("delta").mode("overwrite").save("/tmp/delta-table")

- If you read this table again, you should see only the values 5-9 you have added because you overwrote the previous data.

In [None]:
df = spark.read.format("delta").load("/tmp/delta-table")
df.show()

## Conditional update without overwrite

- Delta Lake provides programmatic APIs to conditional update, delete, and merge (upsert) data into tables

In [None]:
from delta.tables import *
from pyspark.sql.functions import *

deltaTable = DeltaTable.forPath(spark, "/tmp/delta-table")

In [None]:
# Update every even value by adding 100 to it
deltaTable.update(
  condition = expr("id % 2 == 0"),
  set = { "id": expr("id + 100") })

In [None]:
# Delete every even value
deltaTable.delete(condition = expr("id % 2 == 0"))

In [None]:
# Upsert (merge) new data
newData = spark.range(0, 20)
deltaTable.alias("oldData") \
  .merge(
    newData.alias("newData"),
    "oldData.id = newData.id") \
  .whenMatchedUpdate(set = { "id": col("newData.id") }) \
  .whenNotMatchedInsert(values = { "id": col("newData.id") }) \
  .execute()

deltaTable.toDF().show()

## Read older versions of data using time travel

- You can query previous snapshots of your Delta table by using time travel.
- If you want to access the data that you overwrote, you can query a snapshot of the table before you overwrote the first set of data using the versionAsOf option.

In [None]:
df = spark.read.format("delta").option("versionAsOf", 0).load("/tmp/delta-table")
df.show()

- You should see the first set of data, from before you overwrote it
- Time travel takes advantage of the power of the Delta Lake transaction log to access data that is no longer in the table
- Removing the version 0 option (or specifying version 1) would let you see the newer data again

## Write a stream of data to a table

- You can also write to a Delta table using Structured Streaming
- The Delta Lake transaction log guarantees exactly-once processing, even when there are other streams or batch queries running concurrently against the table
- By default, streams run in append mode, which adds new records to the table:

In [None]:
streamingDf = spark.readStream.format("rate").load()
stream = streamingDf.selectExpr("value as id").writeStream.format("delta").option("checkpointLocation", "/tmp/checkpoint").start("/tmp/delta-table")

- While the stream is running, you can read the table using the earlier commands.
- Note: 
    - If you’re running this in a shell, you may see the streaming task progress, which make it hard to type commands in that shell.
    - It may be useful to start another shell in a new terminal for querying the table.

- You can stop the stream by running stream.stop() in the same terminal that started the stream.

## Read a stream of changes from a table

- While the stream is writing to the Delta table, you can also read from that table as streaming source
- For example, you can start another streaming query that prints all the changes made to the Delta table
- You can specify which version Structured Streaming should start from by providing the startingVersion or startingTimestamp option to get changes from that point onwards

In [5]:
stream2 = spark.readStream.format("delta").load("/tmp/delta-table").writeStream.format("console").start()

Py4JJavaError: An error occurred while calling o31.load.
: java.lang.ClassNotFoundException: 
Failed to find data source: delta. Please find packages at
http://spark.apache.org/third-party-projects.html
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedToFindDataSourceError(QueryExecutionErrors.scala:443)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:670)
	at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:156)
	at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:209)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:589)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:656)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:656)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:656)
	... 14 more


In [None]:
stream2.show()