In [1]:
# ensure compatible versions of PySpark and Delta Lake are installed
%pip install pyspark==3.3.1 
%pip install delta-spark==3.1.0
import logging
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col
from delta.tables import DeltaTable


# Route INFO-level (and above) log records to the notebook output.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Creating SparkSession...")

# Delta Lake wiring for the session:
#   - spark.jars.packages names the Delta jar by its Maven coordinates; the 3.1.0
#     version matches the delta-spark Python package installed above
#   - spark.sql.extensions registers Delta's SQL extension
#   - spark.sql.catalog.spark_catalog swaps in Delta's catalog implementation
delta_session_options = {
    "spark.jars.packages": "io.delta:delta-spark_2.12:3.1.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Local-mode session using all available cores ("local[*]").
session_builder = SparkSession.builder.appName("DeltaLakeIntegration").master("local[*]")
for option_key, option_value in delta_session_options.items():
    session_builder = session_builder.config(option_key, option_value)
spark = session_builder.getOrCreate()

logger.info("Setting up Spark configurations...")
# Use 4 shuffle partitions for this session.
spark.conf.set("spark.sql.shuffle.partitions", 4)

Collecting pyspark==3.3.1
  Using cached pyspark-3.3.1-py2.py3-none-any.whl
Installing collected packages: pyspark
  Attempting uninstall: pyspark
    Found existing installation: pyspark 3.5.0
    Can't uninstall 'pyspark'. No files were found to uninstall.
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
delta-spark 3.1.0 requires pyspark<3.6.0,>=3.5.0, but you have pyspark 3.3.1 which is incompatible.[0m[31m
[0mSuccessfully installed pyspark-3.3.1
Note: you may need to restart the kernel to use updated packages.
Collecting py4j==0.10.9.7 (from pyspark<3.6.0,>=3.5.0->delta-spark==3.1.0)
  Using cached py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Using cached py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
Installing collected packages: py4j
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.5
    Uninstalling py4j-0.10.9.5:
 

INFO:__main__:Creating SparkSession...
INFO:__main__:Setting up Spark configurations...


In [8]:
# Filesystem location of the Delta table used throughout this walkthrough.
delta_output_path = "/home/jovyan/data/delta_table_of_dog_owners"

# === Scenario: dogs at the park ===

print("Welcome to the Doggy Delta Adventure!")

# === Seed data and first write to the Delta table ===

# Row factory: calling it with positional values yields a Row with these field names.
dog_owner = Row("owner_id", "owner_name", "dog_name")

# The first six owner/dog pairs to arrive at the park.
owner_data = [
    dog_owner(*fields)
    for fields in (
        (1, "Alice", "Buddy"),
        (2, "Bob", "Max"),
        (3, "Charlie", "Bella"),
        (4, "David", "Lucy"),
        (5, "Emma", "Bailey"),
        (6, "Frank", "Rosie"),
    )
]
owner_df = spark.createDataFrame(owner_data)

# "overwrite" replaces the table's current contents if the path already holds data,
# so re-running this cell starts again from these six rows.
initial_writer = owner_df.write.format("delta").mode("overwrite")
initial_writer.save(delta_output_path)
print("Initial doggy data successfully added!")

# === More dogs join the park ===

print("\nNew dogs are arriving at the park!")

# Two more owner/dog pairs, with the same three fields as the initial data.
new_owner_data = [
    Row(**fields)
    for fields in (
        {"owner_id": 7, "owner_name": "Grace", "dog_name": "Cooper"},
        {"owner_id": 8, "owner_name": "Hannah", "dog_name": "Daisy"},
    )
]
new_owner_df = spark.createDataFrame(new_owner_data)

# "append" adds these rows to the existing Delta table instead of replacing it.
append_writer = new_owner_df.write.format("delta").mode("append")
append_writer.save(delta_output_path)
print("New doggy friends added to the park!")

# --- Let's Take a Look at Our Park's Population ---

print("\nLet's see who's at the park today:")
# DeltaTable handle for the path written above; it is the entry point for the
# update, history and restore operations used in the rest of this cell.
deltaTable = DeltaTable.forPath(spark, delta_output_path)
current_df = deltaTable.toDF()
current_df.show()

# --- Oh No, a Mischievous Dog Causes Trouble! ---

print("\nUh-oh! Trouble's brewing at the park.")
print("One mischievous dog has started a commotion!")

# Simulate a misbehaving dog by changing its owner.
# Both arguments are SQL strings: the first is the row predicate, and each value in
# the dict is a SQL expression, hence the nested quotes around the literal 'Mischief'.
deltaTable.update("dog_name = 'Buddy'", {"owner_name": "'Mischief'"})
print("Buddy's owner has been mysteriously changed to 'Mischief'!")

# Re-derive the DataFrame from the table after the update rather than reusing the
# handle created before it, so the table shown is built from the post-update state
# (same pattern as the check performed after the restore).
current_df = deltaTable.toDF()
current_df.show()

# --- Time to Restore Order ---

print("\nTime to restore peace and order at the park.")

# Roll back to the commit just before the most recent one to undo the owner change.
# The versions are sorted explicitly (newest first) so that index 1 is the
# second-newest commit regardless of the order in which history rows are returned.
# When this cell runs top to bottom, index 0 is the update and index 1 is the
# state right before it.
version_rows = (
    deltaTable.history()
    .select("version")
    .orderBy(col("version").desc())
    .collect()
)
previous_version = version_rows[1][0]
deltaTable.restoreToVersion(previous_version)
print("Order restored! Buddy is back with his rightful owner.")

# --- Let's Check Our Park Population Again ---

print("\nLet's check the park's population after restoring order:")
# Fresh DataFrame from the table so the output reflects the restored state.
current_df = deltaTable.toDF()
current_df.show()


Welcome to the Doggy Delta Adventure!
Initial doggy data successfully added!

New dogs are arriving at the park!
New doggy friends added to the park!

Let's see who's at the park today:
+--------+----------+--------+
|owner_id|owner_name|dog_name|
+--------+----------+--------+
|       3|   Charlie|   Bella|
|       8|    Hannah|   Daisy|
|       7|     Grace|  Cooper|
|       1|     Alice|   Buddy|
|       5|      Emma|  Bailey|
|       6|     Frank|   Rosie|
|       4|     David|    Lucy|
|       2|       Bob|     Max|
+--------+----------+--------+


Uh-oh! Trouble's brewing at the park.
One mischievous dog has started a commotion!
Buddy's owner has been mysteriously changed to 'Mischief'!

Time to restore peace and order at the park.
Order restored! Buddy is back with his rightful owner.

Let's check the park's population after restoring order:
+--------+----------+--------+
|owner_id|owner_name|dog_name|
+--------+----------+--------+
|       3|   Charlie|   Bella|
|       8|    H

In [None]:
# Shut down the SparkSession. Once this cell has run, the earlier cells must be
# re-run (starting with the one that creates `spark`) before doing any more work.
logger.info("Stopping SparkSession...")
spark.stop()
logger.info("SparkSession stopped.")