In [21]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Spark settings that enable Delta Lake: the Delta SQL extension and the
# Delta implementation of the default session catalog.
delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

# Start from a named builder and apply each setting in turn.
builder = SparkSession.builder.appName("DeltaLake ACID Demo")
for setting_name, setting_value in delta_settings.items():
    builder = builder.config(setting_name, setting_value)

# Wrap the builder so the pip-installed delta-spark package is included,
# then create the session (or reuse one that is already running).
spark = configure_spark_with_delta_pip(builder).getOrCreate()


In [22]:
# Location of the Delta table used in this demo. It is a copy of the main
# patients table, so overwriting it here does not break other notebooks.
delta_path = "../data/clean/patients_main_copy"

# Load the current contents of the Delta table into a DataFrame.
patients_delta = spark.read.load(delta_path, format="delta")

# Preview the first 10 rows without truncating long values such as the id.
print("Original table preview:")
patients_delta.show(n=10, truncate=False)


Original table preview:
+------------------------------------+----------+---------+------+-----+-----------+-------+-------------+---------------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|id                                  |birthdate |deathdate|gender|race |ethnicity  |marital|state        |city           |num_encounters|first_encounter|last_encounter|pain_min|pain_max|pain_avg|pain_count|num_medications|total_med_cost|num_conditions|age|is_deceased|
+------------------------------------+----------+---------+------+-----+-----------+-------+-------------+---------------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85|2019-02-17|NULL     |M     |white|nonhispanic|unknown|Massachusetts|Springfield    |12            |2019-02-17     |2021-07-25    |0

In [23]:
# Show schema: print the column names, data types and nullability of the
# loaded Delta table as a tree.
patients_delta.printSchema()


root
 |-- id: string (nullable = true)
 |-- birthdate: date (nullable = true)
 |-- deathdate: date (nullable = true)
 |-- gender: string (nullable = true)
 |-- race: string (nullable = true)
 |-- ethnicity: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- num_encounters: long (nullable = true)
 |-- first_encounter: date (nullable = true)
 |-- last_encounter: date (nullable = true)
 |-- pain_min: double (nullable = true)
 |-- pain_max: double (nullable = true)
 |-- pain_avg: double (nullable = true)
 |-- pain_count: long (nullable = true)
 |-- num_medications: long (nullable = true)
 |-- total_med_cost: double (nullable = true)
 |-- num_conditions: long (nullable = true)
 |-- age: long (nullable = true)
 |-- is_deceased: integer (nullable = true)



In [24]:
# Count rows before overwrite. The value is kept in original_count so it can be
# printed next to the row count that is read back after the table is rewritten.
original_count = patients_delta.count()
print(f"Original row count: {original_count}")


Original row count: 1163


In [None]:
# Add a new patient
# Read the new patient CSV. inferSchema guesses each column's type from the file
# contents alone, so the guessed types are not guaranteed to match the Delta table.
new_patient_raw = spark.read.csv("../data/raw/new_patient.csv", header=True, inferSchema=True)

# Align the CSV rows with the table: pick the table's columns by name and cast each
# one to the table's data type. Without this, unionByName would silently widen any
# mismatched column and the later Delta write could be rejected by schema enforcement.
# Only the table's columns are kept; a column missing from the CSV still raises.
new_patient_aligned = new_patient_raw.select(
    [F.col(field.name).cast(field.dataType) for field in patients_delta.schema.fields]
)

# Drop patients whose id is already present in the table, so that re-running the
# notebook does not insert the same patient a second time.
new_patient_df = new_patient_aligned.join(
    patients_delta.select("id"), on="id", how="left_anti"
)

# Combine with existing table (add new_patient)
patients_updated = patients_delta.unionByName(new_patient_df)

# Appending only new_patient_df would be enough for a pure insert. This notebook
# instead writes the combined DataFrame back with "overwrite" in the write cell below,
# replacing the whole table with patients_updated.


In [26]:
# Table preview
patients_updated.show(5)

# Show the newly added patient in the updated table. The filter refers to the column
# by name with F.col, so it is resolved against patients_updated itself instead of
# borrowing a column object that belongs to patients_delta (a different DataFrame).
new_patient_id = "1234567-abcd-8901-efgh-12gf14sgh56i"
patients_updated.filter(F.col("id") == new_patient_id).show(5)


+--------------------+----------+---------+------+-----+-----------+-------+-------------+-----------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|                  id| birthdate|deathdate|gender| race|  ethnicity|marital|        state|       city|num_encounters|first_encounter|last_encounter|pain_min|pain_max|pain_avg|pain_count|num_medications|total_med_cost|num_conditions|age|is_deceased|
+--------------------+----------+---------+------+-----+-----------+-------+-------------+-----------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|b9c610cd-28a6-463...|2019-02-17|     NULL|     M|white|nonhispanic|unknown|Massachusetts|Springfield|            12|     2019-02-17|    2021-07-25|     0.0|     4.0|     1.8|        10|              2|       5313.63|             0|  6|          0|
|c1f

In [27]:
# Overwrite Delta table (ACID demo)
# patients_updated is a new DataFrame holding the full desired table contents
# (the existing rows plus the new patient), so the table is replaced in a single write.
# This demonstrates an atomic (all or nothing) write: either the entire table is
# replaced or nothing is applied.
# For a pure insert that leaves existing rows untouched, mode "append" with only the
# new rows would work as well.
patients_updated.write.save(delta_path, format="delta", mode="overwrite")

print("✅ Delta table successfully overwriten")


✅ Delta table successfully overwriten


In [31]:
# Verify table after overwrite: read the table back from its path.
patients_after = spark.read.load(delta_path, format="delta")

# Preview the first 5 rows with full (untruncated) values.
patients_after.show(n=5, truncate=False)

# Print the new row count next to the count taken before the overwrite.
rows_after = patients_after.count()
print(f"Row count after overwrite: {rows_after}")
print(f"Original row count: {original_count}")

+------------------------------------+----------+---------+------+-----+-----------+-------+-------------+-----------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|id                                  |birthdate |deathdate|gender|race |ethnicity  |marital|state        |city       |num_encounters|first_encounter|last_encounter|pain_min|pain_max|pain_avg|pain_count|num_medications|total_med_cost|num_conditions|age|is_deceased|
+------------------------------------+----------+---------+------+-----+-----------+-------+-------------+-----------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85|2019-02-17|NULL     |M     |white|nonhispanic|unknown|Massachusetts|Springfield|12            |2019-02-17     |2021-07-25    |0.0     |4.0     |1.8     |10        |2  