In [1]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession

# Start from a named session builder, then register Delta Lake's SQL
# extension and catalog implementation on it one setting at a time.
builder = SparkSession.builder.appName("Synthea Delta Pipeline")
builder = builder.config(
    "spark.sql.extensions",
    "io.delta.sql.DeltaSparkSessionExtension",
)
builder = builder.config(
    "spark.sql.catalog.spark_catalog",
    "org.apache.spark.sql.delta.catalog.DeltaCatalog",
)

# Let the delta helper attach the delta-spark pip package to the builder,
# then obtain the (possibly already running) session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()


:: loading settings :: url = jar:file:/Users/armandas.lidzius/Documents/Git/.data/lib/python3.13/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/armandas.lidzius/.ivy2.5.2/cache
The jars for the packages stored in: /Users/armandas.lidzius/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-2571f9fb-3cd1-43e2-913d-617de28dec75;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 70ms :: artifacts dl 3ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |           

In [2]:
# Load the Delta table produced by exploration_and_transformation.ipynb
delta_reader = spark.read.format("delta")
patients_delta = delta_reader.load("../data/clean/patients_main")

# Preview the first 10 rows without truncating long cell values
patients_delta.show(n=10, truncate=False)


25/09/11 19:20:36 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+------------------------------------+----------+---------+------+-----+-----------+-------+-------------+---------------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|id                                  |birthdate |deathdate|gender|race |ethnicity  |marital|state        |city           |num_encounters|first_encounter|last_encounter|pain_min|pain_max|pain_avg|pain_count|num_medications|total_med_cost|num_conditions|age|is_deceased|
+------------------------------------+----------+---------+------+-----+-----------+-------+-------------+---------------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|b9c610cd-28a6-4636-ccb6-c7a0d2a4cb85|2019-02-17|NULL     |M     |white|nonhispanic|unknown|Massachusetts|Springfield    |12            |2019-02-17     |2021-07-25    |0.0     |4.0     |1.8    

In [3]:
# Show schema: print each column's name, data type, and nullability as a tree
patients_delta.printSchema()


root
 |-- id: string (nullable = true)
 |-- birthdate: date (nullable = true)
 |-- deathdate: date (nullable = true)
 |-- gender: string (nullable = true)
 |-- race: string (nullable = true)
 |-- ethnicity: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- state: string (nullable = true)
 |-- city: string (nullable = true)
 |-- num_encounters: long (nullable = true)
 |-- first_encounter: date (nullable = true)
 |-- last_encounter: date (nullable = true)
 |-- pain_min: double (nullable = true)
 |-- pain_max: double (nullable = true)
 |-- pain_avg: double (nullable = true)
 |-- pain_count: long (nullable = true)
 |-- num_medications: long (nullable = true)
 |-- total_med_cost: double (nullable = true)
 |-- num_conditions: long (nullable = true)
 |-- age: long (nullable = true)
 |-- is_deceased: integer (nullable = true)



In [4]:
# Total number of patient rows in the table
patient_row_count = patients_delta.count()
print(patient_row_count)


1163


In [7]:
# PySpark DataFrame methods (filter/where, select, agg, ...) play the same
# role as the WHERE, SELECT, and GROUP BY clauses of a SQL query.

# Keep only patients whose age exceeds 50 and display the first 5 of them
patients_over_50 = patients_delta.where(patients_delta["age"] > 50)
patients_over_50.show(5)


+--------------------+----------+----------+------+-----+-----------+-------+-------------+---------------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|                  id| birthdate| deathdate|gender| race|  ethnicity|marital|        state|           city|num_encounters|first_encounter|last_encounter|pain_min|pain_max|pain_avg|pain_count|num_medications|total_med_cost|num_conditions|age|is_deceased|
+--------------------+----------+----------+------+-----+-----------+-------+-------------+---------------+--------------+---------------+--------------+--------+--------+--------+----------+---------------+--------------+--------------+---+-----------+
|c0219ca9-576f-f7c...|1971-12-06|      NULL|     F|white|   hispanic|      M|Massachusetts|     Somerville|            38|     1990-01-29|    2021-09-29|     0.0|     2.0|     1.2|         5|             12|      43412.18|            31| 

In [8]:
# Patients whose average pain score is above 3 (id and score only, first 5)
high_pain_patients = patients_delta.where(patients_delta["pain_avg"] > 3)
high_pain_patients.select("id", "pain_avg").show(5)


+--------------------+--------+
|                  id|pain_avg|
+--------------------+--------+
|0288c42c-43a1-987...|    4.23|
|55a6a46e-a1a4-029...|    4.08|
|2f031d4a-b070-ce1...|    3.88|
|060e72d3-912e-55c...|    3.71|
|a7b903a5-5c6f-3f5...|     4.4|
+--------------------+--------+
only showing top 5 rows


In [12]:
# Mean of total_med_cost across all patients, expressed as a
# column -> aggregate-function mapping
avg_cost_spec = {"total_med_cost": "avg"}
patients_delta.agg(avg_cost_spec).show()


+-------------------+
|avg(total_med_cost)|
+-------------------+
|  63549.27981943252|
+-------------------+

