In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, expr

# Build the Spark session used by this notebook (getOrCreate reuses an active one if present)
spark = (
    SparkSession.builder
    .appName("FHIR Data Pipeline")
    .getOrCreate()
)


In [19]:

# Location of the FHIR sample input, relative to the working directory
file_path = "data/fhir/samples"

# Load the JSON input; the multiLine option lets one JSON document span several lines
data = spark.read.option("multiLine", True).json(file_path)

# Turn each element of the `entry` array into its own row, kept under the column name "entry"
entries = data.select(explode("entry").alias("entry"))


### Patients

In [20]:

# Keep only Patient resources and promote the resource's fields to top-level columns
patients = (
    entries
    .filter(col("entry.resource.resourceType") == "Patient")
    .select(col("entry.resource.*"))
)

# FHIR extensions are identified by their `url`, not by their position in the array.
# Looking them up by URL keeps every column correct when a patient lacks one of the
# extensions or when the export contains additional ones (which would shift positions).
# The first two URLs are HL7-defined Patient extensions; the last two are the URLs
# Synthea uses for its DALY/QALY extensions.
# NOTE(review): confirm the two Synthea URLs against the sample data.
MOTHERS_MAIDEN_NAME_URL = "http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName"
BIRTH_PLACE_URL = "http://hl7.org/fhir/StructureDefinition/patient-birthPlace"
DALY_URL = "http://synthetichealth.github.io/synthea/disability-adjusted-life-years"
QALY_URL = "http://synthetichealth.github.io/synthea/quality-adjusted-life-years"

# SQL for the sub-extensions of the first address's geolocation extension
GEOLOCATION_SQL = (
    "filter(address[0].extension, "
    "x -> x.url = 'http://hl7.org/fhir/StructureDefinition/geolocation')[0].extension"
)


def _first_extension(array_sql, url):
    """Build a Column selecting the first extension in an array whose `url` matches.

    Args:
        array_sql: Spark SQL expression (string) that evaluates to an array of
            extension structs, e.g. ``"extension"``.
        url: Extension URL to match exactly.

    Returns:
        A Column holding the first matching extension struct, using the same
        ``filter(...)[0]`` pattern as the official-name lookup below.
    """
    return expr("filter({0}, x -> x.url = '{1}')[0]".format(array_sql, url))


# Extract the "official" name
official_name = expr("""
    filter(name, x -> x.use = 'official')[0]
""")

# Sub-extensions of the geolocation extension on the first address.
# The geolocation extension carries child extensions whose url is 'latitude' / 'longitude'.
geolocation = expr(GEOLOCATION_SQL)
latitude = _first_extension(GEOLOCATION_SQL, "latitude").getField("valueDecimal")
longitude = _first_extension(GEOLOCATION_SQL, "longitude").getField("valueDecimal")

# Extract all languages from communication as a list
communication_languages = expr("""
    transform(communication, x -> x.language.text)
""")

# Extract identifier types (display text of each identifier's first type coding)
identifier_types = expr("""
    transform(identifier, x -> x.type.coding[0].display)
""")

# Patient-level extensions, resolved by URL
mothers_maiden_name = _first_extension("extension", MOTHERS_MAIDEN_NAME_URL).getField("valueString")
birthplace = _first_extension("extension", BIRTH_PLACE_URL).getField("valueAddress")
disability_adjusted_life_years = _first_extension("extension", DALY_URL).getField("valueDecimal")
quality_adjusted_life_years = _first_extension("extension", QALY_URL).getField("valueDecimal")

# Only the first address entry is used for the location columns
first_address = col("address").getItem(0)

# Create the enhanced patient DataFrame (column names and order are unchanged)
patient_df = patients.select(
    col("id").alias("patient_id"),
    identifier_types.alias("identifier_types"),
    official_name.getField("family").alias("last_name"),
    official_name.getField("given").getItem(0).alias("first_name"),
    col("gender").alias("gender"),
    col("birthDate").alias("birth_date"),
    first_address.getField("city").alias("city"),
    first_address.getField("state").alias("state"),
    first_address.getField("country").alias("country"),
    first_address.getField("postalCode").alias("postal_code"),
    latitude.alias("latitude"),
    longitude.alias("longitude"),
    # NOTE(review): assumes the first telecom entry is the phone number — confirm,
    # or select the entry whose system is 'phone'.
    col("telecom").getItem(0).getField("value").alias("phone"),
    col("maritalStatus.text").alias("marital_status"),
    mothers_maiden_name.alias("mothers_maiden_name"),
    birthplace.getField("city").alias("birthplace_city"),
    birthplace.getField("state").alias("birthplace_state"),
    birthplace.getField("country").alias("birthplace_country"),
    disability_adjusted_life_years.alias("disability_adjusted_life_years"),
    quality_adjusted_life_years.alias("quality_adjusted_life_years"),
    col("multipleBirthBoolean").alias("multiple_birth"),
    communication_languages.alias("languages")
)

# Show the resulting DataFrame
patient_df.show(truncate=False)


+------------------------------------+----------------------------------------------------------------------------------------+-------------+------------+------+----------+-------+------+-------+-----------+-----------------+------------------+------------+--------------+--------------------+---------------+----------------+------------------+------------------------------+---------------------------+--------------+---------+
|patient_id                          |identifier_types                                                                        |last_name    |first_name  |gender|birth_date|city   |state |country|postal_code|latitude         |longitude         |phone       |marital_status|mothers_maiden_name |birthplace_city|birthplace_state|birthplace_country|disability_adjusted_life_years|quality_adjusted_life_years|multiple_birth|languages|
+------------------------------------+----------------------------------------------------------------------------------------+-------------

In [21]:
# Write the patient table as Parquet under "patients", replacing any existing output there
patient_df.write.parquet("patients", mode="overwrite")

print("Patient DataFrame has been successfully saved in Parquet format in the 'patients' folder.")

Patient DataFrame has been successfully saved in Parquet format in the 'patients' folder.


### Encounters

In [37]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import ArrayType, StructType, StructField, StringType

# Schema used to decode the 'type' field:
# an array of {coding: [{system, code, display}], text}
coding_struct = (
    StructType()
    .add("system", StringType(), True)
    .add("code", StringType(), True)
    .add("display", StringType(), True)
)
type_schema = ArrayType(
    StructType()
    .add("coding", ArrayType(coding_struct), True)
    .add("text", StringType(), True)
)


# Encounter resources only, flattened to top-level columns, plus a structured
# copy of 'type' decoded from its JSON-string form.
# NOTE(review): 'type' is presumably inferred as a string because its shape
# differs between resource types in the input — verify against the inferred schema.
is_encounter = col("entry.resource.resourceType") == "Encounter"
encounters = (
    entries
    .where(is_encounter)
    .select("entry.resource.*")
    .withColumn("type_parsed", from_json("type", type_schema))
)

# One output row per (encounter, participant): explode the participant array
# next to the encounter-level attributes.
encounters_with_participants = encounters.select(
    col("id").alias("encounter_id"),
    col("status").alias("status"),
    col("class.code").alias("class_code"),  # todo - find a better column name
    col("type_parsed")[0]["text"].alias("type_text"),  # todo - find a better column name
    col("subject.reference").alias("patient_reference"),
    col("period.start").alias("start_time"),
    col("period.end").alias("end_time"),
    col("serviceProvider.reference").alias("service_provider_id"),
    col("serviceProvider.display").alias("service_provider_display"),
    explode("participant").alias("participant"),
)

# First coding of the participant's first type entry
participant_coding = col("participant.type")[0]["coding"][0]

# Flatten the participant struct into plain columns.
# participant.member.display / participant.member.reference are currently not extracted.
encounter_with_participant_df = encounters_with_participants.select(
    "encounter_id",
    "status",
    "class_code",
    "type_text",
    "start_time",
    "end_time",
    "patient_reference",
    "service_provider_id",
    "service_provider_display",
    col("participant.individual.display").alias("participant_individual_display"),
    col("participant.individual.reference").alias("participant_individual_reference"),
    col("participant.period.start").alias("participant_period_start"),
    col("participant.period.end").alias("participant_period_end"),
    participant_coding["code"].alias("participant_type_code"),
    participant_coding["display"].alias("participant_type_display"),
)

# Print the result without truncating cell contents
encounter_with_participant_df.show(truncate=False)


+------------------------------------+--------+----------+------------------------------------------+-------------------------+-------------------------+---------------------------------------------+-------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------+------------------------------+-----------------------------------------------------------------+-------------------------+-------------------------+---------------------+------------------------+
|encounter_id                        |status  |class_code|type_text                                 |start_time               |end_time                 |patient_reference                            |service_provider_id                                                                                    |service_provider_display                                           |participant_individual_display|participant_individual_referen

In [38]:
# Write the per-participant encounter table as Parquet under "participant_details",
# replacing any existing output there
encounter_with_participant_df.write.parquet("participant_details", mode="overwrite")

print("Participant details have been successfully extracted and saved in the 'participant_details' folder.")


Participant details have been successfully extracted and saved in the 'participant_details' folder.
