# Healthcare utilization and prediction



The dataset used is the Oh Canada dataset from Synthea (https://synthea.mitre.org/downloads).

In [72]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col
from pyspark.sql.types import (
    ArrayType,
    BooleanType,
    DoubleType,
    FloatType,
    StringType,
    StructField,
    StructType,
)

In [73]:
# Build the Spark session used by every later cell (getOrCreate returns an
# already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("FHIR Data Pipeline")
    .getOrCreate()
)

In [74]:
# Explicit schema for a FHIR Bundle whose entries carry Patient-shaped resources.
#
# NOTE(review): the reader cell that follows calls `json(file_path)` without a
# `schema=` argument, so this definition is not applied there and Spark infers
# the schema instead. The struct below only declares Patient fields; fields the
# later cells read from other resource types (e.g. `subject`, `period`,
# `valueQuantity`) are not declared, so passing it to the reader as-is would
# drop them.
schema = StructType([
    # Bundle-level fields
    StructField("resourceType", StringType(), True),
    StructField("type", StringType(), True),
    StructField("entry", ArrayType(
        StructType([
            StructField("fullUrl", StringType(), True),
            StructField("resource", StructType([
                StructField("resourceType", StringType(), True),
                StructField("id", StringType(), True),
                # Narrative block
                StructField("text", StructType([
                    StructField("status", StringType(), True),
                    StructField("div", StringType(), True)
                ]), True),
                # Resource-level extensions (only string-valued ones are declared)
                StructField("extension", ArrayType(
                    StructType([
                        StructField("url", StringType(), True),
                        StructField("valueString", StringType(), True)
                    ])
                ), True),
                StructField("identifier", ArrayType(
                    StructType([
                        StructField("system", StringType(), True),
                        StructField("value", StringType(), True)
                    ])
                ), True),
                # `given` and `prefix` are arrays: a name may have several of each
                StructField("name", ArrayType(
                    StructType([
                        StructField("use", StringType(), True),
                        StructField("family", StringType(), True),
                        StructField("given", ArrayType(StringType()), True),
                        StructField("prefix", ArrayType(StringType()), True)
                    ])
                ), True),
                StructField("telecom", ArrayType(
                    StructType([
                        StructField("system", StringType(), True),
                        StructField("value", StringType(), True),
                        StructField("use", StringType(), True)
                    ])
                ), True),
                StructField("gender", StringType(), True),
                StructField("birthDate", StringType(), True),
                StructField("address", ArrayType(
                    StructType([
                        # Geolocation extension: nested latitude/longitude entries
                        StructField("extension", ArrayType(
                            StructType([
                                StructField("url", StringType(), True),
                                StructField("extension", ArrayType(
                                    StructType([
                                        StructField("url", StringType(), True),
                                        # DoubleType, not FloatType: coordinates such as
                                        # 45.39148379000888 do not fit in a 32-bit float
                                        # without losing digits.
                                        StructField("valueDecimal", DoubleType(), True)
                                    ])
                                ), True)
                            ])
                        ), True),
                        StructField("line", ArrayType(StringType()), True),
                        StructField("city", StringType(), True),
                        StructField("state", StringType(), True),
                        StructField("postalCode", StringType(), True),
                        StructField("country", StringType(), True)
                    ])
                ), True),
                StructField("maritalStatus", StructType([
                    StructField("coding", ArrayType(
                        StructType([
                            StructField("system", StringType(), True),
                            StructField("code", StringType(), True),
                            StructField("display", StringType(), True)
                        ])
                    ), True),
                    StructField("text", StringType(), True)
                ]), True),
                StructField("multipleBirthBoolean", BooleanType(), True),
                StructField("communication", ArrayType(
                    StructType([
                        StructField("language", StructType([
                            StructField("coding", ArrayType(
                                StructType([
                                    StructField("system", StringType(), True),
                                    StructField("code", StringType(), True),
                                    StructField("display", StringType(), True)
                                ])
                            ), True),
                            StructField("text", StringType(), True)
                        ]), True)
                    ])
                ), True)
            ]), True),
            # Transaction request attached to each bundle entry
            StructField("request", StructType([
                StructField("method", StringType(), True),
                StructField("url", StringType(), True)
            ]), True)
        ])
    ), True)
])


In [75]:
# Step 1: load the FHIR bundle from disk
file_path = "data/fhir/Abdul218_Gusikowski974_9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1.json"

# multiLine=True lets Spark parse a JSON document that spans several lines;
# no schema is supplied, so it is inferred from the file.
df_raw = spark.read.json(file_path, multiLine=True)
df_raw.printSchema()
df_raw.head(1)

root
 |-- entry: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- fullUrl: string (nullable = true)
 |    |    |-- request: struct (nullable = true)
 |    |    |    |-- method: string (nullable = true)
 |    |    |    |-- url: string (nullable = true)
 |    |    |-- resource: struct (nullable = true)
 |    |    |    |-- abatementDateTime: string (nullable = true)
 |    |    |    |-- address: array (nullable = true)
 |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |-- city: string (nullable = true)
 |    |    |    |    |    |-- country: string (nullable = true)
 |    |    |    |    |    |-- extension: array (nullable = true)
 |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |-- extension: array (nullable = true)
 |    |    |    |    |    |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |    |    |    |    |    |-- url: stri

[Row(entry=[Row(fullUrl='urn:uuid:9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1', request=Row(method='POST', url='Patient'), resource=Row(abatementDateTime=None, address=[Row(city='Sherbrooke', country='CA', extension=[Row(extension=[Row(url='latitude', valueDecimal=45.39148379000888), Row(url='longitude', valueDecimal=-71.89263429922619)], url='http://hl7.org/fhir/StructureDefinition/geolocation')], line=['964 Wiza Dale'], postalCode='J1N', state='Quebec')], authoredOn=None, billablePeriod=None, birthDate='1975-10-22', careTeam=None, category=None, claim=None, class=None, clinicalStatus=None, code=None, communication=[Row(language=Row(coding=[Row(code='en-US', display='English', system='urn:ietf:bcp:47')], text='English'))], component=None, contained=None, created=None, deceasedDateTime='1984-09-18T13:01:28-04:00', diagnosis=None, effectiveDateTime=None, encounter=None, extension=[Row(url='http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName', valueAddress=None, valueDecimal=None,

In [76]:
# Step 2: turn the bundle's `entry` array into one row per entry,
# keeping the column name `entry` for the exploded struct.
bundle_entry = explode("entry").alias("entry")
entries_df = df_raw.select(bundle_entry)
entries_df.head()


Row(entry=Row(fullUrl='urn:uuid:9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1', request=Row(method='POST', url='Patient'), resource=Row(abatementDateTime=None, address=[Row(city='Sherbrooke', country='CA', extension=[Row(extension=[Row(url='latitude', valueDecimal=45.39148379000888), Row(url='longitude', valueDecimal=-71.89263429922619)], url='http://hl7.org/fhir/StructureDefinition/geolocation')], line=['964 Wiza Dale'], postalCode='J1N', state='Quebec')], authoredOn=None, billablePeriod=None, birthDate='1975-10-22', careTeam=None, category=None, claim=None, class=None, clinicalStatus=None, code=None, communication=[Row(language=Row(coding=[Row(code='en-US', display='English', system='urn:ietf:bcp:47')], text='English'))], component=None, contained=None, created=None, deceasedDateTime='1984-09-18T13:01:28-04:00', diagnosis=None, effectiveDateTime=None, encounter=None, extension=[Row(url='http://hl7.org/fhir/StructureDefinition/patient-mothersMaidenName', valueAddress=None, valueDecimal=None, v

In [77]:
# Distinct FHIR resource types present in the bundle
resource_type_column = col("entry.resource.resourceType").alias("resourceType")
resource_types_df = entries_df.select(resource_type_column).distinct()

# Sorted alphabetically for a stable, readable listing
resource_types_df.orderBy("resourceType").show(truncate=False)

+--------------------+
|resourceType        |
+--------------------+
|Claim               |
|Condition           |
|DiagnosticReport    |
|Encounter           |
|ExplanationOfBenefit|
|Immunization        |
|MedicationRequest   |
|Observation         |
|Patient             |
|Procedure           |
+--------------------+



In [78]:
# Patient demographics: field under `entry.resource` -> output column name
patient_field_aliases = {
    "id": "patient_id",
    "gender": "gender",
    "birthDate": "birth_date",
    "name": "name",
    "address": "address",
}

is_patient = col("entry.resource.resourceType") == "Patient"
patients_df = entries_df.filter(is_patient).select(
    *[
        col(f"entry.resource.{source_field}").alias(output_name)
        for source_field, output_name in patient_field_aliases.items()
    ]
)

# `name` and `address` are still nested arrays at this point
patients_df.show(truncate=False)


+------------------------------------+------+----------+---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|patient_id                          |gender|birth_date|name                                   |address                                                                                                                                                                    |
+------------------------------------+------+----------+---------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|male  |1975-10-22|[{Gusikowski974, [Abdul218], official}]|[{Sherbrooke, CA, [{[{latitude, 45.39148379000888}, {longitude, -71.89263429922619}], http://hl7.

In [79]:
# One row per (patient, name entry)
exploded_names = patients_df.select(
    "patient_id",
    explode("name").alias("name_details"),
)
names_df = exploded_names.select(
    "patient_id",
    col("name_details.family").alias("last_name"),
    # `given` is itself an array, so `first_name` remains an array column
    col("name_details.given").alias("first_name"),
)

# One row per (patient, address entry)
exploded_addresses = patients_df.select(
    "patient_id",
    explode("address").alias("address_details"),
)
addresses_df = exploded_addresses.select(
    "patient_id",
    col("address_details.city").alias("city"),
    col("address_details.state").alias("state"),
    col("address_details.country").alias("country"),
)

# Display both flattened tables in full width
names_df.show(truncate=False)
addresses_df.show(truncate=False)


+------------------------------------+-------------+----------+
|patient_id                          |last_name    |first_name|
+------------------------------------+-------------+----------+
|9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|Gusikowski974|[Abdul218]|
+------------------------------------+-------------+----------+

+------------------------------------+----------+------+-------+
|patient_id                          |city      |state |country|
+------------------------------------+----------+------+-------+
|9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|Sherbrooke|Quebec|CA     |
+------------------------------------+----------+------+-------+



In [80]:
# Encounters: (path under `entry.resource`, output column name)
encounter_columns = [
    ("id", "encounter_id"),
    ("subject.reference", "patient_reference"),
    ("period.start", "start_time"),
    ("period.end", "end_time"),
    ("serviceProvider.display", "service_provider"),
]

encounter_entries = entries_df.filter(
    col("entry.resource.resourceType") == "Encounter"
)
encounters_df = encounter_entries.select(
    *(col(f"entry.resource.{path}").alias(alias) for path, alias in encounter_columns)
)

# `patient_reference` carries the `urn:uuid:` prefix, unlike `patient_id` above
encounters_df.show(truncate=False)


+------------------------------------+---------------------------------------------+-------------------------+-------------------------+---------------------------+
|encounter_id                        |patient_reference                            |start_time               |end_time                 |service_provider           |
+------------------------------------+---------------------------------------------+-------------------------+-------------------------+---------------------------+
|bfc04826-0c53-f1b6-cdb2-6a49dba3402e|urn:uuid:9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|1975-10-22T13:01:28-04:00|1975-10-22T13:16:28-04:00|CLSC J-OLIVIER-CAMIRAND    |
|808b2737-7039-9fcf-d26a-c199c8db91a0|urn:uuid:9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|1975-11-26T12:01:28-05:00|1975-11-26T12:16:28-05:00|CLSC J-OLIVIER-CAMIRAND    |
|7645e9a2-b178-d528-9729-be9fd44093e4|urn:uuid:9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|1976-01-28T12:01:28-05:00|1976-01-28T12:16:28-05:00|CLSC J-OLIVIER-CAMIRAND    |
|5095c5d9-

In [81]:
# Observations: pull the quantity-valued measurements out of each resource.
# Fields are addressed from the `entry.resource` struct instead of spelling
# out the full dotted path every time.
observation_resource = col("entry.resource")

observations_df = entries_df.filter(
    observation_resource["resourceType"] == "Observation"
).select(
    observation_resource["id"].alias("observation_id"),
    observation_resource["subject"]["reference"].alias("patient_reference"),
    observation_resource["code"]["text"].alias("observation_text"),
    observation_resource["valueQuantity"]["value"].alias("value"),
    observation_resource["valueQuantity"]["unit"].alias("unit"),
)

# Display the observation rows without truncating long descriptions
observations_df.show(truncate=False)


+------------------------------------+---------------------------------------------+------------------------------------------------------------------------+------+-------+
|observation_id                      |patient_reference                            |observation_text                                                        |value |unit   |
+------------------------------------+---------------------------------------------+------------------------------------------------------------------------+------+-------+
|6686979d-3883-1892-715a-4c1a400db8c1|urn:uuid:9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|Body Height                                                             |52.6  |cm     |
|62d6c91c-a5a1-175b-8ffe-a7051e730e89|urn:uuid:9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|Pain severity - 0-10 verbal numeric rating [Score] - Reported           |3.0   |{score}|
|c77ae5fa-b25e-b423-df0a-438ce24ac549|urn:uuid:9a9ab84b-7e52-c718-c9ff-9b0c5dea86b1|Body Weight                                        

In [82]:
# Resource types to split out of the bundle. Types that do not occur in the
# loaded file simply produce an empty DataFrame.
resource_types = [
    "CarePlan", "CareTeam", "Claim", "Condition", "DiagnosticReport",
    "Encounter", "ExplanationOfBenefit", "Immunization", "MedicationRequest",
    "Observation", "Patient", "Procedure"
]

# resource type -> DataFrame holding every field of `entry.resource` for the
# entries of that type
resource_dfs = {
    resource_type: entries_df.filter(
        col("entry.resource.resourceType") == resource_type
    ).select(
        col("entry.resource.*")
    )
    for resource_type in resource_types
}

# Output folder for the per-resource Parquet datasets
output_folder = "checkpoint_1"

# exist_ok=True avoids the check-then-create race of
# `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs(output_folder, exist_ok=True)

# Save each resource DataFrame as Parquet under <output_folder>/<type in lowercase>;
# mode="overwrite" replaces the output of a previous run.
for resource_type, resource_df in resource_dfs.items():
    resource_df.write.parquet(f"{output_folder}/{resource_type.lower()}", mode="overwrite")

# Breaking down each Parquet dataset

In [83]:
# For every resource type saved in checkpoint_1, report which columns actually
# hold data and display the DataFrame restricted to those columns.
base_path = 'checkpoint_1'

output_folder = "checkpoint_2"
# exist_ok=True avoids the check-then-create race of exists() + makedirs()
os.makedirs(output_folder, exist_ok=True)

# Loop through each resource type
for resource_type in resource_types:
    folder_path = os.path.join(base_path, resource_type.lower())

    # Read the Parquet dataset written by the previous step
    df = spark.read.parquet(folder_path)

    print(f"Resource: {resource_type}")

    # Print the list of columns
    all_columns = df.columns
    print(f"All Columns: {all_columns}")

    # Identify non-null columns.
    # count(<column>) ignores NULLs, so a single aggregation returns the number
    # of non-null values for every column at once. The earlier
    # `distinct().count() > 1` test dropped fully populated columns holding one
    # constant value (e.g. `resourceType`) and ran one Spark job per column.
    count_exprs = [
        "count(`{}`)".format(column.replace("`", "``")) for column in all_columns
    ]
    non_null_counts = df.selectExpr(*count_exprs).first() if count_exprs else ()
    non_null_columns = [
        column
        for column, non_null_count in zip(all_columns, non_null_counts)
        if non_null_count > 0
    ]
    print(f"Non-Null Columns: {non_null_columns}")

    # Create a DataFrame with only non-null columns
    df_non_null = df.select(*non_null_columns)

    print(f"DataFrame with Non-Null Columns for {resource_type}:")
    df_non_null.show(truncate=False)

    # NOTE(review): persisting the result is left disabled, as in the original.
    # Enabling it likely needs a guard for resource types with no non-null
    # columns, since Spark is expected to reject writing an empty schema — confirm.
    # df_non_null.write.parquet(f"{output_folder}/{resource_type.lower()}_non_null", mode="overwrite")

Resource: CarePlan
All Columns: ['abatementDateTime', 'address', 'authoredOn', 'billablePeriod', 'birthDate', 'careTeam', 'category', 'claim', 'class', 'clinicalStatus', 'code', 'communication', 'component', 'contained', 'created', 'deceasedDateTime', 'diagnosis', 'effectiveDateTime', 'encounter', 'extension', 'gender', 'hospitalization', 'id', 'identifier', 'insurance', 'insurer', 'intent', 'issued', 'item', 'maritalStatus', 'medicationCodeableConcept', 'multipleBirthBoolean', 'name', 'occurrenceDateTime', 'onsetDateTime', 'outcome', 'participant', 'patient', 'payment', 'performedPeriod', 'period', 'prescription', 'primarySource', 'priority', 'procedure', 'provider', 'reasonCode', 'reasonReference', 'recordedDate', 'referral', 'requester', 'resourceType', 'result', 'serviceProvider', 'status', 'subject', 'supportingInfo', 'telecom', 'text', 'total', 'type', 'use', 'vaccineCode', 'valueCodeableConcept', 'valueQuantity', 'verificationStatus']
Non-Null Columns: []
DataFrame with Non-Null