In [None]:
# Step 1: Download CCDA Files from Presigned S3 URLs
import requests
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType
import xml.etree.ElementTree as ET
from pyspark.sql.functions import col
from pyspark.sql.functions import length
from pyspark.sql.functions import lit, concat, col
from pyspark.sql import Row
from pyspark.sql.functions import col, to_date

In [None]:
url_file = "/content/drive/MyDrive/millman/ccda_pre_signed_urls.csv"
ccda_dir = "/content/ccda_files"
# Seconds to wait on connect/read before giving up on a single download;
# without a timeout one stalled connection blocks the whole cell indefinitely.
download_timeout_s = 30
os.makedirs(ccda_dir, exist_ok=True)

with open(url_file, 'r') as file:
    # Skip the header row; the default keeps an empty file from raising StopIteration
    next(file, None)
    for line in file:
        url = line.strip()
        if not url:
            continue
        # Strip the query string first so a "/" inside it is never mistaken
        # for a path separator, then keep the last path segment.
        filename = url.split("?", 1)[0].rsplit("/", 1)[-1]
        output_path = os.path.join(ccda_dir, filename)
        try:
            response = requests.get(url, timeout=download_timeout_s)
            response.raise_for_status()
            with open(output_path, 'wb') as f:
                f.write(response.content)
        except requests.exceptions.HTTPError as e:
            # Report only status and reason: str(e) embeds the full presigned URL,
            # including its signature and security token.
            if e.response is not None:
                detail = f"HTTP {e.response.status_code} {e.response.reason}"
            else:
                detail = "HTTP error"
            print(f"Failed to download {filename}: {detail}")
        except requests.exceptions.RequestException as e:
            # Connection errors, timeouts, etc. should skip this file, not stop the loop.
            # Only the exception type is printed because the message can contain the URL.
            print(f"Failed to download {filename}: {type(e).__name__}")


Failed to download 0ww66gj1-5627-705o-1719-2710c04560aa_034c3eab764e9bf9dae33996c3371e5e64ab73b3_masked.xml: 403 Client Error: Forbidden for url: https://mil-s3-portal-intelli-intelli-irixehr-staging-prod.s3.amazonaws.com/trove/out_sample/0ww66gj1-5627-705o-1719-2710c04560aa/0ww66gj1-5627-705o-1719-2710c04560aa_034c3eab764e9bf9dae33996c3371e5e64ab73b3_masked.xml?AWSAccessKeyId=ASIASIMKARLOUA4AW7TS&Signature=1xWfx7fUEiBuVHMa%2Blg1gAz0wP8%3D&x-amz-security-token=IQoJb3JpZ2luX2VjEFYaCXVzLWVhc3QtMSJGMEQCIALTzK5A5njwiNFzoXHZtpQqLziNHVMtmjqiL%2FXdEOczAiBD4cEQganUACkX%2BIztz8imqaX1NOm97pduQ5nDNpd0gyqmAwhPEAMaDDE1NTQ0NTEzNjA5MyIM422bX2OAr7zSWiMvKoMDaMCVA1cEEPMcEH%2BpUFjAe6mTzxXvaR07i6f3t7P7Q91OtGr2Gt76uiI36thBtbUjnOzWQF4KofE5KZIySZCO4wO7gqhFmUd2nk8PQosb%2B%2BrgSR0scNKopeicFDkepijetHcTPlUZKRpUJsTIeyt6GbN73vlqAvUAJeUYLvdemI5k%2FPUHfjPZUl2liJmnIyKDKn1VMK%2Fian%2FpF8TAejGZ%2FDTaezGZgMXx87NR8CpD3S7hF5%2Bp%2BwjAEc65jFFOIB6RB2GWNsdhlKq1gGgzbg7AATDQ8y3cDhsJSBecWpdSV9GbaX%2BWNek3dR2PNSAdlKfQd2xLOSN%2Bp

In [None]:
# Step 2: Parse Medications and Problems from CCDA into DataFrames
# Attach to the running Spark session, or start one if none exists yet
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Directory holding the downloaded CCDA XML documents
ccda_dir = "/content/ccda_files"

# Updated parser using MemberID extracted from filename
def parse_ccda(filepath):
    """Extract medication and problem codes from one CCDA XML document.

    Args:
        filepath: Path to the CCDA XML file.

    Returns:
        A ``(medications, problems)`` pair. Each is a list of
        ``(member_id, code, displayName)`` tuples; ``code`` and ``displayName``
        are ``None`` when the attribute is absent on the matched element.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        OSError: if the file cannot be opened.
    """
    # Give ElementTree the path so it reads raw bytes and honors the encoding in
    # the XML declaration. Opening in text mode decoded the file with the
    # platform default encoding first, which fails or garbles non-default encodings.
    root = ET.parse(filepath).getroot()

    # MemberID is everything before the first "." of the file name.
    # NOTE(review): downloaded files appear to be named "<id>_<hash>_masked.xml";
    # if so this value will not equal the claims MemberID -- confirm.
    member_id = os.path.basename(filepath).split('.')[0]

    def _collect_codes(tag):
        """Return (member_id, code, displayName) for the first code under each <tag>."""
        rows = []
        # "{*}" matches the element in any XML namespace
        for element in root.findall(f".//{{*}}{tag}"):
            code = element.find(".//{*}code")
            if code is not None:
                rows.append((member_id, code.attrib.get("code"), code.attrib.get("displayName")))
        return rows

    medications = _collect_codes("substanceAdministration")
    problems = _collect_codes("observation")

    return medications, problems

# Aggregating all parsed results
all_meds, all_probs = [], []

# Sorted so files are processed in the same order on every run
for fname in sorted(os.listdir(ccda_dir)):
    filepath = os.path.join(ccda_dir, fname)
    # Skip sub-directories (e.g. .ipynb_checkpoints); they cannot be opened as files
    if not os.path.isfile(filepath):
        continue
    try:
        meds, probs = parse_ccda(filepath)
    except ET.ParseError as e:
        # One malformed document should not discard every other file's results
        print(f"Skipping {fname}: not well-formed XML ({e})")
        continue
    all_meds.extend(meds)
    all_probs.extend(probs)

# Define schema for Spark DataFrames
# An explicit schema is needed so the DataFrames can be created even when the lists are empty
med_schema = StructType([
    StructField("MemberID", StringType(), True),
    StructField("MedicationCode", StringType(), True),
    StructField("MedicationName", StringType(), True)
])

prob_schema = StructType([
    StructField("MemberID", StringType(), True),
    StructField("ProblemCode", StringType(), True),
    StructField("ProblemName", StringType(), True)
])

# Create Spark DataFrames
meds_df = spark.createDataFrame(all_meds, schema=med_schema)
probs_df = spark.createDataFrame(all_probs, schema=prob_schema)

# Show few records
meds_df.show(5)
probs_df.show(5)

+--------+--------------+--------------+
|MemberID|MedicationCode|MedicationName|
+--------+--------------+--------------+
+--------+--------------+--------------+

+--------+-----------+-----------+
|MemberID|ProblemCode|ProblemName|
+--------+-----------+-----------+
+--------+-----------+-----------+



In [None]:
# Step 3: Load and Join with Claims Data
# Diagnosis claims: header row gives the column names, Spark infers the types
claims_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/millman/data_engineer_exam_claims_final.csv")
)
# Prescription (Rx) claims, read the same way
rx_claims_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/drive/MyDrive/millman/data_engineer_exam_rx_final.csv")
)

# Preview the first five rows of each claims file
claims_df.show(5)
rx_claims_df.show(5)

# Left joins keep every CCDA-derived row even when the member has no matching claim
# Medications pair with Rx claims; problems pair with diagnosis claims
merged_meds = meds_df.join(rx_claims_df, "MemberID", "left")
merged_probs = probs_df.join(claims_df, "MemberID", "left")

+--------------+---------------+-------+----------+--------------------+---------+---------+---------+---------+----------+-------+-----+--------+---------+------+---+------------+---------+-------------+-------------+--------+-----------+---------+------+-------+------+---+-----+-----------+----------+----------+----+-----+----------+---------------+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---+------+---+----------+-------+-

In [None]:
# Step 4: Save to Parquet for Databricks Ingestion
# Root folder for everything this step writes
output_dir = "/content/processed_data"

# Parquet copies (preferred for Databricks); existing output is replaced
merged_meds.write.parquet(f"{output_dir}/merged_medications.parquet", mode="overwrite")
merged_probs.write.parquet(f"{output_dir}/merged_problems.parquet", mode="overwrite")

# CSV copies with a header row (less efficient but human-readable)
merged_meds.write.csv(f"{output_dir}/merged_medications_csv", mode="overwrite", header=True)
merged_probs.write.csv(f"{output_dir}/merged_problems_csv", mode="overwrite", header=True)

# Preview the first rows of both joined DataFrames
merged_meds.show(5)
merged_probs.show(5)

+--------+--------------+--------------+--------------+-------+----------+--------+--------+---+-------------+--------------+----------+---------+--------------+-------------+------+-------+----+---+-----+-----------+----------+----------+-----+----------+-----------------+---+------+---+----------+-------+-------+-----------+------+------+---+-------------+----------------+---------------+------------+--------+-----------+-------+-----------+------------+--------+-------+-----------+-------------+--------+----------------+------------------------+-------------------+---------+----------+-------+--------+--------------+-------------+-------------+----------+------------+--------------+--------------+---------------+----------------+-------------+----------------+---------------------+---------------------+---------------------------+----+----+
|MemberID|MedicationCode|MedicationName|SequenceNumber|ClaimID|ContractID|FromDate|PaidDate|NDC|EncounterFlag|MedicalCovered|ProviderID|MailOrde

In [None]:
# Validation: Filter and inspect medication codes based on length
# `meds_joined` was never defined in this notebook, so a fresh kernel raised NameError.
# Build it from the joined medications, using the column names the later steps expect.
meds_joined = (
    merged_meds
    .withColumnRenamed("MemberID", "patient_id")
    .withColumnRenamed("MedicationCode", "med_code")
    .withColumnRenamed("MedicationName", "med_name")
)

# Filter valid medication codes (length > 2)
valid_meds = meds_joined.filter(length("med_code") > 2)

# Display valid medication codes
print("Valid Medication Codes (length > 2):")
valid_meds.show(10, truncate=False)

# Filter invalid medication codes (length <= 2); a missing code counts as invalid too.
# length() of a null is null, so without the explicit null check those rows
# would appear in neither the valid nor the invalid set.
invalid_meds = meds_joined.filter(col("med_code").isNull() | (length("med_code") <= 2))

# Display invalid medication codes
print("Invalid Medication Codes (length <= 2):")
invalid_meds.show(10, truncate=False)


Valid Medication Codes (length > 2):
+----------+--------+--------+--------------+-------+-------+----------+--------+--------+------+---------+---------+--------+-------+-----+--------+---------+------+---+------------+---------+-------------+----------+--------+-----------+---------+------+-------+----+---+-----+-----------+----------+----------+----+-----+----------+---------------+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+

In [None]:
# MedicationStatement
# One row per validated medication, reshaped to the fields of a FHIR MedicationStatement.
# (The former patient_id -> patient_id rename was a no-op and has been removed.)
meds_fhir = (
    valid_meds
    .withColumnRenamed("med_code", "medication_code")
    .withColumnRenamed("med_name", "medication_display")
    .withColumn("subject_reference", concat(lit("Patient/"), col("patient_id")))
    .withColumn("resource_type", lit("MedicationStatement"))
    .select("resource_type", "subject_reference", "medication_code", "medication_display")
)

# Condition
# `probs_joined` was never defined in this notebook, so a fresh kernel raised NameError.
# Use the joined problems DataFrame and its actual column names instead.
probs_fhir = (
    merged_probs
    .withColumnRenamed("ProblemCode", "condition_code")
    .withColumnRenamed("ProblemName", "condition_display")
    .withColumn("subject_reference", concat(lit("Patient/"), col("MemberID")))
    .withColumn("resource_type", lit("Condition"))
    .select("resource_type", "subject_reference", "condition_code", "condition_display")
)

# Write to disk
meds_fhir.write.mode("overwrite").parquet("/content/data/output/fhir_medicationstatement")
probs_fhir.write.mode("overwrite").parquet("/content/data/output/fhir_condition")

In [None]:
# Unit Tests for Clinical Data Pipeline (Step 8: Optional)
overview_path = "/content/drive/MyDrive/millman/data_overview.csv"

# Header row supplies the column names; Spark infers the column types
overview_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(overview_path)
)


# Raw Claims Data Quality Tests

# Checks that all ClaimID values in the claims file are non-null
def test_claimid_not_null(spark, df):
    """Assert that no row of ``df`` has a null ClaimID.

    Prints a five-row preview of ``df`` before checking.

    Args:
        spark: Spark session; accepted but not used by this check.
        df: Claims DataFrame with a ``ClaimID`` column.

    Raises:
        AssertionError: if one or more rows have a null ClaimID.
    """
    print("First 5 rows of claims data:")
    df.show(5, truncate=False)

    rows_without_claim_id = df.where(col("ClaimID").isNull())
    n_missing = rows_without_claim_id.count()
    assert n_missing == 0, f"Found {n_missing} null ClaimID(s)"


# Checks that all MemberID values in a given DataFrame are non-null
def test_memberid_not_null(spark, df, name="claims"):
    """Assert that no row of ``df`` has a null MemberID.

    Prints a five-row preview of ``df`` before checking.

    Args:
        spark: Spark session; accepted but not used by this check.
        df: DataFrame with a ``MemberID`` column.
        name: Label for the dataset, used in the printed heading and the failure message.

    Raises:
        AssertionError: if one or more rows have a null MemberID.
    """
    print(f"First 5 rows of {name} data:")
    df.show(5, truncate=False)

    rows_without_member_id = df.where(col("MemberID").isNull())
    n_missing = rows_without_member_id.count()
    assert n_missing == 0, f"Found {n_missing} null MemberID(s) in {name} data"


# Validates that all NDC codes in Rx claims follow a numeric pattern (at least 5 digits)
def test_ndc_format(rx_df):
    """Assert that every NDC value in ``rx_df`` consists of five or more digits.

    Prints a five-row preview of ``rx_df`` before checking. Rows whose NDC is
    null are not counted as invalid: the negated match evaluates to null for
    them, and the filter keeps only rows where it is true.

    Args:
        rx_df: Rx claims DataFrame with an ``NDC`` column.

    Raises:
        AssertionError: if one or more rows fail the pattern.
    """
    print("First 5 rows of Rx data:")
    rx_df.show(5, truncate=False)

    ndc_is_all_digits = col("NDC").rlike("^[0-9]{5,}$")
    n_invalid = rx_df.where(~ndc_is_all_digits).count()
    assert n_invalid == 0, f"{n_invalid} Rx claims have invalid NDC format"


# Ensures the overview file has the expected metadata columns
def test_overview_file_loaded(df):
    print("First 5 rows of overview file:")
    df.show(5, truncate=False)

    required_columns = ["File_Name", "Type", "Patient_Identifier_Information"]
    for col_name in required_columns:
        assert col_name in df.columns, f"Missing column: {col_name}"


# Referential Integrity Test

# Ensures all files listed in the overview file are present in the actual inputs
def test_referential_integrity_overview(overview_df, actual_files):
    # Filter out rows with invalid or descriptive text instead of actual file names
    overview_files = (
        overview_df
        .select("File_Name")
        .dropna()
        .rdd
        .map(lambda row: row.File_Name.strip())
        .filter(lambda name: name.endswith(".csv") or name.endswith(".xml"))
        .collect()
    )

    missing = [f for f in overview_files if f not in actual_files]

    print("Overview files listed:", overview_files)
    print("Actual input files:", actual_files)

    assert len(missing) == 0, f"Files listed in overview but not found: {missing}"



# Date Field Logic Test

# Validates that FromDate is not after ToDate in the claims file
def test_date_order(claims_df):
    """Assert that no claim has a FromDate later than its ToDate.

    Both columns are parsed with the "M/d/yyyy" pattern before comparison, and
    up to five offending rows are shown. Rows where the comparison evaluates
    to null are not selected by the filter, so they are not counted.

    Args:
        claims_df: Claims DataFrame with ``FromDate`` and ``ToDate`` columns.

    Raises:
        AssertionError: if one or more rows have FromDate after ToDate.
    """
    print("Validating FromDate < ToDate in claims data...")

    from_parsed = to_date(col("FromDate"), "M/d/yyyy")
    to_parsed = to_date(col("ToDate"), "M/d/yyyy")
    with_parsed_dates = (
        claims_df
        .withColumn("FromDateParsed", from_parsed)
        .withColumn("ToDateParsed", to_parsed)
    )

    bad_dates = with_parsed_dates.where(col("FromDateParsed") > col("ToDateParsed"))
    bad_dates.show(5, truncate=False)

    n_bad = bad_dates.count()
    assert n_bad == 0, f"Found {n_bad} records where FromDate is after ToDate"


# Run All Tests

print("Running extended tests on raw input files...\n")

# Null-key checks on the raw claims inputs
test_claimid_not_null(spark, claims_df)
for frame, label in ((claims_df, "claims"), (rx_claims_df, "rx_claims")):
    test_memberid_not_null(spark, frame, name=label)

# Format and metadata checks
test_ndc_format(rx_claims_df)
test_overview_file_loaded(overview_df)

# File names actually used as inputs in this notebook, for the referential check
actual_file_names = [
    "ccda_pre_signed_urls.csv",
    "data_engineer_exam_claims_final.csv",
    "data_engineer_exam_rx_final.csv",
]
test_referential_integrity_overview(overview_df, actual_file_names)

# Date ordering check on the diagnosis claims
test_date_order(claims_df)

print("\nAll extended data quality tests passed.")


Running extended tests on raw input files...

First 5 rows of claims data:
+--------------+---------------+-------+----------+------------------------------------+---------+---------+---------+---------+----------+-------+-----+--------+---------+------+---+------------+---------+-------------+-------------+--------+-----------+---------+------+-------+------+---+-----+-----------+----------+----------+----+-----+----------+---------------+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+--------+--------+--------+--------+--------+--------+--------+--------+--------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----