# Imports and Setup
This initial section loads all necessary Python libraries and sets up key configuration variables. It defines the paths for the source data and the output database, ensuring the output directory is ready for use.

In [1]:
import duckdb
import pandas as pd
from pathlib import Path
import orjson
import multiprocessing
import os
from tqdm import tqdm

# --- 1. SETUP ---
# NOTE: Adjust these paths if your directory structure is different.
DATA_DIR = Path("../data")
FHIR_DIR = DATA_DIR / "fhir"
OUTPUT_DIR = Path("../output")

# Create the output directory if it doesn't exist
OUTPUT_DIR.mkdir(exist_ok=True) 
DB_FILE = OUTPUT_DIR / "synthea_fhir.duckdb"

# --- For a quick test, you can limit the number of files to process ---
# Set to None to process all files
MAX_FILES_TO_PROCESS = None

# Helper Functions
Here, we define a set of reusable helper functions. Each one is a small utility designed to safely extract a specific piece of information from the deeply nested FHIR JSON structure, which simplifies the main parsing logic.

In [2]:
def get_clean_id(ref_dict: dict) -> str | None:
    """Extracts and robustly cleans the ID from a raw FHIR reference dictionary."""
    if not (ref_dict and isinstance(ref_dict, dict) and 'reference' in ref_dict):
        return None
    ref_string = ref_dict['reference']
    last_delim_pos = max(ref_string.rfind('/'), ref_string.rfind(':'))
    return ref_string[last_delim_pos + 1:] if last_delim_pos != -1 else ref_string

def get_extension_value(resource: dict, url: str) -> str | float | None:
    """Finds a specific extension by URL and returns its value."""
    if not resource or not isinstance(resource, dict):
        return None
    for ext in resource.get('extension', []):
        if ext.get('url') == url:
            # ADD 'valueInteger' to this list
            for key in ['valueDecimal', 'valueString', 'valueDate', 'valueCode', 'valueInteger']:
                if key in ext:
                    return ext[key]
            if 'valueMoney' in ext and 'value' in ext['valueMoney']:
                return ext['valueMoney']['value']
    return None

def get_identifier(resource: dict, system_url: str) -> str | None:
    """Finds a specific identifier by its system URL."""
    for identifier in resource.get('identifier', []):
        if identifier.get('system') == system_url:
            return identifier.get('value')
    return None

def get_first_address(resource: dict) -> dict | None:
    """Safely gets the first address entry from a resource."""
    if addresses := resource.get('address'):
        return addresses[0]
    return None

def get_coding(codeable_concept: dict) -> dict:
    """Extracts the first coding from a CodeableConcept."""
    if codeable_concept and (codings := codeable_concept.get('coding')):
        return codings[0]
    return {}

def get_us_core_extension_value(resource: dict, url: str) -> str | None:
    """
    Specifically parses the complex US Core Race & Ethnicity extensions, 
    which have a nested valueCoding structure.
    """
    if not resource or not isinstance(resource, dict):
        return None
    for ext in resource.get('extension', []):
        if ext.get('url') == url:
            # The value is inside another 'extension' list
            nested_ext = next(iter(ext.get('extension', [])))
            if nested_ext and 'valueCoding' in nested_ext:
                return nested_ext['valueCoding'].get('display')
    return None

# Resource Parsing Functions
This section contains the core parsing logic, with a dedicated function for each FHIR resource type (e.g., Patient, Encounter). These functions take a raw FHIR resource and transform its data into a clean, flat structure that matches our database schema.

In [3]:
def parse_patient(resource: dict) -> dict:
    """Parses a FHIR Patient resource with robust error handling."""
    
    primary_name = resource.get('name', [{}])[0]
    maiden_name_obj = next((name for name in resource.get('name', []) if name.get('use') == 'maiden'), None)
    address = get_first_address(resource) or {}
    
    lat, lon = None, None
    if geo_ext := get_extension_value(address, "http://hl7.org/fhir/StructureDefinition/geolocation"):
        lat_ext = next((ext for ext in geo_ext.get('extension', []) if ext.get('url') == 'latitude'), None)
        lon_ext = next((ext for ext in geo_ext.get('extension', []) if ext.get('url') == 'longitude'), None)
        if lat_ext: lat = lat_ext.get('valueDecimal')
        if lon_ext: lon = lon_ext.get('valueDecimal')
        
    given_names = primary_name.get('given', [])
    first_name = given_names[0] if given_names else None
    middle_name = given_names[1] if len(given_names) > 1 else None

    return {
        "Id": resource.get('id'),
        "BirthDate": resource.get('birthDate'),
        "DeathDate": resource.get('deceasedDateTime'),
        "SSN": get_identifier(resource, "http://hl7.org/fhir/sid/us-ssn"),
        "Drivers": get_identifier(resource, "urn:oid:2.16.840.1.113883.4.3.25"),
        "Passport": get_identifier(resource, "http://standardhealthrecord.org/fhir/sid/passport-number"),
        "Prefix": next(iter(primary_name.get('prefix', [])), None),
        "First": first_name,
        "Middle": middle_name,
        "Last": primary_name.get('family'),
        "Suffix": next(iter(primary_name.get('suffix', [])), None),
        "Maiden": maiden_name_obj.get('family') if maiden_name_obj else None,
        "Marital": get_coding(resource.get('maritalStatus')).get('code'),
        "Race": get_us_core_extension_value(resource, "http://hl7.org/fhir/us/core/StructureDefinition/us-core-race"),
        "Ethnicity": get_us_core_extension_value(resource, "http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity"),
        "Gender": resource.get('gender'),
        "BirthPlace": address.get('city'),
        "Address": next(iter(address.get('line', [])), None),
        "City": address.get('city'),
        "State": address.get('state'),
        "County": get_extension_value(address, "http://hl7.org/fhir/StructureDefinition/iso21090-ADXP-county"),
        "FIPS": get_extension_value(address, "http://synthetichealth.github.io/synthea/fips-county-code"),
        "Zip": address.get('postalCode'),
        "Lat": lat,
        "Lon": lon,
        "Healthcare_Expenses": get_extension_value(resource, "http://synthetichealth.github.io/synthea/financial-information#healthcare-expenses"),
        "Healthcare_Coverage": get_extension_value(resource, "http://synthetichealth.github.io/synthea/financial-information#healthcare-coverage"),
        "Income": None
    }


def parse_sdoh_observation(resource: dict) -> dict | None:
    """
    Parses a FHIR Observation resource specifically to find the PRAPARE
    survey and extract the patient's income from its components.
    """
    if get_coding(resource.get('code')).get('code') != '93025-5':
        return None

    patient_id = get_clean_id(resource.get('subject'))
    income = None

    for component in resource.get('component', []):
        if get_coding(component.get('code')).get('code') == '63586-2':
            if value_quantity := component.get('valueQuantity'):
                income = value_quantity.get('value')
                break 
    
    if patient_id and income is not None:
        return {"Patient": patient_id, "Income": income}
    
    return None

def parse_condition(resource: dict) -> dict:
    """Parses a FHIR Condition resource."""
    code_info = get_coding(resource.get('code'))
    return {
        "Start": resource.get('onsetDateTime'),
        "Stop": resource.get('abatementDateTime'),
        "Patient": get_clean_id(resource.get('subject')),
        "Encounter": get_clean_id(resource.get('encounter')),
        "System": code_info.get('system'),
        "Code": code_info.get('code'),
        "Description": code_info.get('display'),
    }

def parse_procedure(resource: dict) -> dict:
    """Parses a FHIR Procedure resource."""
    code_info = get_coding(resource.get('code'))
    reason_info = get_coding(next(iter(resource.get('reasonCode', [])), None))
    return {
        "Start": resource.get('performedPeriod', {}).get('start'),
        "Stop": resource.get('performedPeriod', {}).get('end'),
        "Patient": get_clean_id(resource.get('subject')),
        "Encounter": get_clean_id(resource.get('encounter')),
        "System": code_info.get('system'),
        "Code": code_info.get('code'),
        "Description": code_info.get('display'),
        "Base_Cost": get_extension_value(resource, "http://synthetichealth.github.io/synthea/financial-information#base-cost"),
        "ReasonCode": reason_info.get('code'),
        "ReasonDescription": reason_info.get('display'),
    }

def parse_medication(resource: dict) -> dict:
    """Parses a FHIR MedicationRequest resource."""
    code_info = get_coding(resource.get('medicationCodeableConcept'))
    reason_info = get_coding(next(iter(resource.get('reasonCode', [])), None))
    return {
        "Start": resource.get('authoredOn'),
        "Stop": resource.get('dispenseRequest', {}).get('validityPeriod', {}).get('end'),
        "Patient": get_clean_id(resource.get('subject')),
        "Payer": None, 
        "Encounter": get_clean_id(resource.get('encounter')),
        "Code": code_info.get('code'),
        "Description": code_info.get('display'),
        "Base_Cost": get_extension_value(resource, "http://synthetichealth.github.io/synthea/financial-information#base-cost"),
        "Payer_Coverage": get_extension_value(resource, "http://synthetichealth.github.io/synthea/financial-information#payer-coverage"),
        "Dispenses": 1 + resource.get('dispenseRequest', {}).get('numberOfRepeatsAllowed', 0),
        "TotalCost": get_extension_value(resource, "http://synthetichealth.github.io/synthea/financial-information#total-cost"),
        "ReasonCode": reason_info.get('code'),
        "ReasonDescription": reason_info.get('display'),
    }
    
def parse_encounter(resource: dict) -> dict:
    """Parses a FHIR Encounter resource."""
    primary_type_coding = get_coding(next(iter(resource.get('type', [])), None))
    primary_reason_coding = get_coding(next(iter(resource.get('reasonCode', [])), None))
    return {
        "Id": resource.get('id'),
        "Start": resource.get('period', {}).get('start'),
        "Stop": resource.get('period', {}).get('end'),
        "Patient": get_clean_id(resource.get('subject')),
        "Organization": get_clean_id(resource.get('serviceProvider')),
        "Provider": get_clean_id(next(iter(resource.get('participant', [])), {}).get('individual')),
        "Payer": None,
        "EncounterClass": resource.get('class', {}).get('code'),
        "Code": primary_type_coding.get('code'),
        "Description": primary_type_coding.get('display'),
        "Base_Encounter_Cost": get_extension_value(resource, "http://synthetichealth.github.io/synthea/financial-information#base-encounter-cost"),
        "Total_Claim_Cost": None,
        "Payer_Coverage": get_extension_value(resource, "http://synthetichealth.github.io/synthea/financial-information#payer-coverage"),
        "ReasonCode": primary_reason_coding.get('code'),
        "ReasonDescription": primary_reason_coding.get('display'),
    }

def parse_explanation_of_benefit(resource: dict) -> dict:
    """Parses a FHIR ExplanationOfBenefit resource to get financial data."""
    total_cost_money = next(iter(resource.get('total', [])), {}).get('amount', {})
    
    encounter_ref = None
    if encounters := resource.get('encounter'):
        encounter_ref = encounters[0]
    elif items := resource.get('item'):
        if item_encounters := items[0].get('encounter'):
            encounter_ref = item_encounters[0]

    return {
        "Encounter": get_clean_id(encounter_ref),
        "Payer": resource.get('insurer', {}).get('display'),
        "Total_Claim_Cost": total_cost_money.get('value'),
    }

# Multiprocessing Worker Function
This defines the primary worker function that will be run in parallel. It reads a single FHIR bundle file, uses the appropriate parsing functions to extract data, and then enriches the records by linking related information together (e.g., adding financial data to encounters).

In [4]:
def process_file(file_path: Path) -> dict:
    """
    Processes a single FHIR JSON file, parsing all relevant resources
    and enriching encounters with EOB data and patients with SDOH data.
    """
    data = {
        "patients": [],
        "encounters": [],
        "conditions": [],
        "procedures": [],
        "medications": []
    }
    
    # Temp lists to hold records for enrichment pass
    eobs_raw = []
    sdoh_obs_raw = []

    try:
        with open(file_path, "rb") as f:
            bundle = orjson.loads(f.read())
        
        # --- First Pass: Parse all resources ---
        for entry in bundle.get('entry', []):
            resource = entry.get('resource', {})
            if not resource: continue
            
            resource_type = resource.get('resourceType')

            if resource_type == 'Patient':
                data['patients'].append(parse_patient(resource))
            elif resource_type == 'Encounter':
                data['encounters'].append(parse_encounter(resource))
            elif resource_type == 'Condition':
                data['conditions'].append(parse_condition(resource))
            elif resource_type == 'Procedure':
                data['procedures'].append(parse_procedure(resource))
            elif resource_type == 'MedicationRequest':
                data['medications'].append(parse_medication(resource))
            elif resource_type == 'ExplanationOfBenefit':
                eobs_raw.append(parse_explanation_of_benefit(resource))
            elif resource_type == 'Observation':
                # Check if it's the specific SDOH survey we want
                if sdoh_data := parse_sdoh_observation(resource):
                    sdoh_obs_raw.append(sdoh_data)

        # --- Second Pass: Enrich Patients with Income ---
        if sdoh_obs_raw and data["patients"]:
            income_lookup = {obs['Patient']: obs['Income'] for obs in sdoh_obs_raw if obs}
            for patient in data["patients"]:
                if income := income_lookup.get(patient["Id"]):
                    patient["Income"] = income

        # --- Third Pass: Enrich Encounters with EOB data ---
        if eobs_raw and data["encounters"]:
            eob_lookup = {eob['Encounter']: eob for eob in eobs_raw if eob and eob.get('Encounter')}
            for encounter in data["encounters"]:
                if eob_data := eob_lookup.get(encounter["Id"]):
                    encounter["Payer"] = eob_data.get("Payer")
                    encounter["Total_Claim_Cost"] = eob_data.get("Total_Claim_Cost")

    except Exception as e:
        pass # Silently ignore files that fail to parse
        
    return data

# Database Initialization
This cell prepares the DuckDB database. It establishes a connection, clears out any old tables from previous runs, and then creates the new, empty tables with the correct structure to hold the processed FHIR data.

In [5]:
print(f"Connecting to DuckDB database: {DB_FILE}")
con = duckdb.connect(str(DB_FILE), read_only=False)

print("Dropping existing tables...")
con.execute("DROP TABLE IF EXISTS patients;")
con.execute("DROP TABLE IF EXISTS encounters;")
con.execute("DROP TABLE IF EXISTS conditions;")
con.execute("DROP TABLE IF EXISTS procedures;")
con.execute("DROP TABLE IF EXISTS medications;")

print("Creating new tables...")

# Patients Table
con.execute("""
CREATE TABLE patients (
    Id VARCHAR PRIMARY KEY,
    BirthDate DATE,
    DeathDate DATE,
    SSN VARCHAR,
    Drivers VARCHAR,
    Passport VARCHAR,
    Prefix VARCHAR,
    First VARCHAR,
    Middle VARCHAR,          
    Last VARCHAR,
    Suffix VARCHAR,
    Maiden VARCHAR,          
    Marital VARCHAR(1),
    Race VARCHAR,
    Ethnicity VARCHAR,
    Gender VARCHAR,
    BirthPlace VARCHAR,
    Address VARCHAR,
    City VARCHAR,
    State VARCHAR,
    County VARCHAR,
    FIPS VARCHAR,            
    Zip VARCHAR,
    Lat FLOAT,
    Lon FLOAT,
    Healthcare_Expenses FLOAT,
    Healthcare_Coverage FLOAT,
    Income BIGINT            
);
""")

# Conditions Table
con.execute("""
CREATE TABLE conditions (
    Start DATE,
    Stop DATE,
    Patient VARCHAR,
    Encounter VARCHAR,
    System VARCHAR,
    Code VARCHAR,
    Description VARCHAR
);
""")

# Procedures Table
con.execute("""
CREATE TABLE procedures (
    Start TIMESTAMP,
    Stop TIMESTAMP,
    Patient VARCHAR,
    Encounter VARCHAR,
    System VARCHAR,
    Code VARCHAR,
    Description VARCHAR,
    Base_Cost FLOAT,
    ReasonCode VARCHAR,
    ReasonDescription VARCHAR
);
""")

# Medications Table
con.execute("""
CREATE TABLE medications (
    Start TIMESTAMP,
    Stop TIMESTAMP,
    Patient VARCHAR,
    Payer VARCHAR,
    Encounter VARCHAR,
    Code VARCHAR,
    Description VARCHAR,
    Base_Cost FLOAT,
    Payer_Coverage FLOAT,
    Dispenses INTEGER,
    TotalCost FLOAT,
    ReasonCode VARCHAR,
    ReasonDescription VARCHAR
);
""")

# Encounters Table (from original script, kept for completeness)
con.execute("""
CREATE TABLE encounters (
    Id VARCHAR PRIMARY KEY,
    Start TIMESTAMP,
    Stop TIMESTAMP,
    Patient VARCHAR,
    Organization VARCHAR,
    Provider VARCHAR,
    Payer VARCHAR,
    EncounterClass VARCHAR,
    Code VARCHAR,
    Description VARCHAR,
    Base_Encounter_Cost FLOAT,
    Total_Claim_Cost FLOAT,
    Payer_Coverage FLOAT,
    ReasonCode VARCHAR,
    ReasonDescription VARCHAR
);
""")

print("✅ All tables created successfully.")

Connecting to DuckDB database: ../output/synthea_fhir.duckdb
Dropping existing tables...
Creating new tables...
✅ All tables created successfully.


# Data Processing and Insertion
This is the main execution cell that brings everything together. It finds all the FHIR data files, groups them into smaller batches, and then uses multiple CPU cores to process these batches in parallel. As each batch is completed, the clean data is efficiently inserted into the database.

In [6]:
# --- Set a batch size ---
# This is the number of files to process in each chunk.
# Adjust this based on your system's RAM. 5000 is a safe start.
BATCH_SIZE = 5000

# --- Find all files to process ---
print("Finding all FHIR JSON bundles...")
fhir_files = list(FHIR_DIR.rglob("*.json"))
# The MAX_FILES_TO_PROCESS from Cell 1 can still be used for testing
if MAX_FILES_TO_PROCESS:
    fhir_files = fhir_files[:MAX_FILES_TO_PROCESS]
total_files = len(fhir_files)
print(f"Found {total_files:,} files to process.")

# --- Create chunks of files ---
file_chunks = [fhir_files[i:i + BATCH_SIZE] for i in range(0, total_files, BATCH_SIZE)]
num_chunks = len(file_chunks)
print(f"Processing in {num_chunks} batches of up to {BATCH_SIZE} files each.")

# --- Set up multiprocessing ---
num_processes = os.cpu_count()
print(f"Using {num_processes} worker processes.")

# --- Process each chunk in a loop ---
for i, chunk in enumerate(file_chunks):
    print(f"\n--- Processing Batch {i+1}/{num_chunks} ({len(chunk)} files) ---")
    
    # Process the current chunk in parallel
    with multiprocessing.Pool(processes=num_processes) as pool:
        results = list(tqdm(pool.imap_unordered(process_file, chunk), total=len(chunk)))

    # Aggregate results for THIS BATCH ONLY
    batch_data = {
        "patients": [item for res in results for item in res.get("patients", [])],
        "encounters": [item for res in results for item in res.get("encounters", [])],
        "conditions": [item for res in results for item in res.get("conditions", [])],
        "procedures": [item for res in results for item in res.get("procedures", [])],
        "medications": [item for res in results for item in res.get("medications", [])],
    }

    # Insert this batch's data into DuckDB
    for name, data_list in batch_data.items():
        if data_list:
            print(f"  Inserting {len(data_list):,} records into '{name}'...")
            df = pd.DataFrame(data_list)
            # Use DuckDB's efficient append from DataFrame
            con.append(name, df)
        else:
            print(f"  No data for '{name}' in this batch.")
            
    print(f"✅ Batch {i+1}/{num_chunks} complete.")

print("\n🎉 All batches processed successfully!")

Finding all FHIR JSON bundles...
Found 111,280 files to process.
Processing in 23 batches of up to 5000 files each.
Using 32 worker processes.

--- Processing Batch 1/23 (5000 files) ---


100%|██████████| 5000/5000 [00:37<00:00, 131.66it/s]


  Inserting 5,000 records into 'patients'...
  Inserting 264,528 records into 'encounters'...
  Inserting 162,851 records into 'conditions'...
  Inserting 716,865 records into 'procedures'...
  Inserting 198,529 records into 'medications'...
✅ Batch 1/23 complete.

--- Processing Batch 2/23 (5000 files) ---


100%|██████████| 5000/5000 [00:50<00:00, 98.43it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 275,301 records into 'encounters'...
  Inserting 163,708 records into 'conditions'...
  Inserting 728,094 records into 'procedures'...
  Inserting 195,792 records into 'medications'...
✅ Batch 2/23 complete.

--- Processing Batch 3/23 (5000 files) ---


100%|██████████| 5000/5000 [00:51<00:00, 97.79it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 269,196 records into 'encounters'...
  Inserting 164,887 records into 'conditions'...
  Inserting 725,307 records into 'procedures'...
  Inserting 212,188 records into 'medications'...
✅ Batch 3/23 complete.

--- Processing Batch 4/23 (5000 files) ---


100%|██████████| 5000/5000 [00:48<00:00, 102.14it/s]


  Inserting 5,000 records into 'patients'...
  Inserting 268,963 records into 'encounters'...
  Inserting 163,279 records into 'conditions'...
  Inserting 726,682 records into 'procedures'...
  Inserting 196,848 records into 'medications'...
✅ Batch 4/23 complete.

--- Processing Batch 5/23 (5000 files) ---


100%|██████████| 5000/5000 [00:48<00:00, 103.27it/s]


  Inserting 5,000 records into 'patients'...
  Inserting 257,642 records into 'encounters'...
  Inserting 161,481 records into 'conditions'...
  Inserting 692,195 records into 'procedures'...
  Inserting 201,145 records into 'medications'...
✅ Batch 5/23 complete.

--- Processing Batch 6/23 (5000 files) ---


100%|██████████| 5000/5000 [00:49<00:00, 101.73it/s]


  Inserting 5,000 records into 'patients'...
  Inserting 262,146 records into 'encounters'...
  Inserting 164,216 records into 'conditions'...
  Inserting 710,565 records into 'procedures'...
  Inserting 193,012 records into 'medications'...
✅ Batch 6/23 complete.

--- Processing Batch 7/23 (5000 files) ---


100%|██████████| 5000/5000 [00:51<00:00, 97.17it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 269,844 records into 'encounters'...
  Inserting 167,990 records into 'conditions'...
  Inserting 731,086 records into 'procedures'...
  Inserting 212,842 records into 'medications'...
✅ Batch 7/23 complete.

--- Processing Batch 8/23 (5000 files) ---


100%|██████████| 5000/5000 [00:53<00:00, 94.32it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 264,227 records into 'encounters'...
  Inserting 162,398 records into 'conditions'...
  Inserting 712,069 records into 'procedures'...
  Inserting 201,207 records into 'medications'...
✅ Batch 8/23 complete.

--- Processing Batch 9/23 (5000 files) ---


100%|██████████| 5000/5000 [00:53<00:00, 94.27it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 257,265 records into 'encounters'...
  Inserting 160,992 records into 'conditions'...
  Inserting 699,884 records into 'procedures'...
  Inserting 194,240 records into 'medications'...
✅ Batch 9/23 complete.

--- Processing Batch 10/23 (5000 files) ---


100%|██████████| 5000/5000 [00:53<00:00, 92.65it/s] 


  Inserting 4,999 records into 'patients'...
  Inserting 266,441 records into 'encounters'...
  Inserting 163,710 records into 'conditions'...
  Inserting 714,411 records into 'procedures'...
  Inserting 206,762 records into 'medications'...
✅ Batch 10/23 complete.

--- Processing Batch 11/23 (5000 files) ---


100%|██████████| 5000/5000 [00:51<00:00, 97.80it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 254,768 records into 'encounters'...
  Inserting 160,485 records into 'conditions'...
  Inserting 694,020 records into 'procedures'...
  Inserting 188,678 records into 'medications'...
✅ Batch 11/23 complete.

--- Processing Batch 12/23 (5000 files) ---


100%|██████████| 5000/5000 [00:50<00:00, 98.52it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 267,143 records into 'encounters'...
  Inserting 160,659 records into 'conditions'...
  Inserting 711,532 records into 'procedures'...
  Inserting 191,427 records into 'medications'...
✅ Batch 12/23 complete.

--- Processing Batch 13/23 (5000 files) ---


100%|██████████| 5000/5000 [00:50<00:00, 98.44it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 262,094 records into 'encounters'...
  Inserting 160,538 records into 'conditions'...
  Inserting 712,069 records into 'procedures'...
  Inserting 190,452 records into 'medications'...
✅ Batch 13/23 complete.

--- Processing Batch 14/23 (5000 files) ---


100%|██████████| 5000/5000 [00:50<00:00, 98.45it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 276,078 records into 'encounters'...
  Inserting 163,067 records into 'conditions'...
  Inserting 733,659 records into 'procedures'...
  Inserting 194,203 records into 'medications'...
✅ Batch 14/23 complete.

--- Processing Batch 15/23 (5000 files) ---


100%|██████████| 5000/5000 [00:50<00:00, 99.88it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 271,555 records into 'encounters'...
  Inserting 160,913 records into 'conditions'...
  Inserting 726,396 records into 'procedures'...
  Inserting 191,632 records into 'medications'...
✅ Batch 15/23 complete.

--- Processing Batch 16/23 (5000 files) ---


100%|██████████| 5000/5000 [00:51<00:00, 97.87it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 267,262 records into 'encounters'...
  Inserting 162,440 records into 'conditions'...
  Inserting 725,214 records into 'procedures'...
  Inserting 207,770 records into 'medications'...
✅ Batch 16/23 complete.

--- Processing Batch 17/23 (5000 files) ---


100%|██████████| 5000/5000 [00:50<00:00, 98.84it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 266,390 records into 'encounters'...
  Inserting 165,025 records into 'conditions'...
  Inserting 727,900 records into 'procedures'...
  Inserting 207,250 records into 'medications'...
✅ Batch 17/23 complete.

--- Processing Batch 18/23 (5000 files) ---


100%|██████████| 5000/5000 [00:51<00:00, 97.54it/s] 


  Inserting 4,999 records into 'patients'...
  Inserting 260,360 records into 'encounters'...
  Inserting 162,615 records into 'conditions'...
  Inserting 706,722 records into 'procedures'...
  Inserting 202,853 records into 'medications'...
✅ Batch 18/23 complete.

--- Processing Batch 19/23 (5000 files) ---


100%|██████████| 5000/5000 [00:49<00:00, 100.17it/s]


  Inserting 5,000 records into 'patients'...
  Inserting 260,086 records into 'encounters'...
  Inserting 161,714 records into 'conditions'...
  Inserting 702,627 records into 'procedures'...
  Inserting 202,218 records into 'medications'...
✅ Batch 19/23 complete.

--- Processing Batch 20/23 (5000 files) ---


100%|██████████| 5000/5000 [00:52<00:00, 94.97it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 275,262 records into 'encounters'...
  Inserting 165,851 records into 'conditions'...
  Inserting 738,982 records into 'procedures'...
  Inserting 202,866 records into 'medications'...
✅ Batch 20/23 complete.

--- Processing Batch 21/23 (5000 files) ---


100%|██████████| 5000/5000 [00:50<00:00, 98.75it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 263,543 records into 'encounters'...
  Inserting 158,681 records into 'conditions'...
  Inserting 704,953 records into 'procedures'...
  Inserting 190,799 records into 'medications'...
✅ Batch 21/23 complete.

--- Processing Batch 22/23 (5000 files) ---


100%|██████████| 5000/5000 [00:55<00:00, 89.97it/s] 


  Inserting 5,000 records into 'patients'...
  Inserting 275,524 records into 'encounters'...
  Inserting 165,624 records into 'conditions'...
  Inserting 732,203 records into 'procedures'...
  Inserting 215,933 records into 'medications'...
✅ Batch 22/23 complete.

--- Processing Batch 23/23 (1280 files) ---


100%|██████████| 1280/1280 [00:13<00:00, 93.24it/s] 


  Inserting 1,280 records into 'patients'...
  Inserting 70,472 records into 'encounters'...
  Inserting 42,316 records into 'conditions'...
  Inserting 192,549 records into 'procedures'...
  Inserting 52,383 records into 'medications'...
✅ Batch 23/23 complete.

🎉 All batches processed successfully!


# Verification and Cleanup
After the data loading is complete, this final section runs checks to verify the process was successful. It counts the total number of rows inserted into each table and displays a small sample from each one. Finally, it closes the connection to the database.

In [7]:
print("\n--- Verifying Data Insertion ---")

# Define the list of tables to check, in case the kernel was restarted
TABLE_NAMES = ["patients", "encounters", "conditions", "procedures", "medications"]

for table_name in TABLE_NAMES:
    try:
        count = con.execute(f"SELECT COUNT(*) FROM {table_name};").fetchone()[0]
        print(f"\nTotal rows in '{table_name}': {count:,}")
        if count > 0:
            display(con.execute(f"SELECT * FROM {table_name} LIMIT 5;").fetchdf())
    except Exception as e:
        print(f"\nCould not verify table '{table_name}': {e}")
        
# --- Close the database connection ---
# Make sure the connection 'con' is still open before trying to close it.
# If you get an error here, it's safe to ignore as the connection may already be closed.
try:
    con.close()
    print("\nDatabase connection closed.")
except Exception as e:
    print(f"\nCould not close connection (it may already be closed): {e}")


--- Verifying Data Insertion ---

Total rows in 'patients': 111,278


Unnamed: 0,Id,BirthDate,DeathDate,SSN,Drivers,Passport,Prefix,First,Middle,Last,...,City,State,County,FIPS,Zip,Lat,Lon,Healthcare_Expenses,Healthcare_Coverage,Income
0,295e0a8b-359e-9096-9a39-265d03df2cac,2025-07-19,NaT,999-67-8964,,,,Aaron697,Dale454,Sauer652,...,San Antonio,TX,,,78210,,,,,
1,7b0e0003-89c8-6020-da41-033cf174c76f,1986-04-03,NaT,999-90-1478,S99952781,,Mr.,Aaron697,Bradly656,Cronin387,...,Dallas,TX,,,75166,,,,,3029.0
2,7fc520c4-cb59-fa5d-c639-8accb3a4acb7,1997-11-22,NaT,999-94-8302,S99952940,,Mr.,Aaron697,Rodrick370,Halvorson124,...,Lakeshore Gardens-Hidden Acres,TX,,,0,,,,,24184.0
3,efa6d2a3-c5dc-70ac-ee73-910a1e03370b,1996-10-25,NaT,999-15-9922,S99937036,,Mr.,Aaron697,,Kemmer137,...,Dallas,TX,,,75204,,,,,71529.0
4,b9bd30eb-8e1f-961b-ebf2-394583e8b0be,1983-08-16,NaT,999-51-6065,S99924520,,Mr.,Aaron697,Mohamed943,Conroy74,...,Irving,TX,,,75247,,,,,26062.0



Total rows in 'encounters': 5,926,090


Unnamed: 0,Id,Start,Stop,Patient,Organization,Provider,Payer,EncounterClass,Code,Description,Base_Encounter_Cost,Total_Claim_Cost,Payer_Coverage,ReasonCode,ReasonDescription
0,bb0765f1-acb7-5022-d9ea-dd097eedca75,2025-07-19 07:09:37,2025-07-19 07:24:37,295e0a8b-359e-9096-9a39-265d03df2cac,synthea|6be86010-1af1-30d3-bf1d-8317e71604e6,us-npi|9999748590,UnitedHealthcare,AMB,410620009,Well child visit (procedure),,767.400024,,,
1,d47805b8-9644-4d9b-4cde-043d3f3878d7,2025-08-23 07:09:37,2025-08-23 07:24:37,295e0a8b-359e-9096-9a39-265d03df2cac,synthea|6be86010-1af1-30d3-bf1d-8317e71604e6,us-npi|9999748590,UnitedHealthcare,AMB,410620009,Well child visit (procedure),,526.409973,,,
2,d9f75434-be7a-663f-8702-8ac72afe10fd,1997-05-18 10:35:30,1997-05-23 08:41:12,7b0e0003-89c8-6020-da41-033cf174c76f,synthea|a3014a4e-f7c0-3a70-b91c-cf184b56e922,us-npi|9999927798,Medicaid,IMP,185347001,Encounter for problem (procedure),,6600.910156,,74400008.0,Appendicitis (disorder)
3,c68f3876-a9d1-0b3c-b905-732dfd208924,2004-05-27 07:35:30,2004-05-27 08:33:49,7b0e0003-89c8-6020-da41-033cf174c76f,synthea|449ffd2f-0774-34c5-b311-459d2e80ef64,us-npi|9999714899,Medicaid,AMB,162673000,General examination of patient (procedure),,783.460022,,,
4,69231b75-3b06-6489-9b73-2bb5143eea4c,2008-06-05 07:35:30,2008-06-05 08:19:07,7b0e0003-89c8-6020-da41-033cf174c76f,synthea|449ffd2f-0774-34c5-b311-459d2e80ef64,us-npi|9999714899,Anthem,AMB,162673000,General examination of patient (procedure),,1310.219971,,,



Total rows in 'conditions': 3,625,440


Unnamed: 0,Start,Stop,Patient,Encounter,System,Code,Description
0,2025-07-19,2025-07-19,295e0a8b-359e-9096-9a39-265d03df2cac,bb0765f1-acb7-5022-d9ea-dd097eedca75,http://snomed.info/sct,314529007,Medication review due (situation)
1,2025-08-23,2025-08-23,295e0a8b-359e-9096-9a39-265d03df2cac,d47805b8-9644-4d9b-4cde-043d3f3878d7,http://snomed.info/sct,314529007,Medication review due (situation)
2,1997-05-18,NaT,7b0e0003-89c8-6020-da41-033cf174c76f,d9f75434-be7a-663f-8702-8ac72afe10fd,http://snomed.info/sct,428251008,History of appendectomy (situation)
3,2004-05-27,NaT,7b0e0003-89c8-6020-da41-033cf174c76f,c68f3876-a9d1-0b3c-b905-732dfd208924,http://snomed.info/sct,105531004,Housing unsatisfactory (finding)
4,2004-05-27,NaT,7b0e0003-89c8-6020-da41-033cf174c76f,c68f3876-a9d1-0b3c-b905-732dfd208924,http://snomed.info/sct,224299000,Received higher education (finding)



Total rows in 'procedures': 15,965,984


Unnamed: 0,Start,Stop,Patient,Encounter,System,Code,Description,Base_Cost,ReasonCode,ReasonDescription
0,2025-07-19 07:09:37,2025-07-19 07:24:37,295e0a8b-359e-9096-9a39-265d03df2cac,bb0765f1-acb7-5022-d9ea-dd097eedca75,http://snomed.info/sct,430193006,Medication reconciliation (procedure),,,
1,2025-08-23 07:09:37,2025-08-23 07:24:37,295e0a8b-359e-9096-9a39-265d03df2cac,d47805b8-9644-4d9b-4cde-043d3f3878d7,http://snomed.info/sct,430193006,Medication reconciliation (procedure),,,
2,2017-06-15 07:35:30,2017-06-15 07:50:30,7b0e0003-89c8-6020-da41-033cf174c76f,cb4c078c-eca3-fd33-e13e-9e982235d01a,http://snomed.info/sct,430193006,Medication reconciliation (procedure),,,
3,2017-06-15 07:35:30,2017-06-15 08:23:33,7b0e0003-89c8-6020-da41-033cf174c76f,cb4c078c-eca3-fd33-e13e-9e982235d01a,http://snomed.info/sct,710824005,Assessment of health and social care needs (pr...,,,
4,2017-06-15 08:23:33,2017-06-15 08:49:56,7b0e0003-89c8-6020-da41-033cf174c76f,cb4c078c-eca3-fd33-e13e-9e982235d01a,http://snomed.info/sct,866148006,Screening for domestic abuse (procedure),,,



Total rows in 'medications': 4,451,029


Unnamed: 0,Start,Stop,Patient,Payer,Encounter,Code,Description,Base_Cost,Payer_Coverage,Dispenses,TotalCost,ReasonCode,ReasonDescription
0,2020-07-02 10:30:49,NaT,7b0e0003-89c8-6020-da41-033cf174c76f,,faaf4aef-a993-05ba-3680-e7173e7935ba,,,,,1,,103697008.0,Patient referral for dental care (procedure)
1,2021-10-17 08:58:54,NaT,7b0e0003-89c8-6020-da41-033cf174c76f,,e9a384c1-2f41-fc39-0ad3-d8573fa1f9df,309309.0,ciprofloxacin 500 MG Oral Tablet,,,1,,,
2,2023-07-06 09:59:01,NaT,7b0e0003-89c8-6020-da41-033cf174c76f,,591e9763-3ed7-9b92-08d6-736bbc53c757,,,,,1,,,
3,2024-02-10 06:46:44,NaT,7b0e0003-89c8-6020-da41-033cf174c76f,,caa45e5b-5897-ad69-c264-799205140f57,313782.0,Acetaminophen 325 MG Oral Tablet,,,1,,,
4,1999-02-25 09:37:14,NaT,7fc520c4-cb59-fa5d-c639-8accb3a4acb7,,2b85dcbb-ce54-60e2-3864-64fe3e121236,665078.0,Loratadine 5 MG Chewable Tablet,,,1,,,



Database connection closed.
