# Cell 1: Imports and Setup

In [1]:
# %%
import duckdb
import pandas as pd
from pathlib import Path

# --- Configuration ---
# Use the same paths as the previous notebook for consistency
DATA_DIR = Path("../data")
CSV_DIR = DATA_DIR / "csv"
OUTPUT_DIR = Path("../output")
DB_FILE = OUTPUT_DIR / "synthea_fhir.duckdb"

# List of the base table names we want to process and compare
TABLE_NAMES = ["patients", "encounters", "conditions", "procedures", "medications"]

# Cell 2: Load CSVs into New DuckDB Tables

In [2]:
# %%
print(f"Connecting to DuckDB database: {DB_FILE}")
con = duckdb.connect(str(DB_FILE), read_only=False)

print("\n--- Loading data from CSV files into new tables ---")

for table_name in TABLE_NAMES:
    csv_file = CSV_DIR / f"{table_name}.csv"
    new_table_name = f"{table_name}_csv"
    
    if csv_file.exists():
        print(f"Loading {csv_file.name} into table '{new_table_name}'...")
        
        # Drop the table if it already exists from a previous run
        con.execute(f"DROP TABLE IF EXISTS {new_table_name};")
        
        # Use DuckDB's powerful CSV reader to create the table and load data
        # This is highly efficient and automatically infers the schema.
        con.execute(f"""
            CREATE TABLE {new_table_name} AS 
            SELECT * FROM read_csv_auto('{str(csv_file)}', header=true);
        """)
        print(f"✅ Successfully created and loaded '{new_table_name}'.")
    else:
        print(f"⚠️ Warning: CSV file not found at {csv_file}")

print("\n--- CSV loading complete. ---")

Connecting to DuckDB database: ../output/synthea_fhir.duckdb

--- Loading data from CSV files into new tables ---
Loading patients.csv into table 'patients_csv'...
✅ Successfully created and loaded 'patients_csv'.
Loading encounters.csv into table 'encounters_csv'...
✅ Successfully created and loaded 'encounters_csv'.
Loading conditions.csv into table 'conditions_csv'...
✅ Successfully created and loaded 'conditions_csv'.
Loading procedures.csv into table 'procedures_csv'...
✅ Successfully created and loaded 'procedures_csv'.
Loading medications.csv into table 'medications_csv'...
✅ Successfully created and loaded 'medications_csv'.

--- CSV loading complete. ---


# Cell 3: Compare FHIR and CSV Tables

In [3]:
# %%
print("\n--- Comparing FHIR-parsed tables with CSV-loaded tables ---")

for table in TABLE_NAMES:
    fhir_table = table
    csv_table = f"{table}_csv"
    
    print(f"\n{'='*50}")
    print(f"📊 Comparing '{fhir_table}' (FHIR) vs. '{csv_table}' (CSV)")
    print(f"{'='*50}")

    try:
        # 1. Compare Row Counts
        fhir_count = con.execute(f"SELECT COUNT(*) FROM {fhir_table}").fetchone()[0]
        csv_count = con.execute(f"SELECT COUNT(*) FROM {csv_table}").fetchone()[0]
        
        print(f"\n1. Row Count Comparison:")
        print(f"  - {fhir_table}: {fhir_count:,} rows")
        print(f"  - {csv_table}: {csv_count:,} rows")
        if fhir_count == csv_count:
            print("  - ✅ Match!")
        else:
            print(f"  - ⚠️ Mismatch! Difference of {abs(fhir_count - csv_count):,} rows.")

        # 2. Compare Table Schemas
        print("\n2. Schema Comparison (Column Names & Types):")
        fhir_schema = con.execute(f"DESCRIBE {fhir_table};").fetchdf()
        csv_schema = con.execute(f"DESCRIBE {csv_table};").fetchdf()

        # Simple check for same number of columns
        if len(fhir_schema) != len(csv_schema):
            print("  - ⚠️ Mismatch in number of columns!")
        
        # For a detailed view, we can show both schemas
        print(f"--- Schema for '{fhir_table}' (FHIR) ---")
        display(fhir_schema)
        print(f"--- Schema for '{csv_table}' (CSV) ---")
        display(csv_schema)
        
    except duckdb.CatalogException as e:
        print(f"\n❌ Error: Could not perform comparison for table '{table}'. It might not exist. Details: {e}")
    except Exception as e:
        print(f"\n❌ An unexpected error occurred during comparison for table '{table}': {e}")


# --- Close the connection when done ---
print(f"\n{'='*50}")
con.close()
print("✅ Comparison complete. Database connection closed.")


--- Comparing FHIR-parsed tables with CSV-loaded tables ---

📊 Comparing 'patients' (FHIR) vs. 'patients_csv' (CSV)

1. Row Count Comparison:
  - patients: 111,277 rows
  - patients_csv: 111,278 rows
  - ⚠️ Mismatch! Difference of 1 rows.

2. Schema Comparison (Column Names & Types):
--- Schema for 'patients' (FHIR) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,Id,VARCHAR,NO,PRI,,
1,BirthDate,DATE,YES,,,
2,DeathDate,DATE,YES,,,
3,SSN,VARCHAR,YES,,,
4,Drivers,VARCHAR,YES,,,
5,Passport,VARCHAR,YES,,,
6,Prefix,VARCHAR,YES,,,
7,First,VARCHAR,YES,,,
8,Middle,VARCHAR,YES,,,
9,Last,VARCHAR,YES,,,


--- Schema for 'patients_csv' (CSV) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,Id,VARCHAR,YES,,,
1,BIRTHDATE,DATE,YES,,,
2,DEATHDATE,DATE,YES,,,
3,SSN,VARCHAR,YES,,,
4,DRIVERS,VARCHAR,YES,,,
5,PASSPORT,VARCHAR,YES,,,
6,PREFIX,VARCHAR,YES,,,
7,FIRST,VARCHAR,YES,,,
8,MIDDLE,VARCHAR,YES,,,
9,LAST,VARCHAR,YES,,,



📊 Comparing 'encounters' (FHIR) vs. 'encounters_csv' (CSV)

1. Row Count Comparison:
  - encounters: 5,926,035 rows
  - encounters_csv: 5,926,090 rows
  - ⚠️ Mismatch! Difference of 55 rows.

2. Schema Comparison (Column Names & Types):
--- Schema for 'encounters' (FHIR) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,Id,VARCHAR,NO,PRI,,
1,Start,TIMESTAMP,YES,,,
2,Stop,TIMESTAMP,YES,,,
3,Patient,VARCHAR,YES,,,
4,Organization,VARCHAR,YES,,,
5,Provider,VARCHAR,YES,,,
6,Payer,VARCHAR,YES,,,
7,EncounterClass,VARCHAR,YES,,,
8,Code,VARCHAR,YES,,,
9,Description,VARCHAR,YES,,,


--- Schema for 'encounters_csv' (CSV) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,Id,VARCHAR,YES,,,
1,START,TIMESTAMP WITH TIME ZONE,YES,,,
2,STOP,TIMESTAMP WITH TIME ZONE,YES,,,
3,PATIENT,VARCHAR,YES,,,
4,ORGANIZATION,VARCHAR,YES,,,
5,PROVIDER,VARCHAR,YES,,,
6,PAYER,VARCHAR,YES,,,
7,ENCOUNTERCLASS,VARCHAR,YES,,,
8,CODE,BIGINT,YES,,,
9,DESCRIPTION,VARCHAR,YES,,,



📊 Comparing 'conditions' (FHIR) vs. 'conditions_csv' (CSV)

1. Row Count Comparison:
  - conditions: 3,625,404 rows
  - conditions_csv: 3,625,440 rows
  - ⚠️ Mismatch! Difference of 36 rows.

2. Schema Comparison (Column Names & Types):
--- Schema for 'conditions' (FHIR) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,Start,DATE,YES,,,
1,Stop,DATE,YES,,,
2,Patient,VARCHAR,YES,,,
3,Encounter,VARCHAR,YES,,,
4,System,VARCHAR,YES,,,
5,Code,VARCHAR,YES,,,
6,Description,VARCHAR,YES,,,


--- Schema for 'conditions_csv' (CSV) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,START,DATE,YES,,,
1,STOP,DATE,YES,,,
2,PATIENT,VARCHAR,YES,,,
3,ENCOUNTER,VARCHAR,YES,,,
4,SYSTEM,VARCHAR,YES,,,
5,CODE,BIGINT,YES,,,
6,DESCRIPTION,VARCHAR,YES,,,



📊 Comparing 'procedures' (FHIR) vs. 'procedures_csv' (CSV)

1. Row Count Comparison:
  - procedures: 15,965,796 rows
  - procedures_csv: 15,965,984 rows
  - ⚠️ Mismatch! Difference of 188 rows.

2. Schema Comparison (Column Names & Types):
--- Schema for 'procedures' (FHIR) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,Start,TIMESTAMP,YES,,,
1,Stop,TIMESTAMP,YES,,,
2,Patient,VARCHAR,YES,,,
3,Encounter,VARCHAR,YES,,,
4,System,VARCHAR,YES,,,
5,Code,VARCHAR,YES,,,
6,Description,VARCHAR,YES,,,
7,Base_Cost,FLOAT,YES,,,
8,ReasonCode,VARCHAR,YES,,,
9,ReasonDescription,VARCHAR,YES,,,


--- Schema for 'procedures_csv' (CSV) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,START,TIMESTAMP WITH TIME ZONE,YES,,,
1,STOP,TIMESTAMP WITH TIME ZONE,YES,,,
2,PATIENT,VARCHAR,YES,,,
3,ENCOUNTER,VARCHAR,YES,,,
4,SYSTEM,VARCHAR,YES,,,
5,CODE,BIGINT,YES,,,
6,DESCRIPTION,VARCHAR,YES,,,
7,BASE_COST,DOUBLE,YES,,,
8,REASONCODE,BIGINT,YES,,,
9,REASONDESCRIPTION,VARCHAR,YES,,,



📊 Comparing 'medications' (FHIR) vs. 'medications_csv' (CSV)

1. Row Count Comparison:
  - medications: 4,451,010 rows
  - medications_csv: 4,451,029 rows
  - ⚠️ Mismatch! Difference of 19 rows.

2. Schema Comparison (Column Names & Types):
--- Schema for 'medications' (FHIR) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,Start,TIMESTAMP,YES,,,
1,Stop,TIMESTAMP,YES,,,
2,Patient,VARCHAR,YES,,,
3,Payer,VARCHAR,YES,,,
4,Encounter,VARCHAR,YES,,,
5,Code,VARCHAR,YES,,,
6,Description,VARCHAR,YES,,,
7,Base_Cost,FLOAT,YES,,,
8,Payer_Coverage,FLOAT,YES,,,
9,Dispenses,INTEGER,YES,,,


--- Schema for 'medications_csv' (CSV) ---


Unnamed: 0,column_name,column_type,null,key,default,extra
0,START,TIMESTAMP WITH TIME ZONE,YES,,,
1,STOP,TIMESTAMP WITH TIME ZONE,YES,,,
2,PATIENT,VARCHAR,YES,,,
3,PAYER,VARCHAR,YES,,,
4,ENCOUNTER,VARCHAR,YES,,,
5,CODE,BIGINT,YES,,,
6,DESCRIPTION,VARCHAR,YES,,,
7,BASE_COST,DOUBLE,YES,,,
8,PAYER_COVERAGE,DOUBLE,YES,,,
9,DISPENSES,BIGINT,YES,,,



✅ Comparison complete. Database connection closed.
