# Compare mimiciv-derived and mimic-fhir


In [1]:
import os

from pathling import PathlingContext

from sof import ViewCtx
from sof.ptl import PtlSqlCtx
from sof.duckdb import DuckDBSqlCtx

# Project layout, resolved relative to the directory the notebook is run from.
BASE_DIR = os.path.abspath('..')
VIEW_SRC_DIR = os.path.join(BASE_DIR, 'src')
SPARK_CONF_DIR = os.path.join(BASE_DIR, 'env/spark-conf')
# Location of the MIMIC-FHIR parquet dataset read by the Pathling context.
# Set the MIMIC_FHIR_PATH environment variable to point at a different copy;
# the original developer-local path is kept as the fallback so existing
# setups keep working unchanged.
MIMIC_FHIR_PATH = os.environ.get(
    'MIMIC_FHIR_PATH',
    "/Users/szu004/datasets/work/mimic-iv/mimic4-ptl",
)
print(f"BASE_DIR: {BASE_DIR}")

BASE_DIR: /Users/szu004/dev/sof-mimic


In [2]:
def create_ptl_sql_ctx():
    """Create the Pathling-backed SQL context over the MIMIC-FHIR dataset.

    Exports ``SPARK_CONF_DIR`` into the process environment before the
    Pathling context is created, then wraps the context's Spark session and
    the parquet data source at ``MIMIC_FHIR_PATH`` in a ``PtlSqlCtx``.

    Returns:
        PtlSqlCtx: SQL context bound to the Spark session and the dataset.
    """
    # Must happen before PathlingContext.create() so the setting is already
    # in the environment when the Spark session is started.
    os.environ['SPARK_CONF_DIR'] = SPARK_CONF_DIR
    pathling = PathlingContext.create()
    spark_session = pathling.spark
    fhir_source = pathling.read.parquet(MIMIC_FHIR_PATH)
    return PtlSqlCtx(spark=spark_session, ds=fhir_source)

# Pathling (Spark) side of the comparison: a view context that knows about the
# SQL-on-FHIR view definitions (sof/*.json), the mimic-fhir SQL views built on
# top of them, and the shared study SQL views.
# NOTE(review): views appear to be created lazily on get_view() (the
# "Creating ... view" log lines show up in later cells) -- confirm in sof.
ptl_sql_ctx = create_ptl_sql_ctx()
ptl_view_ctx = (ViewCtx.Builder(sql_ctx=ptl_sql_ctx)
            .load_sof(os.path.join(VIEW_SRC_DIR, 'sof/*.json'))
            .load_sql(os.path.join(VIEW_SRC_DIR, 'mimic-fhir/*.sql'))
            .load_sql(os.path.join(VIEW_SRC_DIR, 'study/*.sql'))
            .build())

:: loading settings :: url = jar:file:/Users/szu004/miniconda3/envs/sof_mimic/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/szu004/.ivy2/cache
The jars for the packages stored in: /Users/szu004/.ivy2/jars
au.csiro.pathling#library-runtime added as a dependency
io.delta#delta-spark_2.12 added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-d31d667f-f4b4-4057-a057-842a971ed122;1.0
	confs: [default]
	found au.csiro.pathling#library-runtime;7.1.0-SNAPSHOT in local-m2-cache
	found io.delta#delta-spark_2.12;3.2.0 in local-m2-cache
	found io.delta#delta-storage;3.2.0 in local-m2-cache
	found org.antlr#antlr4-runtime;4.9.3 in local-m2-cache
	found org.apache.hadoop#hadoop-aws;3.3.4 in local-m2-cache
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in local-m2-cache
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in local-m2-cache
:: resolution report :: resolve 125ms :: artifacts dl 6ms
	:: modules in use:
	au.csiro.pathling#library-runtime;7.1.0-SNAPSHOT from local-m2-cache in [default]
	co

2024-09-11 19:55:58 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
2024-09-11 19:56:00 WARN  SimpleFunctionRegistry:72 - The function date_diff replaced a previously registered function.


                                                                                

In [3]:
import duckdb
def create_duck_sql_ctx():
    """Create the DuckDB-backed SQL context over the Postgres MIMIC database.

    Attaches the Postgres database (read-only) to DuckDB's default connection
    under the alias ``db`` and makes it the current database, then returns a
    fresh ``DuckDBSqlCtx``.

    Returns:
        DuckDBSqlCtx: SQL context for the DuckDB side of the comparison.
    """
    setup_statements = (
        "ATTACH 'dbname=mimic4 user=szu004' AS db (TYPE POSTGRES, READ_ONLY)",
        "USE db",
    )
    # Executed in order: the database must be attached before it is selected.
    for statement in setup_statements:
        duckdb.sql(statement)
    return DuckDBSqlCtx()

# DuckDB side of the comparison: a view context that knows about the
# mimic-2.2 SQL views and the same study SQL views that are loaded into the
# Pathling view context, so study-level queries can run on both backends.
duck_sql_ctx = create_duck_sql_ctx()
duck_view_ctx = (ViewCtx.Builder(sql_ctx=duck_sql_ctx)
            .load_sql(os.path.join(VIEW_SRC_DIR, 'mimic-2.2/*.sql'))
            .load_sql(os.path.join(VIEW_SRC_DIR, 'study/*.sql'))
            .build())

In [4]:
def compare(*views):
    """Build a runner that executes one SQL query on both backends.

    Args:
        *views: Names of the views the query needs. Each is requested via
            ``get_view`` from the DuckDB and the Pathling view contexts
            before the query is executed.

    Returns:
        A function taking a SQL query string. It prints the DuckDB result
        followed by the Pathling result (both as pandas DataFrames) and
        returns ``None``.
    """
    def run_on_both(query):
        # Request every needed view from both backends first.
        for view_name in views:
            duck_view_ctx.get_view(view_name)
            ptl_view_ctx.get_view(view_name)

        # Each fetch is deferred so that a backend's label is printed before
        # its query runs (keeps progress/log output under the right heading).
        backends = (
            ("DuckDB:", lambda: duck_sql_ctx.select(query).to_df()),
            ("Ptl:", lambda: ptl_sql_ctx.select(query).toPandas()),
        )
        for label, fetch in backends:
            print(label)
            print(fetch())

    return run_on_both

In [5]:
# Baseline check: total row count of md_icustay_detail on both backends.
compare('md_icustay_detail')(
"""
SELECT COUNT(*) AS count_stays FROM md_icustay_detail
"""
)

Creating SQL view md_icustay_detail, depends on: None
Creating SOF view rv_icu_encounter
2024-09-11 19:56:04 WARN  SparkStringUtils:72 - Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
Creating SQL view dv_icu_encounter, depends on: ['rv_icu_encounter']
Creating SOF view rv_patient
Creating SQL view dv_patient, depends on: ['rv_patient']
Creating SQL view md_icustay_detail, depends on: ['dv_icu_encounter', 'dv_patient']
2024-09-11 19:56:04 WARN  HiveConf:4122 - HiveConf of name hive.stats.jdbc.timeout does not exist
2024-09-11 19:56:04 WARN  HiveConf:4122 - HiveConf of name hive.stats.retries.wait does not exist
2024-09-11 19:56:05 WARN  ObjectStore:7812 - Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
2024-09-11 19:56:05 WARN  ObjectStore:7900 - setMetaStoreSchemaVersion called but recording version is 

                                                                                

2024-09-11 19:56:09 WARN  SessionState:907 - METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
2024-09-11 19:56:09 WARN  HiveConf:4122 - HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
2024-09-11 19:56:09 WARN  HiveConf:4122 - HiveConf of name hive.stats.jdbc.timeout does not exist
2024-09-11 19:56:09 WARN  HiveConf:4122 - HiveConf of name hive.stats.retries.wait does not exist
DuckDB:
   count_stays
0        73181
Ptl:
   count_stays
0        73181


In [6]:
# Checks that interval arithmetic agrees across backends: rows where
# dischtime - admittime is under 12 hours.
compare('md_icustay_detail')(
"""
SELECT COUNT(*) AS count_stays FROM md_icustay_detail WHERE (dischtime - admittime) < INTERVAL  '12 hours'
"""
)

DuckDB:
   count_stays
0         3058
Ptl:
   count_stays
0         3058


In [7]:
# Eyeball the first 10 rows (by admittime, dischtime) with their durations.
# In the recorded output the Ptl stay_id is an "Encounter/<uuid>" string rather
# than the numeric id, and the visible Ptl admittime values are 15 hours later
# than the DuckDB ones.
# NOTE(review): looks like a timezone difference between backends -- confirm.
compare('md_icustay_detail')(
    """
    SELECT stay_id, admittime, dischtime, dischtime-admittime AS duration FROM md_icustay_detail ORDER BY admittime, dischtime LIMIT 10
    """
)

DuckDB:
    stay_id           admittime           dischtime        duration
0  30588857 2110-01-11 10:16:06 2110-01-12 17:17:47 1 days 07:01:41
1  38319097 2110-01-12 00:54:00 2110-01-14 22:07:39 2 days 21:13:39
2  36817845 2110-01-13 09:09:40 2110-01-15 21:45:33 2 days 12:35:53
3  39753527 2110-01-14 07:22:00 2110-01-15 13:08:36 1 days 05:46:36
4  32140208 2110-01-17 00:28:00 2110-01-18 01:21:22 1 days 00:53:22
5  38339174 2110-01-17 22:49:13 2110-01-18 17:18:51 0 days 18:29:38
6  39953418 2110-01-18 14:46:27 2110-01-25 12:42:11 6 days 21:55:44
7  34901199 2110-01-18 17:47:47 2110-01-20 22:25:09 2 days 04:37:22
8  37546608 2110-01-20 14:01:00 2110-01-21 17:21:26 1 days 03:20:26
9  37310703 2110-01-20 21:06:30 2110-01-22 16:08:24 1 days 19:01:54
Ptl:
                                          stay_id           admittime  \
0  Encounter/a70be676-f4ba-57f9-ab78-237ae71f66a1 2110-01-12 01:16:06   
1  Encounter/0bad8340-3ac7-57f2-8cd6-c4347d1a077e 2110-01-12 15:54:00   
2  Encounter/2d1167d

In [8]:
# Row counts per race (non-null race only). COUNT(*) counts rows of
# md_icustay_detail even though the column is labelled subject_count.
# In the recorded output the Ptl counts are higher than DuckDB for every race.
compare('md_icustay_detail')(
    """
    SELECT race,COUNT(*) AS subject_count FROM md_icustay_detail WHERE race IS NOT NULL
    GROUP BY race
    ORDER BY subject_count DESC
    """
)

DuckDB:
       race  subject_count
0     WHITE          49569
1     BLACK           7960
2  HISPANIC           2741
3     ASIAN           2155
Ptl:
       race  subject_count
0     WHITE          50512
1     BLACK           7999
2  HISPANIC           2842
3     ASIAN           2188


In [9]:
# Same race breakdown, restricted to rows flagged first_icu_stay and
# first_hosp_stay. The recorded output still shows higher counts on Ptl.
compare('md_icustay_detail')(
    """
    SELECT race,COUNT(*) AS subject_count FROM md_icustay_detail WHERE race IS NOT NULL AND first_icu_stay AND first_hosp_stay
    GROUP BY race
    ORDER BY subject_count DESC
    """
)

DuckDB:
       race  subject_count
0     WHITE          34204
1     BLACK           4640
2  HISPANIC           1735
3     ASIAN           1496
Ptl:
       race  subject_count
0     WHITE          34856
1     BLACK           4677
2  HISPANIC           1800
3     ASIAN           1525


In [10]:
# Total rows flagged first_icu_stay and first_hosp_stay, without the race
# filter. The recorded totals match on both backends.
# NOTE(review): together with the per-race mismatch above, this suggests the
# difference lies in the race values rather than in the stay flags -- confirm.
compare('md_icustay_detail')(
    """
    SELECT COUNT(*) AS count_stays FROM md_icustay_detail WHERE first_icu_stay AND first_hosp_stay
    """
)

DuckDB:
   count_stays
0        50920
Ptl:
   count_stays
0        50920


In [11]:
# Row counts per race_category in the study-level st_subject view.
# The recorded output shows higher counts on Ptl than on DuckDB.
compare('st_subject')(
"""
SELECT race_category,COUNT(*) AS subject_count FROM st_subject
GROUP BY race_category
ORDER BY subject_count DESC
"""    
)

Creating SQL view md_oxygen_delivery, depends on: None
Creating SQL view st_subject, depends on: ['md_icustay_detail', 'md_oxygen_delivery']
Creating SOF view rv_obs_o2_flow
Creating SQL view dv_obs_o2_flow, depends on: ['rv_obs_o2_flow']
Creating SOF view rv_o2_delivery_device
Creating SQL view dv_o2_delivery_device, depends on: ['rv_o2_delivery_device']
Creating SQL view md_oxygen_delivery, depends on: ['dv_obs_o2_flow', 'dv_o2_delivery_device']


[Stage 45:>             (23 + 11) / 670][Stage 46:>               (0 + 0) / 670]

2024-09-11 19:56:13 WARN  GarbageCollectionMetrics:72 - To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


                                                                                

Creating SQL view st_subject, depends on: ['md_icustay_detail', 'md_oxygen_delivery']
DuckDB:
  race_category  subject_count
0         WHITE          31713
1         BLACK           4284
2      HISPANIC           1600
3         ASIAN           1388
Ptl:




  race_category  subject_count
0         WHITE          32330
1         BLACK           4317
2      HISPANIC           1660
3         ASIAN           1415


                                                                                

In [12]:
# Row counts per race_category in the final cohort view coh_subject.
# The recorded output shows higher counts on Ptl than on DuckDB.
compare('coh_subject')("""
SELECT race_category,COUNT(*) AS subject_count FROM coh_subject
GROUP BY race_category
ORDER BY subject_count DESC
""")

Creating SQL view stx_reading_o2_flow, depends on: ['md_oxygen_delivery', 'st_subject']
Creating SQL view md_vitalsigns, depends on: None
Creating SQL view stx_reading_spo2, depends on: ['st_subject', 'md_vitalsigns']
Creating SQL view md_bg, depends on: None
Creating SQL view stx_reading_so2, depends on: ['st_subject', 'md_bg']
Creating SQL view coh_subject, depends on: ['st_subject', 'stx_reading_o2_flow', 'stx_reading_spo2', 'stx_reading_so2']
Creating SQL view stx_reading_o2_flow, depends on: ['md_oxygen_delivery', 'st_subject']
Creating SOF view rv_obs_vitalsigns
Creating SQL view dv_obs_vitalsigns, depends on: ['rv_obs_vitalsigns']
Creating SQL view md_vitalsigns, depends on: ['dv_obs_vitalsigns']


                                                                                

Creating SQL view stx_reading_spo2, depends on: ['st_subject', 'md_vitalsigns']
Creating SOF view rv_obs_bg
Creating SQL view dv_obs_bg, depends on: ['rv_obs_bg']
Creating SQL view md_bg, depends on: ['dv_obs_bg']


                                                                                

Creating SQL view stx_reading_so2, depends on: ['st_subject', 'md_bg']
Creating SQL view coh_subject, depends on: ['st_subject', 'stx_reading_o2_flow', 'stx_reading_spo2', 'stx_reading_so2']
DuckDB:


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

  race_category  subject_count
0         WHITE           5842
1         BLACK            448
2      HISPANIC            245
3         ASIAN            189
Ptl:


                                                                                

  race_category  subject_count
0         WHITE           6017
1         BLACK            457
2      HISPANIC            250
3         ASIAN            196


In [13]:
# Row count of stx_reading_o2_flow, one of the views coh_subject depends on.
# The recorded output shows more rows on Ptl than on DuckDB.
compare('stx_reading_o2_flow')("""
SELECT COUNT(*) AS reading_count FROM stx_reading_o2_flow
""")

DuckDB:
   reading_count
0         173079
Ptl:




   reading_count
0         176356


                                                                                

In [14]:
# Row count of stx_reading_so2, one of the views coh_subject depends on.
# The recorded output shows more rows on Ptl than on DuckDB.
compare('stx_reading_so2')("""
SELECT COUNT(*) AS reading_count FROM stx_reading_so2
""")

DuckDB:
   reading_count
0          32317
Ptl:




   reading_count
0          33655


                                                                                

In [15]:
# Row count of stx_reading_spo2, one of the views coh_subject depends on.
# The recorded output shows more rows on Ptl than on DuckDB.
compare('stx_reading_spo2')("""
SELECT COUNT(*) AS reading_count FROM stx_reading_spo2
""")

DuckDB:
   reading_count
0        2255878
Ptl:


                                                                                

   reading_count
0        2302977


                                                                                