In [1]:
from pathlib import Path

import duckdb
# Attach the PostgreSQL database read-only under the alias `db`, then switch to
# it so the queries below can refer to tables as `mimic_derived.<table>`.
for statement in (
    "ATTACH 'dbname=mimic1 user=szu004' AS db (TYPE POSTGRES, READ_ONLY)",
    "USE db",
):
    duckdb.sql(statement)

In [2]:
# All ICU stays (the paper reports 76,540)
duckdb.sql("SELECT COUNT(*) as icu_stay_count FROM mimic_derived.icustay_detail")

┌────────────────┐
│ icu_stay_count │
│     int64      │
├────────────────┤
│          76540 │
└────────────────┘

In [3]:
# Number of distinct patients (subject_id) that have at least one ICU stay.
# The column is labelled as a patient count because DISTINCT subject_id counts
# patients, not stays.
duckdb.sql("SELECT COUNT(DISTINCT subject_id) AS patient_count FROM mimic_derived.icustay_detail")

┌────────────────┐
│ icu_stay_count │
│     int64      │
├────────────────┤
│          53150 │
└────────────────┘

In [4]:
# Ventilation interventions: for each stay, keep only the earliest record (by
# starttime) whose ventilation_status is neither 'None' nor 'SupplementalOxygen'.
# Result columns: stay_id, inttime (start of that record), int_type (its status).
# Only stays that have such a record appear in the result.
stays_with_interventions = duckdb.sql("""
WITH iws AS (
    SELECT stay_id, starttime as  inttime, ventilation_status AS int_type, row_number() OVER (PARTITION BY stay_id ORDER BY starttime) AS int_sequence
    FROM mimic_derived.ventilation
    WHERE ventilation_status NOT in ('None', 'SupplementalOxygen')
)
SELECT stay_id, inttime, int_type FROM iws WHERE int_sequence = 1
""")
# Show how many stays have at least one qualifying intervention.
stays_with_interventions.count("*")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        27096 │
└──────────────┘

In [5]:
# Patients with index period.
# Takes the icustay_detail rows where both the first_icu_stay and first_hosp_stay
# flags are true and LEFT JOINs the first intervention per stay
# (`stays_with_interventions`, defined in the previous cell), so stays without an
# intervention are kept with NULL inttime / int_type.
#
# The index period is [ip_starttime, ip_endtime] where
#   ip_starttime = admittime
#   ip_endtime   = the earliest of dischtime, inttime and admittime + 5 days,
#                  clamped by GREATEST so it is never before admittime.
# In the outputs below, rows with a NULL inttime still get an ip_endtime, i.e.
# the NULL does not propagate through LEAST.
# NOTE(review): the window is anchored on admittime/dischtime rather than on
# ICU-specific timestamps -- confirm this matches the paper's definition.
patient_with_index_period=duckdb.sql("""
WITH patient_with_intervention AS (
    SELECT isd.*, v.inttime, v.int_type FROM mimic_derived.icustay_detail AS isd
    LEFT OUTER JOIN stays_with_interventions AS v ON isd.stay_id = v.stay_id
    WHERE first_icu_stay AND first_hosp_stay
)
SELECT subject_id, stay_id, gender, ethnicity, 
    admittime AS ip_starttime,
    GREATEST(admittime, LEAST( dischtime, inttime, admittime + interval '5 days')) AS ip_endtime,
    inttime, int_type
    FROM patient_with_intervention
""")
# Materialise the relation as a pandas DataFrame for display.
patient_with_index_period.df()                     

Unnamed: 0,subject_id,stay_id,gender,ethnicity,ip_starttime,ip_endtime,inttime,int_type
0,10001884,37510196,F,BLACK/AFRICAN AMERICAN,2131-01-07 20:39:00,2131-01-11 04:00:00,2131-01-11 04:00:00,InvasiveVent
1,10002430,38392119,M,WHITE,2129-06-13 00:00:00,2129-06-13 07:00:00,2129-06-13 07:00:00,HFNC
2,10002495,36753294,M,UNKNOWN,2141-05-22 20:17:00,2141-05-23 20:22:00,2141-05-23 20:22:00,NonInvasiveVent
3,10002760,31831386,M,UNABLE TO OBTAIN,2141-04-20 07:15:00,2141-04-20 14:36:00,2141-04-20 14:36:00,InvasiveVent
4,10003400,32128372,F,BLACK/AFRICAN AMERICAN,2137-02-24 10:00:00,2137-02-25 23:37:00,2137-02-25 23:37:00,InvasiveVent
...,...,...,...,...,...,...,...,...
53145,19960743,31609242,F,WHITE,2141-01-28 12:45:00,2141-01-29 12:09:00,2141-01-29 12:09:00,InvasiveVent
53146,19968075,31756531,M,UNKNOWN,2153-04-22 09:27:00,2153-04-22 15:00:00,2153-04-22 15:00:00,InvasiveVent
53147,19970265,38356273,M,WHITE,2115-03-28 17:37:00,2115-03-28 18:32:00,2115-03-28 18:32:00,InvasiveVent
53148,19995780,36805359,M,WHITE,2125-10-20 11:30:00,2125-10-20 13:00:00,2125-10-20 13:00:00,InvasiveVent


In [6]:
## VALIDATE: Patients with index period
# Each check runs a query that selects the offending rows; fetchone() returns
# None when there are none, so every assert passes only if its invariant holds.
# Unique patients: no subject_id appears more than once
assert duckdb.sql("SELECT subject_id, COUNT(*) FROM patient_with_index_period GROUP BY subject_id HAVING COUNT(*) > 1").fetchone() is None
# Unique stays: no stay_id appears more than once
assert duckdb.sql("SELECT stay_id, COUNT(*) FROM patient_with_index_period GROUP BY stay_id HAVING COUNT(*) > 1").fetchone() is None
# Valid index period: ip_endtime >= ip_starttime for every row
assert duckdb.sql("SELECT * FROM patient_with_index_period WHERE ip_endtime < ip_starttime").fetchone() is None
# Maximum length of the index period is 5 days
assert duckdb.sql("SELECT * FROM patient_with_index_period WHERE ip_endtime - ip_starttime > interval '5 days'").fetchone() is None

In [7]:
# Valid nasal cannula O2 flow readings.
# Keeps oxygen_delivery rows where 'Nasal cannula' is the first device and no
# second device is recorded. The flow is taken as the smaller of o2_flow and
# o2_flow_additional, and only readings with a flow of at most 6 are kept
# (presumably L/min -- TODO confirm units). Rows whose flow is NULL fail the
# `<= 6` comparison and are therefore dropped as well.
nasal_canula_o2_flow = duckdb.sql("""
WITH nc_o2 AS (
    SELECT *, LEAST(o2_flow, o2_flow_additional) AS o2_flow_nc FROM mimic_derived.oxygen_delivery WHERE o2_delivery_device_1 = 'Nasal cannula' AND o2_delivery_device_2 IS NULL
) 
SELECT subject_id, stay_id, charttime, o2_flow_nc AS o2_flow FROM nc_o2 WHERE o2_flow_nc <= 6
""")
nasal_canula_o2_flow

┌────────────┬──────────┬─────────────────────┬─────────┐
│ subject_id │ stay_id  │      charttime      │ o2_flow │
│   int32    │  int32   │      timestamp      │ double  │
├────────────┼──────────┼─────────────────────┼─────────┤
│   10000032 │ 39553978 │ 2180-07-23 14:20:00 │     2.0 │
│   10000032 │ 39553978 │ 2180-07-23 18:00:00 │     2.0 │
│   10000032 │ 39553978 │ 2180-07-23 20:00:00 │     2.0 │
│   10000980 │ 39765666 │ 2189-06-27 10:00:00 │     3.0 │
│   10001217 │ 37067082 │ 2157-11-20 19:47:00 │     2.0 │
│   10001217 │ 37067082 │ 2157-11-20 22:00:00 │     2.0 │
│   10001217 │ 37067082 │ 2157-11-21 00:00:00 │     2.0 │
│   10001217 │ 37067082 │ 2157-11-21 02:00:00 │     2.0 │
│   10001217 │ 37067082 │ 2157-11-21 08:00:00 │     2.0 │
│   10001217 │ 37067082 │ 2157-11-21 18:00:00 │     3.0 │
│       ·    │     ·    │          ·          │      ·  │
│       ·    │     ·    │          ·          │      ·  │
│       ·    │     ·    │          ·          │      ·  │
│   10254774 │

In [8]:
# Patients that have at least one valid nasal cannula flow reading for their
# stay with a charttime inside the index period (BETWEEN is inclusive on both
# ends). All columns of patient_with_index_period are kept.
patient_with_o2_data = duckdb.sql("""
SELECT * FROM patient_with_index_period AS pwi 
WHERE EXISTS( SELECT 1 FROM nasal_canula_o2_flow AS nco WHERE pwi.stay_id = nco.stay_id AND nco.charttime BETWEEN pwi.ip_starttime AND pwi.ip_endtime)
""")
patient_with_o2_data

┌────────────┬──────────┬─────────┬───────────┬───┬─────────────────────┬─────────────────────┬─────────────────┐
│ subject_id │ stay_id  │ gender  │ ethnicity │ … │     ip_endtime      │       inttime       │    int_type     │
│   int32    │  int32   │ varchar │  varchar  │   │      timestamp      │      timestamp      │     varchar     │
├────────────┼──────────┼─────────┼───────────┼───┼─────────────────────┼─────────────────────┼─────────────────┤
│   10017531 │ 35526828 │ M       │ WHITE     │ … │ 2159-09-23 17:00:00 │ 2159-09-23 17:00:00 │ InvasiveVent    │
│   10157331 │ 35818405 │ M       │ WHITE     │ … │ 2175-05-31 20:00:00 │ 2175-05-31 20:00:00 │ InvasiveVent    │
│   10259667 │ 35629197 │ M       │ WHITE     │ … │ 2171-04-21 00:00:00 │ 2171-04-21 00:00:00 │ NonInvasiveVent │
│   10026868 │ 31030669 │ M       │ WHITE     │ … │ 2149-10-30 14:42:00 │ NULL                │ NULL            │
│   10029514 │ 39053313 │ M       │ UNKNOWN   │ … │ 2112-10-10 21:58:00 │ NULL          

In [9]:
# Size of the initial cohort (patients with nasal cannula data in the index
# period). The paper reports 25,340 for this step.
patient_with_o2_data.count("*")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│        23703 │
└──────────────┘

In [10]:
# Map the free-text ethnicity to one of four race categories by prefix
# (ASIAN, BLACK, WHITE, HISPANIC). Any other value yields a NULL category and
# the patient is excluded by the final `race_category IS NOT NULL` filter.
patient_with_race = duckdb.sql("""
WITH patients AS (
    SELECT *,
    CASE
        WHEN ethnicity LIKE 'ASIAN%' THEN 'ASIAN'
        WHEN ethnicity LIKE 'BLACK%' THEN 'BLACK'
        WHEN ethnicity LIKE 'WHITE%' THEN 'WHITE'
        WHEN ethnicity LIKE 'HISPANIC%' THEN 'HISPANIC'
        END AS race_category
    FROM patient_with_o2_data)
SELECT subject_id, stay_id, race_category, gender, ip_starttime, ip_endtime  FROM patients WHERE race_category IS NOT NULL
""")
patient_with_race

┌────────────┬──────────┬───────────────┬─────────┬─────────────────────┬─────────────────────┐
│ subject_id │ stay_id  │ race_category │ gender  │    ip_starttime     │     ip_endtime      │
│   int32    │  int32   │    varchar    │ varchar │      timestamp      │      timestamp      │
├────────────┼──────────┼───────────────┼─────────┼─────────────────────┼─────────────────────┤
│   10017531 │ 35526828 │ WHITE         │ M       │ 2159-09-22 19:30:00 │ 2159-09-23 17:00:00 │
│   10157331 │ 35818405 │ WHITE         │ M       │ 2175-05-30 22:31:00 │ 2175-05-31 20:00:00 │
│   10259667 │ 35629197 │ WHITE         │ M       │ 2171-04-18 12:35:00 │ 2171-04-21 00:00:00 │
│   10026868 │ 31030669 │ WHITE         │ M       │ 2149-10-25 14:42:00 │ 2149-10-30 14:42:00 │
│   10047172 │ 34369439 │ WHITE         │ M       │ 2163-06-15 01:28:00 │ 2163-06-20 01:28:00 │
│   10048001 │ 31975834 │ WHITE         │ M       │ 2175-02-05 00:14:00 │ 2175-02-10 00:14:00 │
│   10057218 │ 36369483 │ WHITE         

In [11]:
# Patients with a documented race/ethnicity, counted per race category
duckdb.sql("""
SELECT race_category, COUNT(*) FROM patient_with_race 
GROUP BY race_category
""")

┌───────────────┬──────────────┐
│ race_category │ count_star() │
│    varchar    │    int64     │
├───────────────┼──────────────┤
│ BLACK         │         1970 │
│ HISPANIC      │          751 │
│ WHITE         │        17024 │
│ ASIAN         │          609 │
└───────────────┴──────────────┘

In [12]:
# Nasal cannula O2 flow readings that fall inside each patient's index period
# (inclusive bounds), ordered by patient and time.
# NOTE: despite the `pwr` alias, the source is patient_with_index_period, so
# this covers every patient with an index period; the restriction to the final
# cohort is applied later when the coh_reading_* relations are built.
reading_o2_flow = duckdb.sql("""
SELECT pwr.subject_id, od.charttime as chart_time, od.o2_flow FROM patient_with_index_period AS pwr
JOIN nasal_canula_o2_flow AS od ON  pwr.stay_id = od.stay_id
WHERE od.charttime BETWEEN pwr.ip_starttime AND pwr.ip_endtime AND od.o2_flow IS NOT NULL
ORDER BY pwr.subject_id, od.charttime
""")
reading_o2_flow

┌────────────┬─────────────────────┬─────────┐
│ subject_id │     chart_time      │ o2_flow │
│   int32    │      timestamp      │ double  │
├────────────┼─────────────────────┼─────────┤
│   10000032 │ 2180-07-23 14:20:00 │     2.0 │
│   10000032 │ 2180-07-23 18:00:00 │     2.0 │
│   10000032 │ 2180-07-23 20:00:00 │     2.0 │
│   10000980 │ 2189-06-27 10:00:00 │     3.0 │
│   10001217 │ 2157-11-20 19:47:00 │     2.0 │
│   10001217 │ 2157-11-20 22:00:00 │     2.0 │
│   10001217 │ 2157-11-21 00:00:00 │     2.0 │
│   10001217 │ 2157-11-21 02:00:00 │     2.0 │
│   10001217 │ 2157-11-21 08:00:00 │     2.0 │
│   10001217 │ 2157-11-21 18:00:00 │     3.0 │
│       ·    │          ·          │      ·  │
│       ·    │          ·          │      ·  │
│       ·    │          ·          │      ·  │
│   10634612 │ 2132-02-14 11:00:00 │     1.5 │
│   10634612 │ 2132-02-14 14:00:00 │     1.5 │
│   10635271 │ 2137-08-10 21:11:00 │     4.0 │
│   10635271 │ 2137-08-11 01:00:00 │     4.0 │
│   10635271 

### SpO2

In [13]:
# Non-NULL spo2 values from the vitalsign table, joined on stay_id and limited
# to chart times inside each patient's index period (inclusive bounds),
# ordered by patient and time.
# As with the O2 flow readings, the source is patient_with_index_period; the
# final-cohort restriction is applied later.
reading_spo2 = duckdb.sql("""
SELECT pwr.subject_id, vs.charttime as chart_time, vs.spo2 FROM patient_with_index_period AS pwr
JOIN mimic_derived.vitalsign AS vs ON  pwr.stay_id = vs.stay_id
WHERE vs.charttime BETWEEN pwr.ip_starttime AND pwr.ip_endtime AND vs.spo2 IS NOT NULL
ORDER BY pwr.subject_id, vs.charttime
""")
reading_spo2

┌────────────┬─────────────────────┬────────┐
│ subject_id │     chart_time      │  spo2  │
│   int32    │      timestamp      │ double │
├────────────┼─────────────────────┼────────┤
│   10000032 │ 2180-07-23 14:13:00 │   98.0 │
│   10000032 │ 2180-07-23 14:30:00 │   97.0 │
│   10000032 │ 2180-07-23 15:00:00 │   97.0 │
│   10000032 │ 2180-07-23 16:00:00 │   94.0 │
│   10000032 │ 2180-07-23 17:00:00 │   95.0 │
│   10000032 │ 2180-07-23 18:00:00 │   95.0 │
│   10000032 │ 2180-07-23 19:00:00 │   98.0 │
│   10000032 │ 2180-07-23 20:00:00 │   99.0 │
│   10000032 │ 2180-07-23 21:00:00 │   95.0 │
│   10000032 │ 2180-07-23 22:00:00 │   95.0 │
│       ·    │          ·          │     ·  │
│       ·    │          ·          │     ·  │
│       ·    │          ·          │     ·  │
│   10063848 │ 2177-07-29 13:00:00 │   97.0 │
│   10063848 │ 2177-07-29 14:00:00 │   96.0 │
│   10063848 │ 2177-07-29 15:00:00 │   96.0 │
│   10063848 │ 2177-07-29 16:00:00 │   98.0 │
│   10063848 │ 2177-07-29 17:00:00

In [14]:
# Non-NULL so2 values from the bg table for specimen 'ART.' (presumably
# arterial samples -- TODO confirm), limited to chart times inside each
# patient's index period (inclusive bounds), ordered by patient and time.
# Unlike the other reading queries, the join is on subject_id rather than
# stay_id; the time window is what ties a measurement to the index stay.
reading_so2 = duckdb.sql("""
SELECT pwr.subject_id, bg.charttime as chart_time, bg.so2 FROM patient_with_index_period AS pwr
JOIN mimic_derived.bg AS bg ON  pwr.subject_id = bg.subject_id
WHERE bg.charttime BETWEEN pwr.ip_starttime AND pwr.ip_endtime AND bg.so2 IS NOT NULL AND bg.specimen = 'ART.'
ORDER BY pwr.subject_id, bg.charttime
""")
reading_so2

┌────────────┬─────────────────────┬────────┐
│ subject_id │     chart_time      │  so2   │
│   int32    │      timestamp      │ double │
├────────────┼─────────────────────┼────────┤
│   10002155 │ 2129-08-05 07:48:00 │   94.0 │
│   10002443 │ 2183-10-18 02:35:00 │   90.0 │
│   10003046 │ 2154-01-02 08:57:00 │   98.0 │
│   10003046 │ 2154-01-02 10:33:00 │   98.0 │
│   10004235 │ 2196-02-24 16:25:00 │   96.0 │
│   10004401 │ 2144-01-27 02:39:00 │   95.0 │
│   10005817 │ 2132-12-15 16:34:00 │   97.0 │
│   10005817 │ 2132-12-15 18:47:00 │   97.0 │
│   10005817 │ 2132-12-15 20:20:00 │   95.0 │
│   10006053 │ 2111-11-14 05:00:00 │   98.0 │
│       ·    │          ·          │     ·  │
│       ·    │          ·          │     ·  │
│       ·    │          ·          │     ·  │
│   17970921 │ 2144-06-20 09:02:00 │   99.0 │
│   17973277 │ 2167-08-21 00:24:00 │   98.0 │
│   17974379 │ 2132-04-20 18:59:00 │   93.0 │
│   17975221 │ 2120-06-24 15:24:00 │   96.0 │
│   17975345 │ 2179-06-20 16:49:00

## Perform final extract

In [15]:
# Final cohort: patients with a race category that have at least one reading
# of each kind (O2 flow, spo2 and so2) within their index period.
coh_subject = duckdb.sql("""
SELECT subject_id, gender, race_category FROM patient_with_race AS pwr
WHERE 
    EXISTS( SELECT 1 FROM reading_o2_flow AS rof WHERE pwr.subject_id = rof.subject_id)
    AND EXISTS( SELECT 1 FROM reading_spo2 AS rs WHERE pwr.subject_id = rs.subject_id)
    AND EXISTS( SELECT 1 FROM reading_so2 AS rso WHERE pwr.subject_id = rso.subject_id)
""")
coh_subject

┌────────────┬─────────┬───────────────┐
│ subject_id │ gender  │ race_category │
│   int32    │ varchar │    varchar    │
├────────────┼─────────┼───────────────┤
│   11236474 │ F       │ BLACK         │
│   12538134 │ F       │ ASIAN         │
│   13678296 │ M       │ WHITE         │
│   18227591 │ F       │ WHITE         │
│   18322840 │ M       │ WHITE         │
│   18440411 │ F       │ WHITE         │
│   10318354 │ F       │ WHITE         │
│   10980779 │ F       │ WHITE         │
│   11726221 │ M       │ WHITE         │
│   12680785 │ M       │ WHITE         │
│       ·    │ ·       │   ·           │
│       ·    │ ·       │   ·           │
│       ·    │ ·       │   ·           │
│   18588825 │ F       │ WHITE         │
│   11765665 │ F       │ HISPANIC      │
│   11852913 │ M       │ WHITE         │
│   14992632 │ M       │ WHITE         │
│   15202347 │ M       │ WHITE         │
│   15588339 │ M       │ WHITE         │
│   15921116 │ M       │ WHITE         │
│   15968932 │ M

In [16]:
# Final cohort size per race category
duckdb.sql("""
SELECT race_category, COUNT(*) FROM coh_subject GROUP BY  race_category
""")

┌───────────────┬──────────────┐
│ race_category │ count_star() │
│    varchar    │    int64     │
├───────────────┼──────────────┤
│ BLACK         │          272 │
│ HISPANIC      │          126 │
│ ASIAN         │          111 │
│ WHITE         │         3109 │
└───────────────┴──────────────┘

In [17]:
# Restrict each readings relation to the subjects of the final cohort.
# The three queries differ only in the source relation, so they are generated
# from one template; the SQL text handed to DuckDB is the same as if each
# query had been written out in full.
cohort_filter_sql = """
SELECT * FROM {readings} AS rd WHERE EXISTS( SELECT 1 FROM coh_subject AS cs WHERE rd.subject_id = cs.subject_id)
"""
coh_reading_o2_flow = duckdb.sql(cohort_filter_sql.format(readings="reading_o2_flow"))
coh_reading_spo2 = duckdb.sql(cohort_filter_sql.format(readings="reading_spo2"))
coh_reading_so2 = duckdb.sql(cohort_filter_sql.format(readings="reading_so2"))

### Save the extract to CSV

In [18]:
# Write the final cohort and its three reading tables as CSV files.
# The target directory is created first (including parents) so the export does
# not fail on a fresh checkout where it does not exist yet; an already existing
# directory is left untouched.
OUTPUT_DIR = Path('../data/psql_mimic-1.0')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# as_posix() keeps forward slashes, so the file names passed to DuckDB are the
# same '../data/psql_mimic-1.0/<name>.csv' paths on every platform.
coh_subject.to_csv((OUTPUT_DIR / 'subject.csv').as_posix())
coh_reading_o2_flow.to_csv((OUTPUT_DIR / 'reading_o2_flow.csv').as_posix())
coh_reading_spo2.to_csv((OUTPUT_DIR / 'reading_spo2.csv').as_posix())
coh_reading_so2.to_csv((OUTPUT_DIR / 'reading_so2.csv').as_posix())

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))