In [11]:
from db import duck_query_df
from collections import OrderedDict
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tableone import TableOne

all_tabs = duck_query_df("SHOW ALL TABLES;")
all_tabs.head()

#con.sql("CALL pg_clear_cache();")
#con.close()


Unnamed: 0,database,schema,name,column_names,column_types,temporary
0,db,mimiciv_derived,acei,"[subject_id, hadm_id, acei, starttime, stoptime]","[INTEGER, INTEGER, VARCHAR, TIMESTAMP, TIMESTAMP]",False
1,db,mimiciv_derived,age,"[subject_id, hadm_id, admittime, anchor_age, a...","[INTEGER, INTEGER, TIMESTAMP, SMALLINT, SMALLI...",False
2,db,mimiciv_derived,antibiotic,"[subject_id, hadm_id, stay_id, antibiotic, rou...","[INTEGER, INTEGER, INTEGER, VARCHAR, VARCHAR, ...",False
3,db,mimiciv_derived,bg,"[subject_id, hadm_id, charttime, specimen, so2...","[INTEGER, INTEGER, TIMESTAMP, VARCHAR, DOUBLE,...",False
4,db,mimiciv_derived,blood_differential,"[subject_id, hadm_id, charttime, specimen_id, ...","[INTEGER, INTEGER, TIMESTAMP, INTEGER, DOUBLE,...",False


In [12]:
hosp = duck_query_df("""
SELECT
    pat.subject_id,
    adm.hadm_id,
    DENSE_RANK() OVER (PARTITION BY pat.subject_id ORDER BY adm.admittime) AS hosp_stay_num,
    CASE
        WHEN FIRST_VALUE(adm.hadm_id) OVER (PARTITION BY pat.subject_id ORDER BY adm.admittime) = adm.hadm_id THEN 1
        ELSE 0
    END AS pat_count,
    pat.anchor_age + (EXTRACT(YEAR FROM adm.admittime) - pat.anchor_year) AS age,
    pat.gender,
    EXTRACT(EPOCH FROM (adm.dischtime - adm.admittime)) / 3600 / 24 AS hosp_los,
    pat.dod,
    (pat.dod::date - adm.dischtime::date) AS days_to_death,
    -- mortality flags
    CASE WHEN (pat.dod::date - adm.dischtime::date) = 0 THEN 1 ELSE 0 END AS hospital_mortality
FROM db.mimiciv_hosp.patients pat
INNER JOIN db.mimiciv_hosp.admissions adm
    ON pat.subject_id = adm.subject_id;
""")
hosp.head()

Unnamed: 0,subject_id,hadm_id,hosp_stay_num,pat_count,age,gender,hosp_los,dod,days_to_death,hospital_mortality
0,14843956,23720196,1,1,42,M,1.225694,NaT,,0
1,14844428,27843938,1,1,29,F,0.218056,NaT,,0
2,14844428,28513088,2,0,29,F,4.616667,NaT,,0
3,14845749,24983273,1,1,22,F,0.700694,NaT,,0
4,14845749,20255080,2,0,22,F,23.997222,NaT,,0


In [13]:
# Add column for 1 year mortality
hosp['one_year_mortality'] = hosp['days_to_death'].notnull().astype(int)

# Temp dataframe with the days to death for only the last ICU stay
last_dod = hosp.groupby('subject_id')[['hosp_stay_num']].max().reset_index()
last_dod = last_dod.merge(hosp[['subject_id', 'hosp_stay_num', 'days_to_death']],
                          on=['subject_id', 'hosp_stay_num'], how='inner')
last_dod.rename(columns={'days_to_death': 'days_to_death_last_stay_id'}, inplace=True)

hosp = hosp.merge(last_dod, how='left', on=['subject_id', 'hosp_stay_num'])
del last_dod
hosp.sort_values(['subject_id', 'hosp_stay_num'], inplace=True)

# fix some data type issues
int_cols = hosp.dtypes.values=="Int64"
hosp.loc[:, int_cols] = hosp.loc[:, int_cols].astype(float)
hosp.loc[:, int_cols] = hosp.loc[:, int_cols].astype(int, errors="ignore")

hosp.head()

Unnamed: 0,subject_id,hadm_id,hosp_stay_num,pat_count,age,gender,hosp_los,dod,days_to_death,hospital_mortality,one_year_mortality,days_to_death_last_stay_id
92794,10000032,22595853,1,1,52,F,0.786111,2180-09-09,125.0,0,1,
92795,10000032,22841357,2,0,52,F,1.015278,2180-09-09,74.0,0,1,
92796,10000032,29079034,3,0,52,F,2.222222,2180-09-09,46.0,0,1,
92797,10000032,25742920,4,0,52,F,1.754167,2180-09-09,33.0,0,1,33.0
151147,10000068,25022803,1,1,19,F,0.298611,NaT,,0,0,


In [14]:
icu = duck_query_df("""
SELECT
    pat.subject_id,
    adm.hadm_id,
    icu.stay_id,
    ROW_NUMBER() OVER (PARTITION BY pat.subject_id ORDER BY icu.intime) AS icu_stay_num,
    DENSE_RANK() OVER (PARTITION BY pat.subject_id ORDER BY adm.admittime) AS hosp_stay_num,
    CASE
        WHEN FIRST_VALUE(icu.stay_id) OVER (PARTITION BY pat.subject_id ORDER BY icu.intime) = icu.stay_id THEN 1
        ELSE 0
    END AS pat_count,
    pat.anchor_age + (EXTRACT(YEAR FROM icu.intime) - pat.anchor_year) AS age,
    pat.gender,
    icu.first_careunit,
    icu.los AS icu_los,
    EXTRACT(EPOCH FROM (adm.dischtime - adm.admittime)) / 3600 / 24 AS hosp_los,
    pat.dod,
    (pat.dod::date - adm.dischtime::date) AS days_to_death,
    -- mortality flags
    CASE WHEN (pat.dod::date - adm.dischtime::date) = 0 THEN 1 ELSE 0 END AS hospital_mortality,
    CASE WHEN (pat.dod::date - icu.outtime::date) = 0 THEN 1 ELSE 0 END AS icu_mortality
FROM db.mimiciv_hosp.patients pat
INNER JOIN db.mimiciv_hosp.admissions adm
    ON pat.subject_id = adm.subject_id
INNER JOIN db.mimiciv_icu.icustays icu
    ON adm.hadm_id = icu.hadm_id;
""")
icu.head()

Unnamed: 0,subject_id,hadm_id,stay_id,icu_stay_num,hosp_stay_num,pat_count,age,gender,first_careunit,icu_los,hosp_los,dod,days_to_death,hospital_mortality,icu_mortality
0,17232262,22129008,39561273,1,1,1,69,F,Trauma SICU (TSICU),1.354907,3.454861,NaT,,0,0
1,17232262,25674800,38491966,2,2,0,70,F,Medical Intensive Care Unit (MICU),1.999537,8.206944,NaT,,0,0
2,17233369,22741144,37719606,1,1,1,76,F,Surgical Intensive Care Unit (SICU),0.993368,0.997917,NaT,,0,0
3,17234133,22749804,38411653,1,1,1,77,M,Neuro Intermediate,1.552442,1.583333,NaT,,0,0
4,17235477,24772660,35049091,1,1,1,21,F,Neuro Intermediate,1.872002,2.449306,NaT,,0,0


In [15]:
# add 1 year mortality
icu['one_year_mortality'] = icu['days_to_death'].notnull().astype(int)

# create a dataframe with the days to death for only the last ICU stay
last_dod = icu.groupby('subject_id')[['icu_stay_num']].max().reset_index()
last_dod = last_dod.merge(icu[['subject_id', 'icu_stay_num', 'days_to_death']], on=['subject_id', 'icu_stay_num'], how='inner')
last_dod.rename(columns={'days_to_death': 'days_to_death_last_stay_id'}, inplace=True)

icu = icu.merge(last_dod, how='left', on=['subject_id', 'icu_stay_num'])
del last_dod
icu.sort_values(['subject_id', 'icu_stay_num'], inplace=True)

# add a grouping variable for table one so we can have hospital mortality as a group and a row
icu['hosp_mort'] = icu['hospital_mortality']

# fix some data type issues
int_cols = icu.dtypes.values=="Int64"
icu.loc[:, int_cols] = icu.loc[:, int_cols].astype(float)
icu.loc[:, int_cols] = icu.loc[:, int_cols].astype(int, errors="ignore")

icu.head()

Unnamed: 0,subject_id,hadm_id,stay_id,icu_stay_num,hosp_stay_num,pat_count,age,gender,first_careunit,icu_los,hosp_los,dod,days_to_death,hospital_mortality,icu_mortality,one_year_mortality,days_to_death_last_stay_id,hosp_mort
1744,10000032,29079034,39553978,1,1,1,52,F,Medical Intensive Care Unit (MICU),0.410266,2.222222,2180-09-09,46.0,0,0,1,46.0,0
26508,10000690,25860671,37081114,1,1,1,86,F,Medical Intensive Care Unit (MICU),3.893252,9.821528,2152-01-30,444.0,0,0,1,444.0,0
26509,10000980,26913865,39765666,1,1,1,76,F,Medical Intensive Care Unit (MICU),0.497535,5.806944,2193-08-26,1515.0,0,0,1,1515.0,0
74545,10001217,24597018,37067082,1,1,1,55,F,Surgical Intensive Care Unit (SICU),1.118032,6.794444,NaT,,0,0,0,,0
74546,10001217,27703517,34592300,2,2,0,55,F,Surgical Intensive Care Unit (SICU),0.948113,5.914583,NaT,,0,0,0,,0


In [16]:
columns = [
    "pat_count",
    # , "hadm_count",
    "age", "gender",
    # "first_careunit",
    # "icu_los",
    "hosp_los",
    # 'icu_mortality',
    'hospital_mortality',
    "one_year_mortality",
    # "days_to_death_last_stay_id"
]

categorical = [
    "pat_count",
    # "hadm_count",
    "gender",
    # "first_careunit",
    # mortality flags
    # 'icu_mortality',
    'hospital_mortality',
    'one_year_mortality',
]

order = {
    "pat_count": ['1', '0'],
    # "hadm_count": [1, 0],
    "gender": ["F", "M"],
    # "icu_mortality": [1, 0],
    "hospital_mortality": ['1', '0'],
    "one_year_mortality": ['1', '0'],
}

limit = {
    "pat_count": 1, # "hadm_count": 1,
    # "gender": 1,
    # "icu_mortality": 1,
    "hospital_mortality": 1,
    "one_year_mortality": 1,
}

rename = {
    "pat_count": "Distinct patients", "hadm_count": "Distinct hospitalizations",
    "age": "Age", "gender": "Administrative Gender", "insurance": "Insurance",
    "first_careunit": "First ICU stay, unit type",
    "icu_los": "ICU length of stay", "hosp_los": "Hospital length of stay",
    "icu_mortality": "In-ICU mortality",
    "hospital_mortality": "In-hospital mortality",
    "one_year_mortality": "One year mortality",
    # "days_to_death_last_stay_id": "Time to death (days)",
}

print('ICU demographics')
icu_table = TableOne(icu, columns=columns, categorical=categorical, order=order, limit=limit, rename=rename)
display(icu_table)
print('Hospital demographics')
hosp_table = TableOne(hosp, columns=columns, categorical=categorical, order=order, limit=limit, rename=rename)
display(hosp_table)

compare_table = hosp_table.tableone.drop(columns='Missing').copy()
compare_table.rename(columns={'Overall': 'Hospital stays'}, inplace=True)

compare_table = compare_table.merge(
    icu_table.tableone.drop(columns='Missing'),
    left_index=True, right_index=True, how='inner'
)
compare_table.rename(columns={'Overall': 'ICU stays'}, inplace=True)
display(compare_table)

ICU demographics


Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,94458
"Distinct patients, n (%)",1,,65366 (69.2)
"Age, mean (SD)",,0.0,64.8 (16.7)
"Administrative Gender, n (%)",F,,41583 (44.0)
"Administrative Gender, n (%)",M,,52875 (56.0)
"Hospital length of stay, mean (SD)",,0.0,11.8 (15.5)
"In-hospital mortality, n (%)",1,,11616 (12.3)
"One year mortality, n (%)",1,,37967 (40.2)


Hospital demographics


Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,546028
"Distinct patients, n (%)",1,,223452 (40.9)
"Age, mean (SD)",,0.0,59.2 (19.1)
"Administrative Gender, n (%)",F,,284097 (52.0)
"Administrative Gender, n (%)",M,,261931 (48.0)
"Hospital length of stay, mean (SD)",,0.0,4.8 (7.2)
"In-hospital mortality, n (%)",1,,12149 (2.2)
"One year mortality, n (%)",1,,144966 (26.5)


Unnamed: 0,Unnamed: 1,Hospital stays,ICU stays
n,,546028,94458
"Distinct patients, n (%)",1,223452 (40.9),65366 (69.2)
"Age, mean (SD)",,59.2 (19.1),64.8 (16.7)
"Administrative Gender, n (%)",F,284097 (52.0),41583 (44.0)
"Administrative Gender, n (%)",M,261931 (48.0),52875 (56.0)
"Hospital length of stay, mean (SD)",,4.8 (7.2),11.8 (15.5)
"In-hospital mortality, n (%)",1,12149 (2.2),11616 (12.3)
"One year mortality, n (%)",1,144966 (26.5),37967 (40.2)


In [8]:
print("Charlson Comorbidity Index Table:")
charlson = duck_query_df("SELECT * FROM db.mimiciv_derived.charlson;")
charlson.head()

print("First Day Labs Table:")
first_labs = duck_query_df("SELECT * FROM db.mimiciv_derived.first_day_lab;")
first_labs.head()

Charlson Comorbidity Index Table:
First Day Labs Table:


Unnamed: 0,subject_id,stay_id,hematocrit_min,hematocrit_max,hemoglobin_min,hemoglobin_max,platelets_min,platelets_max,wbc_min,wbc_max,...,bilirubin_indirect_min,bilirubin_indirect_max,ck_cpk_min,ck_cpk_max,ck_mb_min,ck_mb_max,ggt_min,ggt_max,ld_ldh_min,ld_ldh_max
0,12466550,30000153,29.1,39.1,9.8,13.0,162.0,177.0,15.2,17.9,...,,,605.0,605.0,20.0,20.0,,,,
1,13180007,30000213,23.9,27.6,7.4,8.5,219.0,243.0,5.7,7.7,...,,,,,,,,,300.0,300.0
2,18421337,30000484,24.6,31.0,8.1,9.9,357.0,403.0,24.2,30.1,...,,,40.0,74.0,,,,,419.0,419.0
3,12207593,30000646,37.8,39.3,12.9,13.2,266.0,337.0,7.9,10.6,...,,,64.0,64.0,1.0,1.0,,,,
4,15726459,30000831,38.0,40.8,12.4,13.3,285.0,311.0,14.2,21.3,...,,,,,2.0,3.0,,,,


In [29]:
cohort = duck_query_df("""
CREATE TEMPORARY TABLE cohort AS
    WITH FirstICUStay AS (
        -- Select the minimum ICUSTAY_ID for each SUBJECT_ID to enforce 'First ICU Stay'
        SELECT
            ic.subject_id,
            ic.hadm_id,
            ic.stay_id,
            ic.intime,
            ic.outtime,
            ic.los,
            ROW_NUMBER() OVER (PARTITION BY ic.subject_id ORDER BY ic.intime) AS rn
        FROM
            db.mimiciv_icu.icustays ic
    ),
    AdultPatients AS (
        -- Calculate age and filter for adult patients
        SELECT
            fs.*,
            pa.gender,
            (
                (CAST(STRFTIME(fs.intime, '%Y') AS INTEGER) - pa.anchor_year) 
                + pa.anchor_age
            ) AS age_at_admission
        FROM
            FirstICUStay fs
        INNER JOIN 
            db.mimiciv_hosp.patients pa ON fs.subject_id = pa.subject_id
        WHERE
            fs.rn = 1 -- Only the first ICU stay
    )
    SELECT
        ap.subject_id,
        ap.hadm_id,
        ap.stay_id,
        ap.intime,
        ap.outtime,
        ap.los,
        ap.gender,
        ap.age_at_admission AS age,
        -- Target Label: In-Hospital Mortality (IHM)
        adm.hospital_expire_flag AS y_ihm
    FROM
        AdultPatients ap
    INNER JOIN 
        db.mimiciv_hosp.admissions adm ON ap.hadm_id = adm.hadm_id
    WHERE
        ap.age_at_admission >= 18 -- Inclusion: Adult patients
    AND ap.los * 24 >= 12 -- Exclusion: Minimum length of stay
    ORDER BY
        ap.subject_id, ap.intime;
SELECT * FROM cohort;
""")

# Execute and store the core cohort data
print(f"Phase 1 Complete. Cohort size: {len(cohort):,} patients.")
cohort.head()

static_feat = duck_query_df("""
SELECT
    coh.stay_id,
    coh.subject_id,
    coh.age,
    coh.gender,
    coh.y_ihm,
    -- Comorbidities
    cci.charlson_comorbidity_index,
    -- Severity Scores (First 24 Hours - Static)
    o.oasis,
    saps.sapsii,
    -- First Day Vitals (min/max/mean)
    g.gcs_min, -- Example GCS feature
    fvl.heart_rate_min, -- Min Heart Rate in first 24h
    fvl.sbp_max,        -- Max Systolic BP in first 24h
    fvl.resprate_mean   -- Mean Respiratory Rate in first 24h
FROM
    cohort coh
LEFT JOIN
    db.mimiciv_derived.charlson cci ON coh.hadm_id = cci.hadm_id
LEFT JOIN
    db.mimiciv_derived.oasis o ON coh.stay_id = o.stay_id
LEFT JOIN
    db.mimiciv_derived.sapsii saps ON coh.stay_id = saps.stay_id
LEFT JOIN
    db.mimiciv_derived.first_day_gcs g ON coh.stay_id = g.stay_id
LEFT JOIN
    db.mimiciv_derived.first_day_vitalsign fvl ON coh.stay_id = fvl.stay_id
LEFT JOIN
    db.mimiciv_derived.first_day_lab fdl ON coh.stay_id = fdl.stay_id;
""")

print(f"Extracted {len(static_feat):,} static, derived feature vectors.")
static_feat.head()

Phase 1 Complete. Cohort size: 62,907 patients.


CatalogException: Catalog Error: Table with name cohort does not exist!
Did you mean "db.mimiciv_derived.crrt"?

In [28]:
static_feat = duck_query_df("""
SELECT
    coh.stay_id,
    coh.subject_id,
    coh.age,
    coh.gender,
    coh.y_ihm,
    -- Comorbidities
    cci.charlson_comorbidity_index,
    -- Severity Scores (First 24 Hours - Static)
    o.oasis,
    saps.sapsii,
    -- First Day Vitals (min/max/mean)
    g.gcs_min, -- Example GCS feature
    fvl.heart_rate_min, -- Min Heart Rate in first 24h
    fvl.sbp_max,        -- Max Systolic BP in first 24h
    fvl.resprate_mean   -- Mean Respiratory Rate in first 24h
FROM
    cohort coh
LEFT JOIN
    db.mimiciv_derived.charlson cci ON coh.hadm_id = cci.hadm_id
LEFT JOIN
    db.mimiciv_derived.oasis o ON coh.stay_id = o.stay_id
LEFT JOIN
    db.mimiciv_derived.sapsii saps ON coh.stay_id = saps.stay_id
LEFT JOIN
    db.mimiciv_derived.first_day_gcs g ON coh.stay_id = g.stay_id
LEFT JOIN
    db.mimiciv_derived.first_day_vitalsign fvl ON coh.stay_id = fvl.stay_id
LEFT JOIN
    db.mimiciv_derived.first_day_lab fdl ON coh.stay_id = fdl.stay_id;
""")

print(f"Extracted {len(static_feat):,} static, derived feature vectors.")
static_feat.head()

CatalogException: Catalog Error: Table with name cohort does not exist!
Did you mean "db.mimiciv_derived.crrt"?