In [22]:
# Hospital stay demographics

from db import run_query

with open("sql/hosp_demographics.sql", "r") as f:
	hosp = run_query(f.read())
hosp.head()

Unnamed: 0,subject_id,hadm_id,hosp_stay_num,pat_count,age,gender,insurance,hosp_los,dod,days_to_death,hospital_mortality
0,10000032,22595853,1,1,52.0,F,Medicaid,0.786111,2180-09-09,125.0,0
1,10000032,22841357,2,0,52.0,F,Medicaid,1.015278,2180-09-09,74.0,0
2,10000032,29079034,3,0,52.0,F,Medicaid,2.222222,2180-09-09,46.0,0
3,10000032,25742920,4,0,52.0,F,Medicaid,1.754167,2180-09-09,33.0,0
4,10000068,25022803,1,1,19.0,F,,0.298611,,,0


In [23]:
# Add 1 year mortality to table

hosp['one_year_mortality'] = hosp['days_to_death'].notnull().astype(int)
hosp.head()

Unnamed: 0,subject_id,hadm_id,hosp_stay_num,pat_count,age,gender,insurance,hosp_los,dod,days_to_death,hospital_mortality,one_year_mortality
0,10000032,22595853,1,1,52.0,F,Medicaid,0.786111,2180-09-09,125.0,0,1
1,10000032,22841357,2,0,52.0,F,Medicaid,1.015278,2180-09-09,74.0,0,1
2,10000032,29079034,3,0,52.0,F,Medicaid,2.222222,2180-09-09,46.0,0,1
3,10000032,25742920,4,0,52.0,F,Medicaid,1.754167,2180-09-09,33.0,0,1
4,10000068,25022803,1,1,19.0,F,,0.298611,,,0,0


In [24]:
# Days to death at last hospital stay

last_dod = hosp.groupby('subject_id')[['hosp_stay_num']].max().reset_index()
last_dod = last_dod.merge(hosp[['subject_id', 'hosp_stay_num', 'days_to_death']], on=['subject_id', 'hosp_stay_num'], how='inner')
last_dod.rename(columns={'days_to_death': 'days_to_death_last_stay_id'}, inplace=True)

last_dod.head()

Unnamed: 0,subject_id,hosp_stay_num,days_to_death_last_stay_id
0,10000032,4,33.0
1,10000068,1,
2,10000084,2,47.0
3,10000108,1,
4,10000117,2,


In [25]:
# Combine into hospital mortality table

hosp = hosp.merge(last_dod, how='left', on=['subject_id', 'hosp_stay_num'])
del last_dod
hosp.sort_values(['subject_id', 'hosp_stay_num'], inplace=True)

hosp.head()

Unnamed: 0,subject_id,hadm_id,hosp_stay_num,pat_count,age,gender,insurance,hosp_los,dod,days_to_death,hospital_mortality,one_year_mortality,days_to_death_last_stay_id
0,10000032,22595853,1,1,52.0,F,Medicaid,0.786111,2180-09-09,125.0,0,1,
1,10000032,22841357,2,0,52.0,F,Medicaid,1.015278,2180-09-09,74.0,0,1,
2,10000032,29079034,3,0,52.0,F,Medicaid,2.222222,2180-09-09,46.0,0,1,
3,10000032,25742920,4,0,52.0,F,Medicaid,1.754167,2180-09-09,33.0,0,1,33.0
4,10000068,25022803,1,1,19.0,F,,0.298611,,,0,0,


In [26]:
int_cols = hosp.dtypes.values=="Int64"
hosp.loc[:, int_cols] = hosp.loc[:, int_cols].astype(float)
hosp.loc[:, int_cols] = hosp.loc[:, int_cols].astype(int, errors="ignore")

hosp.info()
hosp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546028 entries, 0 to 546027
Data columns (total 13 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   subject_id                  546028 non-null  int64  
 1   hadm_id                     546028 non-null  int64  
 2   hosp_stay_num               546028 non-null  int64  
 3   pat_count                   546028 non-null  int64  
 4   age                         546028 non-null  float64
 5   gender                      546028 non-null  object 
 6   insurance                   536673 non-null  object 
 7   hosp_los                    546028 non-null  float64
 8   dod                         144966 non-null  object 
 9   days_to_death               144966 non-null  float64
 10  hospital_mortality          546028 non-null  int64  
 11  one_year_mortality          546028 non-null  int64  
 12  days_to_death_last_stay_id  36882 non-null   float64
dtypes: float64(4),

Unnamed: 0,subject_id,hadm_id,hosp_stay_num,pat_count,age,gender,insurance,hosp_los,dod,days_to_death,hospital_mortality,one_year_mortality,days_to_death_last_stay_id
0,10000032,22595853,1,1,52.0,F,Medicaid,0.786111,2180-09-09,125.0,0,1,
1,10000032,22841357,2,0,52.0,F,Medicaid,1.015278,2180-09-09,74.0,0,1,
2,10000032,29079034,3,0,52.0,F,Medicaid,2.222222,2180-09-09,46.0,0,1,
3,10000032,25742920,4,0,52.0,F,Medicaid,1.754167,2180-09-09,33.0,0,1,33.0
4,10000068,25022803,1,1,19.0,F,,0.298611,,,0,0,


In [16]:
# ICU stay demographics

from db import temp_query
import os
import pandas as pd

with open("sql/icu_demographics-1.sql", "r") as f:
	temp_query(f.read())

print("Stage 1: Done!")

with open("sql/icu_demographics-2.sql", "r") as f:
	temp_query(f.read())

print("Stage 2: Done!")

with open("sql/icu_demographics-3.sql", "r") as f:
	temp_query(f.read())

print("Stage 3: Done!")

stages = "SELECT * FROM tmp_pat_adm_icu;"
icu = pd.read_sql(stages, os.getenv("engine"))

print("Final dataframe loaded:")
print(df.head())
print(f"Total rows: {len(df):,}")

Stage 1: Done!


ProgrammingError: (psycopg2.errors.UndefinedTable) relation "tmp_pat_adm" does not exist
LINE 10: FROM tmp_pat_adm;
              ^

[SQL: DROP TABLE IF EXISTS tmp_pat_adm_windows;

CREATE TEMP TABLE tmp_pat_adm_windows AS
SELECT
      *,
      DENSE_RANK() OVER (PARTITION BY subject_id ORDER BY admittime) AS hosp_stay_num,
      EXTRACT(EPOCH FROM (dischtime - admittime)) / 3600 / 24 AS hosp_los,
      DATE(dod) - DATE(dischtime) AS days_to_death,
      CASE WHEN DATE(dod) - DATE(dischtime) = 0 THEN 1 ELSE 0 END AS hospital_mortality
FROM tmp_pat_adm;]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [36]:
import pandas as pd
from sqlalchemy import create_engine, text

# -------------------------------------------------------
# 1. PostgreSQL connection
# -------------------------------------------------------
engine = create_engine(
    "postgresql://postgres:PSQLpwd4!@dxp4800.kudu-altair.ts.net:5432/mimiciv"
)

# Helper function to execute SQL blocks
def run_sql(sql):
    with engine.begin() as conn:
        conn.execute(text(sql))

# -------------------------------------------------------
# Stage 1 — patients + admissions (light join)
# -------------------------------------------------------
stage1 = """
DROP TABLE IF EXISTS tmp_pat_adm;

CREATE TEMP TABLE tmp_pat_adm AS
SELECT
      pat.subject_id,
      adm.hadm_id,
      pat.anchor_age,
      pat.anchor_year,
      pat.gender,
      adm.insurance,
      adm.admittime,
      adm.dischtime,
      pat.dod
FROM mimiciv_hosp.patients pat
JOIN mimiciv_hosp.admissions adm
    ON pat.subject_id = adm.subject_id;
"""

run_sql(stage1)
print("Stage 1 done.")


# -------------------------------------------------------
# Stage 2 — admission-level windows
# -------------------------------------------------------
stage2 = """
DROP TABLE IF EXISTS tmp_pat_adm_windows;

CREATE TEMP TABLE tmp_pat_adm_windows AS
SELECT
      *,
      DENSE_RANK() OVER (PARTITION BY subject_id ORDER BY admittime) AS hosp_stay_num,
      EXTRACT(EPOCH FROM (dischtime - admittime)) / 3600 / 24 AS hosp_los,
      DATE(dod) - DATE(dischtime) AS days_to_death,
      CASE WHEN DATE(dod) - DATE(dischtime) = 0 THEN 1 ELSE 0 END AS hospital_mortality
FROM tmp_pat_adm;
"""

run_sql(stage2)
print("Stage 2 done.")


# -------------------------------------------------------
# Stage 3 — join ICU + ICU windows
# -------------------------------------------------------
stage3 = """
DROP TABLE IF EXISTS tmp_pat_adm_icu;

CREATE TEMP TABLE tmp_pat_adm_icu AS
SELECT
      t.subject_id,
      t.hadm_id,
      icu.stay_id,
      ROW_NUMBER() OVER (PARTITION BY t.subject_id ORDER BY icu.intime) AS icu_stay_num,
      CASE
          WHEN FIRST_VALUE(icu.stay_id) OVER (PARTITION BY t.subject_id ORDER BY icu.intime)
               = icu.stay_id THEN 1 ELSE 0
      END AS pat_count,
      t.hosp_stay_num,
      t.anchor_age + (EXTRACT(YEAR FROM icu.intime) - t.anchor_year) AS age,
      t.gender,
      t.insurance,
      icu.first_careunit,
      icu.los AS icu_los,
      t.hosp_los,
      t.dod,
      t.days_to_death,
      t.hospital_mortality,
      CASE WHEN DATE(t.dod) - DATE(icu.outtime) = 0 THEN 1 ELSE 0 END AS icu_mortality
FROM tmp_pat_adm_windows t
JOIN mimiciv_icu.icustays icu
    ON t.hadm_id = icu.hadm_id;
"""

run_sql(stage3)
print("Stage 3 done.")


# -------------------------------------------------------
# Stage 4 — Load final data to pandas
# -------------------------------------------------------
query_final = "SELECT * FROM tmp_pat_adm_icu;"
icu = pd.read_sql(query_final, engine)

print("Final dataframe loaded:")
print(icu.head())
print(f"Total rows: {len(icu):,}")


Stage 1 done.
Stage 2 done.
Stage 3 done.
Final dataframe loaded:
   subject_id   hadm_id  ...  hospital_mortality  icu_mortality
0    10000032  29079034  ...                   0              0
1    10000690  25860671  ...                   0              0
2    10000980  26913865  ...                   0              0
3    10001217  24597018  ...                   0              0
4    10001217  27703517  ...                   0              0

[5 rows x 16 columns]
Total rows: 94,458


In [37]:
# Add 1 year mortality
icu['one_year_mortality'] = icu['days_to_death'].notnull().astype(int)

# Days to death at last ICU stay
last_dod = icu.groupby('subject_id')[['icu_stay_num']].max().reset_index()
last_dod = last_dod.merge(icu[['subject_id', 'icu_stay_num', 'days_to_death']], on=['subject_id', 'icu_stay_num'], how='inner')
last_dod.rename(columns={'days_to_death': 'days_to_death_last_stay_id'}, inplace=True)

icu = icu.merge(last_dod, how='left', on=['subject_id', 'icu_stay_num'])
del last_dod
icu.sort_values(['subject_id', 'icu_stay_num'], inplace=True)

# add a grouping variable for table one so we can have hospital mortality as a group and a row
icu['hosp_mort'] = icu ['hospital_mortality']

# fix some data type issues
int_cols = icu.dtypes.values=="Int64"
icu.loc[:, int_cols] = icu.loc[:, int_cols].astype(float)
icu.loc[:, int_cols] = icu.loc[:, int_cols].astype(int, errors="ignore")

print('Edited dataframe loaded:')
icu.info()
icu.head()

print(f"Total rows: {len(icu):,}")

Edited dataframe loaded:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94458 entries, 0 to 94457
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   subject_id                  94458 non-null  int64  
 1   hadm_id                     94458 non-null  int64  
 2   stay_id                     94458 non-null  int64  
 3   icu_stay_num                94458 non-null  int64  
 4   pat_count                   94458 non-null  int64  
 5   hosp_stay_num               94458 non-null  int64  
 6   age                         94458 non-null  float64
 7   gender                      94458 non-null  object 
 8   insurance                   92935 non-null  object 
 9   first_careunit              94458 non-null  object 
 10  icu_los                     94444 non-null  float64
 11  hosp_los                    94458 non-null  float64
 12  dod                         37967 non-null  object 
 13  days_t

In [None]:
from tableone import TableOne

data = icu

columns = [
    "pat_count",
    # "hadm_count",
    "age", "gender", "insurance",
    # "first_careunit",
    # "icu_los",
    "hosp_los",
    # 'icu_mortality',
    'hospital_mortality',
    "one_year_mortality",
    "days_to_death_last_stay_id"
]

categorical = [
    "pat_count",
    # "hadm_count",
    "gender", "insurance",
    # "first_careunit",
    # mortality flags
    # 'icu_mortality',
    'hospital_mortality',
    'one_year_mortality',
]

order = {
    "pat_count": [1, 0],
    # "hadm_count": [1, 0],
    "gender": ["F", "M"],
    # "icu_mortality": [1, 0],
    "hospital_mortality": [1, 0],
    "one_year_mortality": [1, 0],
}

limit = {
    "pat_count": 1, #"hadm_count": 1,
    "gender": 1,
    #"icu_mortality": 1,
    "hospital_mortality": 1,
    "one_year_mortality": 1,
}

rename = {
    "pat_count": "Distinct patients", "hadm_count": "Distinct hospitalizations",
    "age": "Age", "gender": "Administrative Gender", "insurance": "Insurance",
    "first_careunit": "First ICU stay, unit type",
    "icu_los": "ICU length of stay", "hosp_los": "Hospital length of stay",
    "icu_mortality": "In-ICU mortality",
    "hospital_mortality": "In-hospital mortality",
    "one_year_mortality": "One year mortality",
    "days_to_death_last_stay_id": "Time to death (days)",
}

print('Hospital demographics')
hosp_table = TableOne(hosp, columns=columns, categorical=categorical, order=order, limit=limit, rename=rename)
display(hosp_table)

#print('ICU demographics')
#icu_table = TableOne(data, columns=columns, categorical=categorical, order=order, limit=limit, rename=rename)
#display(icu_table)

Hospital demographics




Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,546028
"Distinct patients, n (%)",0,,322576 (59.1)
"Age, mean (SD)",,0.0,59.2 (19.1)
"Administrative Gender, n (%)",F,,284097 (52.0)
"Insurance, n (%)",Medicaid,,104229 (19.1)
"Insurance, n (%)",Medicare,,244576 (44.8)
"Insurance, n (%)",No charge,,463 (0.1)
"Insurance, n (%)",,,9355 (1.7)
"Insurance, n (%)",Other,,14006 (2.6)
"Insurance, n (%)",Private,,173399 (31.8)


ICU demographics


In [43]:
icu.info()
icu.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94458 entries, 0 to 94457
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   subject_id                  94458 non-null  int64  
 1   hadm_id                     94458 non-null  int64  
 2   stay_id                     94458 non-null  int64  
 3   icu_stay_num                94458 non-null  int64  
 4   pat_count                   94458 non-null  int64  
 5   hosp_stay_num               94458 non-null  int64  
 6   age                         94458 non-null  float64
 7   gender                      94458 non-null  object 
 8   insurance                   92935 non-null  object 
 9   first_careunit              94458 non-null  object 
 10  icu_los                     94444 non-null  float64
 11  hosp_los                    94458 non-null  float64
 12  dod                         37967 non-null  object 
 13  days_to_death               379

Unnamed: 0,subject_id,hadm_id,stay_id,icu_stay_num,pat_count,hosp_stay_num,age,gender,insurance,first_careunit,icu_los,hosp_los,dod,days_to_death,hospital_mortality,icu_mortality,one_year_mortality,days_to_death_last_stay_id,hosp_mort
0,10000032,29079034,39553978,1,1,3,52.0,F,Medicaid,Medical Intensive Care Unit (MICU),0.410266,2.222222,2180-09-09,46.0,0,0,1,46.0,0
1,10000690,25860671,37081114,1,1,3,86.0,F,Medicare,Medical Intensive Care Unit (MICU),3.893252,9.821528,2152-01-30,444.0,0,0,1,444.0,0
2,10000980,26913865,39765666,1,1,2,76.0,F,Medicare,Medical Intensive Care Unit (MICU),0.497535,5.806944,2193-08-26,1515.0,0,0,1,1515.0,0
3,10001217,24597018,37067082,1,1,1,55.0,F,Private,Surgical Intensive Care Unit (SICU),1.118032,6.794444,,,0,0,0,,0
4,10001217,27703517,34592300,2,0,2,55.0,F,Private,Surgical Intensive Care Unit (SICU),0.948113,5.914583,,,0,0,0,,0
