In [1]:
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# !pip install tableone
from tableone import TableOne
# !pip install psycopg2-binary
import psycopg2

In [2]:
# create a database connection
sqluser = 'asem'
dbname = 'mimiciv'
hostname = 'localhost'
password='qwerasdf'
hosp_schema_name = 'mimiciv_hosp'

# Connect to local postgres version of mimic
con = psycopg2.connect(dbname=dbname, user=sqluser, host=hostname, password=password)

In [3]:
query = \
"""
SELECT
      pat.subject_id
    , adm.hadm_id
    , DENSE_RANK() OVER hadm_window AS hosp_stay_num
    , CASE
        WHEN FIRST_VALUE(adm.hadm_id) OVER hadm_window = adm.hadm_id THEN 1
        ELSE 0
      END AS pat_count
    , pat.anchor_age + (EXTRACT(YEAR FROM adm.admittime) - pat.anchor_year) AS age
    , pat.gender
    , adm.insurance
    , mimiciv_derived.DATETIME_DIFF(adm.dischtime, adm.admittime, 'HOUR') / 24 AS hosp_los
    , pat.dod
    , mimiciv_derived.DATETIME_DIFF(pat.dod, CAST(adm.dischtime AS DATE), 'DAY') AS days_to_death
    -- mortality flags
    , CASE WHEN mimiciv_derived.DATETIME_DIFF(pat.dod, CAST(adm.dischtime AS DATE), 'DAY') = 0 THEN 1 ELSE 0 END AS hospital_mortality
FROM mimiciv_hosp.patients pat
INNER JOIN mimiciv_hosp.admissions adm
    ON pat.subject_id = adm.subject_id
WINDOW hadm_window AS (PARTITION BY pat.subject_id ORDER BY adm.admittime)
"""

hosp = pd.read_sql_query(query,con)

In [4]:
# Admission IDs with Length of Stay at least 12 hours.
hadm_id_los_geq_12h = hosp[hosp.hosp_los > 0.5].hadm_id
hadm_id_los_geq_12h

0         22595853
1         22841357
2         29079034
3         25742920
5         23052089
            ...   
431226    29734428
431227    25744818
431228    26071774
431229    21033226
431230    23865745
Name: hadm_id, Length: 392689, dtype: int64

In [5]:
adm_df = pd.read_sql_query("""
SELECT subject_id, hadm_id, admittime, dischtime FROM mimiciv_hosp.admissions
""", con)
adm_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00
...,...,...,...,...
431226,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00
431227,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00
431228,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00
431229,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00


In [6]:

# add 1 year mortality
hosp['one_year_mortality'] = hosp['days_to_death'].notnull().astype(int)

# create a dataframe with the days to death for only the last ICU stay
last_dod = hosp.groupby('subject_id')[['hosp_stay_num']].max().reset_index()
last_dod = last_dod.merge(hosp[['subject_id', 'hosp_stay_num', 'days_to_death']], on=['subject_id', 'hosp_stay_num'], how='inner')
last_dod.rename(columns={'days_to_death': 'days_to_death_last_stay_id'}, inplace=True)

hosp = hosp.merge(last_dod, how='left', on=['subject_id', 'hosp_stay_num'])
del last_dod
hosp.sort_values(['subject_id', 'hosp_stay_num'], inplace=True)

# fix some data type issues
int_cols = hosp.dtypes.values=="Int64"
hosp.loc[:, int_cols] = hosp.loc[:, int_cols].astype(float)
hosp.loc[:, int_cols] = hosp.loc[:, int_cols].astype(int, errors="ignore")


In [7]:
query = \
"""
SELECT
      pat.subject_id
    , adm.hadm_id
    , icu.stay_id
    , ROW_NUMBER() OVER (PARTITION BY pat.subject_id ORDER BY icu.intime) AS icu_stay_num
    , DENSE_RANK() OVER (PARTITION BY pat.subject_id ORDER BY adm.admittime) AS hosp_stay_num
    , CASE
        WHEN FIRST_VALUE(icu.stay_id) OVER icustay_window = icu.stay_id THEN 1
        ELSE 0
      END AS pat_count
    , pat.anchor_age + (EXTRACT(YEAR FROM icu.intime) - pat.anchor_year) AS age
    , pat.gender
    , adm.insurance
    , icu.first_careunit
    , icu.los AS icu_los
    , mimiciv_derived.DATETIME_DIFF(adm.dischtime, adm.admittime, 'HOUR') / 24 AS hosp_los
    , pat.dod
    , mimiciv_derived.DATETIME_DIFF(pat.dod, CAST(adm.dischtime AS DATE), 'DAY') AS days_to_death
    -- mortality flags
    , CASE WHEN mimiciv_derived.DATETIME_DIFF(pat.dod, CAST(adm.dischtime AS DATE), 'DAY') = 0 THEN 1 ELSE 0 END AS hospital_mortality
    , CASE WHEN mimiciv_derived.DATETIME_DIFF(pat.dod, CAST(icu.outtime AS DATE), 'DAY') = 0 THEN 1 ELSE 0 END AS icu_mortality
FROM mimiciv_hosp.patients pat
INNER JOIN mimiciv_hosp.admissions adm
    ON pat.subject_id = adm.subject_id
INNER JOIN mimiciv_icu.icustays icu
    ON adm.hadm_id = icu.hadm_id
WINDOW hadm_window AS (PARTITION BY pat.subject_id ORDER BY adm.admittime)
     , icustay_window AS (PARTITION BY pat.subject_id ORDER BY icu.intime)
"""

data = pd.read_sql_query(query,con)

In [8]:
# add 1 year mortality
data['one_year_mortality'] = data['days_to_death'].notnull().astype(int)

# create a dataframe with the days to death for only the last ICU stay
last_dod = data.groupby('subject_id')[['icu_stay_num']].max().reset_index()
last_dod = last_dod.merge(data[['subject_id', 'icu_stay_num', 'days_to_death']], on=['subject_id', 'icu_stay_num'], how='inner')
last_dod.rename(columns={'days_to_death': 'days_to_death_last_stay_id'}, inplace=True)

data = data.merge(last_dod, how='left', on=['subject_id', 'icu_stay_num'])
del last_dod
data.sort_values(['subject_id', 'icu_stay_num'], inplace=True)

# add a grouping variable for table one so we can have hospital mortality as a group and a row
data['hosp_mort'] = data['hospital_mortality']

# fix some data type issues
int_cols = data.dtypes.values=="Int64"
data.loc[:, int_cols] = data.loc[:, int_cols].astype(float)
data.loc[:, int_cols] = data.loc[:, int_cols].astype(int, errors="ignore")

In [9]:


columns = [
    "pat_count",
    # , "hadm_count",
    "age", "gender", "insurance",
    # "first_careunit",
    # "icu_los",
    "hosp_los",
    # 'icu_mortality',
    'hospital_mortality',
    "one_year_mortality",
    # "days_to_death_last_stay_id"
]

categorical = [
    "pat_count",
    # "hadm_count",
    "gender", "insurance",
    # "first_careunit",
    # mortality flags
    # 'icu_mortality',
    'hospital_mortality',
    'one_year_mortality',
]

order = {
    "pat_count": [1, 0],
    "hadm_count": [1, 0],
    "gender": ["F", "M"],
    # "icu_mortality": [1, 0],
    "hospital_mortality": [1, 0],
    "one_year_mortality": [1, 0],
}

limit = {
    "pat_count": 1, "hadm_count": 1,
    "gender": 1,
    # "icu_mortality": 1,
    "hospital_mortality": 1,
    "one_year_mortality": 1,
}

rename = {
    "pat_count": "Distinct patients", "hadm_count": "Distinct hospitalizations",
    "age": "Age", "gender": "Administrative Gender", "insurance": "Insurance",
    "first_careunit": "First ICU stay, unit type",
    "icu_los": "ICU length of stay", "hosp_los": "Hospital length of stay",
    "icu_mortality": "In-ICU mortality",
    "hospital_mortality": "In-hospital mortality",
    "one_year_mortality": "One year mortality",
    # "days_to_death_last_stay_id": "Time to death (days)",
}

print('ICU demographics')
icu_table = TableOne(data, columns=columns, categorical=categorical, order=order, limit=limit, rename=rename)
display(icu_table)
print('Hospital demographics')
hosp_table = TableOne(hosp, columns=columns, categorical=categorical, order=order, limit=limit, rename=rename)
display(hosp_table)



ICU demographics




Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,73181
"Distinct patients, n (%)",1,0.0,50920 (69.6)
"Age, mean (SD)",,0.0,64.7 (16.9)
"Administrative Gender, n (%)",F,0.0,32363 (44.2)
"Insurance, n (%)",Medicaid,0.0,5528 (7.6)
"Insurance, n (%)",Medicare,,33091 (45.2)
"Insurance, n (%)",Other,,34562 (47.2)
"Hospital length of stay, mean (SD)",,0.0,11.0 (13.3)
"In-hospital mortality, n (%)",1,0.0,8511 (11.6)
"One year mortality, n (%)",1,0.0,28274 (38.6)


Hospital demographics




Unnamed: 0,Unnamed: 1,Missing,Overall
n,,,431231
"Distinct patients, n (%)",1,0.0,180733 (41.9)
"Age, mean (SD)",,0.0,58.8 (19.2)
"Administrative Gender, n (%)",F,0.0,224990 (52.2)
"Insurance, n (%)",Medicaid,0.0,41330 (9.6)
"Insurance, n (%)",Medicare,,160560 (37.2)
"Insurance, n (%)",Other,,229341 (53.2)
"Hospital length of stay, mean (SD)",,0.0,4.5 (6.6)
"In-hospital mortality, n (%)",1,0.0,8869 (2.1)
"One year mortality, n (%)",1,0.0,106218 (24.6)


### Observable Measurements (Prediction Target)

In [10]:
## NUMERIC FEATURES
## TODO: add mergers across tables and within-tables.

blood_gas = ['so2', 'po2', 'pco2', 'fio2', 'fio2_chartevents', 'aado2', 'aado2_calc', 'pao2fio2ratio', 'ph', 'baseexcess', 'bicarbonate', 'totalco2', 'hematocrit', 'hemoglobin', 'carboxyhemoglobin', 'methemoglobin', 'chloride', 'calcium', 'temperature', 'potassium', 'sodium', 'lactate', 'glucose']

blood_chemistry = ['albumin','globulin','total_protein','aniongap','bicarbonate','bun','calcium','chloride','creatinine','glucose','sodium','potassium']

cardiac_markers = ['troponin_t','ntprobnp','ck_mb']

cbc = ['hematocrit','hemoglobin','mch','mchc','mcv','platelet','rbc','rdw','rdwsd','wbc']

vital_signs = ['heart_rate','sbp','dbp','mbp','sbp_ni','dbp_ni','mbp_ni','resp_rate','temperature','spo2','glucose']

# Glasgow Coma Scale, a measure of neurological function
coma_signs = ['gcs','gcs_motor','gcs_verbal','gcs_eyes','gcs_unable']

renal_out = ['uo_rt_6hr', 'uo_rt_12hr', 'uo_rt_24hr']

renal_creat = ['creat']

renal_aki = ['aki_stage_smoothed']

In [11]:
def filter_measurements(df, hadm_id_selection=None, subject_id_selection=None):
    n = OrderedDict()
    n['n0'] = len(df)
    
    cols = df.columns
    
    # (1) Filter measurements based on admission_id selection.
    if hadm_id_selection is not None:
        df = df[df.hadm_id.isin(hadm_id_selection)]
    
    n['n adm. filter'] = len(df)
    
    
    # Merge with Admission Table
    df_ = df.merge(adm_df, on='hadm_id', how='left')

    # (2) Filter measurements based on subject_id selection
    if subject_id_selection is not None:
        df_ = df_[df_.subject_id.isin(subject_id_selection)]
    
    n['n subj. filter'] = len(df_)
        
    # (3) Filter measurements with time_bin outside the hosp stay.
    mask = df_.time_bin.between(df_.admittime, df_.dischtime)
    
    df_ = df_[mask]
    df = df_[cols]
    n['n in-patient time'] = len(df)
    
    print('\n'.join(f'{filt}: {num} ({100 * num / n["n0"]:.1f})' for filt, num in n.items()))
    print('\n========\n')
    return df

### Numeric Feature Tables

In [12]:
agg_interval = 'hour'
# For higher resolution binning see: 
# https://stackoverflow.com/questions/7299342/what-is-the-fastest-way-to-truncate-timestamps-to-5-minutes-in-postgres

# Time-binning can be specific for each table

In [13]:
renal_out_query = f"""
select icu.hadm_id,
       {', '.join(map(lambda e: f'avg(r.{e}) as {e}', renal_out))},
       date_trunc('{agg_interval}', r.charttime) time_bin
from mimiciv_derived.kdigo_uo as r
inner join mimiciv_icu.icustays as icu
 on icu.stay_id = r.stay_id
group by icu.hadm_id, time_bin
"""

renal_creat_query = f"""
select icu.hadm_id,
       {', '.join(map(lambda e: f'avg(r.{e}) as {e}', renal_creat))},
       date_trunc('{agg_interval}', r.charttime) time_bin
from mimiciv_derived.kdigo_creatinine as r
inner join mimiciv_icu.icustays as icu
 on icu.stay_id = r.stay_id
group by icu.hadm_id, time_bin
"""

renal_aki_query = f"""
select icu.hadm_id,
       {', '.join(map(lambda e: f'avg(r.{e}) as {e}', renal_aki))},
       date_trunc('{agg_interval}', r.charttime) time_bin
from mimiciv_derived.kdigo_stages as r
inner join mimiciv_icu.icustays as icu
 on icu.stay_id = r.stay_id
group by icu.hadm_id, time_bin
"""

renal_out_df = pd.read_sql_query(renal_out_query, con)
renal_creat_df = pd.read_sql_query(renal_creat_query, con)
renal_aki_df = pd.read_sql_query(renal_aki_query, con)


In [14]:
renal_hadm_id = pd.Series(renal_aki_df.hadm_id.unique())

In [15]:
adm_df['adm_has_renal_data'] = adm_df.hadm_id.isin(renal_hadm_id)
adm_df

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,adm_has_renal_data
0,10000032,22595853,2180-05-06 22:23:00,2180-05-07 17:15:00,False
1,10000032,22841357,2180-06-26 18:27:00,2180-06-27 18:49:00,False
2,10000032,25742920,2180-08-05 23:44:00,2180-08-07 17:50:00,False
3,10000032,29079034,2180-07-23 12:35:00,2180-07-25 17:55:00,True
4,10000068,25022803,2160-03-03 23:16:00,2160-03-04 06:26:00,False
...,...,...,...,...,...
431226,19999828,25744818,2149-01-08 16:44:00,2149-01-18 17:00:00,True
431227,19999828,29734428,2147-07-18 16:23:00,2147-08-04 18:10:00,False
431228,19999840,21033226,2164-09-10 13:47:00,2164-09-17 13:42:00,True
431229,19999840,26071774,2164-07-25 00:27:00,2164-07-28 12:15:00,False


In [19]:
subject_has_renal = adm_df.groupby('subject_id', as_index=False).agg(n_adm=('hadm_id', 'nunique'), adm_has_renal_data=('adm_has_renal_data', 'sum'))
subjects_with_renal_info = subject_has_renal[subject_has_renal['adm_has_renal_data'] > 0]
subjects_with_renal_info['p_adm_with_renal_info'] = subjects_with_renal_info['adm_has_renal_data'] / subjects_with_renal_info['n_adm']
subject_id_with_renal_info = subjects_with_renal_info.subject_id
# subjects_with_renal_info['p_adm_with_renal_info'].hist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subjects_with_renal_info['p_adm_with_renal_info'] = subjects_with_renal_info['adm_has_renal_data'] / subjects_with_renal_info['n_adm']


In [17]:
renal_out_df = filter_measurements(renal_out_df, hadm_id_selection=hadm_id_los_geq_12h, 
                                   subject_id_selection=subject_id_with_renal_info)
renal_creat_df = filter_measurements(renal_creat_df, hadm_id_selection=hadm_id_los_geq_12h, 
                                     subject_id_selection=subject_id_with_renal_info)
renal_aki_df = filter_measurements(renal_aki_df, hadm_id_selection=hadm_id_los_geq_12h, subject_id_selection=subject_id_with_renal_info)

n0: 3294878 (100.0)
n adm. filter: 3292534 (99.9)
n subj. filter: 3292534 (99.9)
n in-patient time: 3285718 (99.7)


n0: 568158 (100.0)
n adm. filter: 566564 (99.7)
n subj. filter: 566564 (99.7)
n in-patient time: 502249 (88.4)


n0: 3687859 (100.0)
n adm. filter: 3684212 (99.9)
n subj. filter: 3684212 (99.9)
n in-patient time: 3613898 (98.0)




In [18]:
blood_gas_query = f"""
select hadm_id,
       {', '.join(map(lambda e: f'avg(bg.{e}) as {e}', blood_gas))},
       date_trunc('{agg_interval}', bg.charttime) time_bin
from mimiciv_derived.bg as bg
group by hadm_id, time_bin
"""

bg_df = pd.read_sql_query(blood_gas_query,con)

In [19]:
bg_df = filter_measurements(bg_df, hadm_id_selection=hadm_id_los_geq_12h, 
                            subject_id_selection=subject_id_with_renal_info)

n0: 491564 (100.0)
n adm. filter: 442007 (89.9)
n subj. filter: 409545 (83.3)
n in-patient time: 404226 (82.2)




In [20]:
blood_chemistry_query = f"""
select hadm_id,
       {', '.join(map(lambda e: f'avg(ch.{e}) as {e}', blood_chemistry))},
       date_trunc('{agg_interval}', ch.charttime) time_bin
from mimiciv_derived.chemistry as ch
group by hadm_id, time_bin
"""

ch_df = pd.read_sql_query(blood_chemistry_query,con)

In [21]:
ch_df = filter_measurements(ch_df, hadm_id_selection=hadm_id_los_geq_12h, 
                            subject_id_selection=subject_id_with_renal_info)

n0: 2530982 (100.0)
n adm. filter: 1987259 (78.5)
n subj. filter: 1334968 (52.7)
n in-patient time: 1313463 (51.9)




In [22]:
cardiac_marker_query = \
f"""
WITH trop AS
(
    SELECT specimen_id, MAX(valuenum) AS troponin_t
    FROM mimiciv_hosp.labevents
    WHERE itemid = 51003
    GROUP BY specimen_id
)
SELECT
    c.hadm_id
    , date_trunc('{agg_interval}', c.charttime) time_bin
    , avg(trop.troponin_t) as troponin_t
    , avg(c.ntprobnp) as ntprobnp
    , avg(c.ck_mb) as ck_mb
FROM mimiciv_hosp.admissions a
LEFT JOIN mimiciv_derived.cardiac_marker c
  ON a.hadm_id = c.hadm_id
LEFT JOIN trop
  ON c.specimen_id = trop.specimen_id
GROUP BY c.hadm_id, time_bin
"""

cardiac_df = pd.read_sql_query(cardiac_marker_query,con)

In [23]:
cardiac_df = filter_measurements(cardiac_df, hadm_id_selection=hadm_id_los_geq_12h, 
                                 subject_id_selection=subject_id_with_renal_info)

n0: 179011 (100.0)
n adm. filter: 177838 (99.3)
n subj. filter: 122907 (68.7)
n in-patient time: 116983 (65.3)




In [24]:
temp_query = \
f"""
WITH temp_t AS (
    SELECT c.hadm_id
        , c.charttime
        , CASE
            WHEN LOWER(c.valueuom)  like '%°f%' THEN (c.valuenum - 32) / 1.8
                ELSE c.valuenum
          END AS temperature
    FROM mimiciv_hosp.admissions a
    INNER JOIN mimiciv_icu.chartevents c
      ON a.hadm_id = c.hadm_id
    INNER JOIN mimiciv_icu.d_items di
      ON c.itemid = di.itemid
    AND c.itemid IN
    (
    227632, -- Arctic Sun/Alsius Temp #1 C
    227634, -- Arctic Sun/Alsius Temp #2 C
    223761 -- Temperature Fahrenheit
    )
    AND valuenum > 10 AND valuenum < 120
)
SELECT temp_t.hadm_id
     , AVG(temp_t.temperature) temperature
     , DATE_TRUNC('{agg_interval}', temp_t.charttime) time_bin
FROM temp_t
GROUP BY temp_t.hadm_id, time_bin
"""
temp_df = pd.read_sql_query(temp_query,con)

In [25]:
temp_df = filter_measurements(temp_df, hadm_id_selection=hadm_id_los_geq_12h, 
                              subject_id_selection=subject_id_with_renal_info)

n0: 1534954 (100.0)
n adm. filter: 1533292 (99.9)
n subj. filter: 1533292 (99.9)
n in-patient time: 1526910 (99.5)




In [26]:
cbc_query = f"""
select hadm_id,
       {', '.join(map(lambda e: f'avg(cbc.{e}) as {e}', cbc))},
       date_trunc('{agg_interval}', cbc.charttime) time_bin
from mimiciv_derived.complete_blood_count as cbc
group by hadm_id, time_bin
"""
cbc_df = pd.read_sql_query(cbc_query, con)

In [27]:
cbc_df = filter_measurements(cbc_df, hadm_id_selection=hadm_id_los_geq_12h, 
                             subject_id_selection=subject_id_with_renal_info)

n0: 2463587 (100.0)
n adm. filter: 1925075 (78.1)
n subj. filter: 1259401 (51.1)
n in-patient time: 1238087 (50.3)




In [28]:
vital_query = f"""
select icu.hadm_id,
       {', '.join(map(lambda e: f'avg(v.{e}) as {e}', vital_signs))},
       date_trunc('{agg_interval}', v.charttime) time_bin
from mimiciv_derived.vitalsign as v
inner join mimiciv_icu.icustays as icu
 on icu.stay_id = v.stay_id
group by icu.hadm_id, time_bin
"""
vital_df = pd.read_sql_query(vital_query, con)

In [29]:
vital_df = filter_measurements(vital_df, hadm_id_selection=hadm_id_los_geq_12h, 
                               subject_id_selection=subject_id_with_renal_info)

n0: 5886450 (100.0)
n adm. filter: 5880273 (99.9)
n subj. filter: 5880273 (99.9)
n in-patient time: 5861008 (99.6)




In [30]:
gcs_query = f"""
select icu.hadm_id,
       {', '.join(map(lambda e: f'avg(gcs.{e}) as {e}', coma_signs))},
       date_trunc('{agg_interval}', gcs.charttime) time_bin
from mimiciv_derived.gcs as gcs
inner join mimiciv_icu.icustays as icu
 on icu.stay_id = gcs.stay_id
group by icu.hadm_id, time_bin
"""
gcs_df = pd.read_sql_query(gcs_query, con)

In [31]:
gcs_df = filter_measurements(gcs_df, hadm_id_selection=hadm_id_los_geq_12h, 
                             subject_id_selection=subject_id_with_renal_info)

n0: 1628182 (100.0)
n adm. filter: 1626537 (99.9)
n subj. filter: 1626537 (99.9)
n in-patient time: 1621695 (99.6)




In [32]:

bg_df.to_csv('bg.csv.gz', compression='gzip')
ch_df.to_csv('ch.csv.gz', compression='gzip')
cardiac_df.to_csv('cardiac.csv.gz', compression='gzip')
temp_df.to_csv('temp.csv.gz', compression='gzip')
cbc_df.to_csv('cbc.csv.gz', compression='gzip')
vital_df.to_csv('vital.csv.gz', compression='gzip')
gcs_df.to_csv('gcs.csv.gz', compression='gzip')
renal_out_df.to_csv('renal_out.csv.gz', compression='gzip')
renal_creat_df.to_csv('renal_creat.csv.gz', compression='gzip')
renal_aki_df.to_csv('renal_aki.csv.gz', compression='gzip')

### Interventions



In [91]:
## Inputs - Canonicalise

input_query = \
"""
SELECT
    a.hadm_id
    , inp.starttime as start_time
    , inp.endtime as end_time
    , di.label
    , inp.rate 
    , inp.amount
    , inp.rateuom
    , inp.amountuom
FROM mimiciv_hosp.admissions a
INNER JOIN mimiciv_icu.icustays i
    ON a.hadm_id = i.hadm_id
LEFT JOIN mimiciv_icu.inputevents inp
    ON i.stay_id = inp.stay_id
LEFT JOIN mimiciv_icu.d_items di
    ON inp.itemid = di.itemid
"""


## Procedures - Canonicalise and Refine
icuproc_query = \
"""
SELECT
    a.hadm_id
    , pe.starttime as start_time
    , pe.endtime as end_time
    , di.label
    , pe.value
FROM mimiciv_hosp.admissions a
INNER JOIN mimiciv_icu.icustays i
    ON a.hadm_id = i.hadm_id
LEFT JOIN mimiciv_icu.procedureevents pe
    ON i.stay_id = pe.stay_id
LEFT JOIN mimiciv_icu.d_items di
    ON pe.itemid = di.itemid
"""

hospicdproc_query = \
"""
select pi.hadm_id
, (pi.chartdate)::timestamp as start_time
, (pi.chartdate + interval '1 hour')::timestamp as end_time
, pi.icd_code
, pi.icd_version
, di.long_title
FROM mimiciv_hosp.procedures_icd pi
INNER JOIN mimiciv_hosp.d_icd_procedures di
  ON pi.icd_version = di.icd_version
  AND pi.icd_code = di.icd_code
INNER JOIN mimiciv_hosp.admissions a
  ON pi.hadm_id = a.hadm_id
"""


# === provider order entry (poe) <----- very messy, high irrelevance, ignore

poe_query = f"""
SELECT
    a.hadm_id
    , mimiciv_derived.DATETIME_DIFF(p.ordertime, a.admittime, 'DAY') AS offset
    , p.poe_id
    , p.order_type, p.order_subtype
    , p.transaction_type
    , pd.field_name
    , pd.field_value
FROM mimiciv_hosp.admissions a
INNER JOIN mimiciv_hosp.poe p
    ON a.hadm_id = p.hadm_id
LEFT JOIN  mimiciv_hosp.poe_detail pd
    ON p.poe_id = pd.poe_id
"""

In [92]:
input_df = pd.read_sql_query(input_query,con)

In [22]:
icuproc_df = pd.read_sql_query(icuproc_query,con)

In [23]:
hospicdproc_df = pd.read_sql_query(hospicdproc_query,con)

In [35]:
# Ignored entirely
# poe_df = pd.read_sql_query(poe_query,con)

In [24]:
def filter_interventions(df, hadm_id_selection=None, subject_id_selection=None):
    n = OrderedDict()
    n['n0'] = len(df)
    
    cols = df.columns
    
    # (1) Filter interventions based on admission_id selection.
    if hadm_id_selection is not None:
        df = df[df.hadm_id.isin(hadm_id_selection)]
    
    n['n adm. filter'] = len(df)
    
    
    # Merge with Admission Table
    df_ = df.merge(adm_df, on='hadm_id', how='left')

    # (2) Filter interventions based on subject_id selection
    if subject_id_selection is not None:
        df_ = df_[df_.subject_id.isin(subject_id_selection)]
    
    n['n subj. filter'] = len(df_)
        
    # (3) Filter interventions with intervals outside the hosp stay.
    mask1 = df_.start_time.between(df_.admittime, df_.dischtime)
    mask2 = df_.end_time.between(df_.admittime, df_.dischtime)
    n['n start-time filter'] = sum(mask1)
    n['n end-time filter'] = sum(mask1 & mask2)

    df_ = df_[mask1 & mask2]    
    
    df = df_[cols]
    n['n in-patient time'] = len(df)
    
    print('\n'.join(f'{filt}: {num} ({100 * num / n["n0"]:.1f})' for filt, num in n.items()))
    print('\n========\n')
    return df

In [93]:
input_df = filter_interventions(input_df, hadm_id_selection=hadm_id_los_geq_12h, 
                                subject_id_selection=subject_id_with_renal_info)

n0: 8979384 (100.0)
n adm. filter: 8961785 (99.8)
n subj. filter: 8961785 (99.8)
n start-time filter: 8939150 (99.6)
n end-time filter: 8922209 (99.4)
n in-patient time: 8922209 (99.4)




In [26]:
icuproc_df = filter_interventions(icuproc_df, hadm_id_selection=hadm_id_los_geq_12h, 
                                subject_id_selection=subject_id_with_renal_info)

n0: 696562 (100.0)
n adm. filter: 692909 (99.5)
n subj. filter: 692909 (99.5)
n start-time filter: 690445 (99.1)
n end-time filter: 667837 (95.9)
n in-patient time: 667837 (95.9)




In [27]:
hospicdproc_df = filter_interventions(hospicdproc_df, hadm_id_selection=hadm_id_los_geq_12h, 
                                subject_id_selection=subject_id_with_renal_info)

n0: 669186 (100.0)
n adm. filter: 653600 (97.7)
n subj. filter: 369996 (55.3)
n start-time filter: 243103 (36.3)
n end-time filter: 242961 (36.3)
n in-patient time: 242961 (36.3)




In [28]:
df = hospicdproc_df.merge(adm_df, on='hadm_id', how='left')

# timeperc_within_stay = (df['start_time'] - df['admittime']).dt.total_seconds() /(df['dischtime'] - df['admittime']).dt.total_seconds()
# timeperc_within_stay.plot.kde()

In [29]:
icd_n_subjects = df.groupby(['icd_code', 'long_title', 'icd_version'], as_index=False).agg(n_subjects=('subject_id', 'nunique'))
icd_n_subjects['p_subjects'] = icd_n_subjects['n_subjects'] / len(subject_id_with_renal_info)
# icd_n_subjects['p_subjects'].plot.kde()

In [30]:
# Conisder ICD codes with minimum coverage of 0.5% of the selected subjects.
hospicd_R1 = icd_n_subjects[icd_n_subjects['p_subjects'] > 0.005]

# Remove procedures that are purely diagnostic.
patterns = ['diag', 'fluoro', 'biops', 'inspection', 'bronchoscop', 'monitor', 'ultrasonography']

hospicd_R2_mask = hospicd_R1['long_title'].str.match('|'.join(f'(.*{p}.*)' for p in patterns), case=False)
hospicd_R2 = hospicd_R1[~hospicd_R2_mask]


hospicd_R1.to_csv('hospicd_R1.csv')
hospicd_R2.to_csv('hospicd_R2.csv')

In [31]:
# Apply filteration.
hospicdproc_df = hospicdproc_df[hospicdproc_df.icd_code.isin(hospicd_R2.icd_code)]

In [33]:
df = hospicdproc_df.merge(adm_df, on='hadm_id', how='left')

# timeperc_within_stay = (df['start_time'] - df['admittime']).dt.total_seconds() /(df['dischtime'] - df['admittime']).dt.total_seconds()
# timeperc_within_stay.plot.kde(bw_method=0.05)

In [34]:
icd_n_subjects = df.groupby(['icd_code', 'long_title', 'icd_version'], as_index=False).agg(n_subjects=('subject_id', 'nunique'))
icd_n_subjects['p_subjects'] = icd_n_subjects['n_subjects'] / len(subject_id_with_renal_info)
# icd_n_subjects['p_subjects'].plot.kde(bw_method=0.01)

In [35]:
hospicdproc_df.to_csv('int_hospicd.csv.gz', compression='gzip')

In [94]:
input_df['total_interval_hrs'] = (input_df['end_time'] - input_df['start_time']).dt.total_seconds() / 3600
input_df['amount_per_hour'] = input_df['amount'] / input_df['total_interval_hrs']

In [95]:
df = input_df.merge(adm_df, on='hadm_id', how='left')
df['count'] = 1
# timeperc_within_stay = (df['start_time'] - df['admittime']).dt.total_seconds() /(df['dischtime'] - df['admittime']).dt.total_seconds()
# timeperc_within_stay.plot.kde()

In [96]:
input_n_subjects1 = df.groupby(['label'], as_index=False).agg(
    n_subjects=('subject_id', 'nunique'),
    n=('count', 'sum'))
    
input_n_subjects1['p_subjects'] = input_n_subjects1['n_subjects'] / len(subject_id_with_renal_info)

In [97]:
input_n_subjects2 = df.groupby(['label', 'amountuom'], as_index=False).agg(
    n_subjects=('subject_id', 'nunique'),
    n=('count', 'sum'),
    mean_amount=('amount', 'mean'),
    std_amount=('amount', 'std'),
    mean_amount_per_hour=('amount_per_hour', 'mean'),
    std_amount_per_hour=('amount_per_hour', 'std'))
    
input_n_subjects2['p_subjects'] = input_n_subjects2['n_subjects'] / len(subject_id_with_renal_info)


# input_n_subjects['p_subjects'].plot.kde()

In [98]:
input_n_subjects1.to_csv('input_n_subjects1.csv')
input_n_subjects2.to_csv('input_n_subjects2.csv')

In [176]:
insulin_df = input_df[input_df.label.str.match('.*insulin.*', case=False)]

In [177]:
insulin_df

Unnamed: 0,hadm_id,start_time,end_time,label,rate,amount
361,23581541,2160-05-18 22:00:00,2160-05-18 23:00:00,Insulin - Regular,7.989103,7.989103
363,23581541,2160-05-18 23:00:00,2160-05-19 01:09:00,Insulin - Regular,5.995173,12.889622
375,23581541,2160-05-19 01:09:00,2160-05-19 02:17:00,Insulin - Regular,4.992422,5.658078
377,23581541,2160-05-19 02:17:00,2160-05-19 06:25:00,Insulin - Regular,3.997174,16.521651
380,23581541,2160-05-19 05:33:00,2160-05-19 05:34:00,Insulin - Glargine,,40.000000
...,...,...,...,...,...,...
8961130,21033226,2164-09-15 22:12:00,2164-09-15 22:13:00,Insulin - Regular,,4.000000
8961166,21033226,2164-09-15 11:00:00,2164-09-15 11:01:00,Insulin - Regular,,2.000000
8961202,21033226,2164-09-16 10:11:00,2164-09-16 10:12:00,Insulin - Regular,,2.000000
8961223,21033226,2164-09-16 16:53:00,2164-09-16 16:54:00,Insulin - Regular,,4.000000
