In [1]:
%load_ext autoreload
%autoreload 2

import sys
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append("../../..")

import lib.ehr._dataset_mimic4 as m4

In [ ]:
from lib.ehr.coding_scheme import CodingScheme

dx_icd10 = CodingScheme.from_name('dx_icd10')
dx_icd9 = CodingScheme.from_name('dx_icd9')
pr_icd10 = CodingScheme.from_name('pr_icd10')
pr_icd9 = CodingScheme.from_name('pr_icd9')

In [2]:
# Create a database connection.
# Connection settings are read from environment variables so that credentials
# are not stored in the notebook. Non-secret settings fall back to the local
# defaults previously used here; the password has no default and is prompted
# for interactively when MIMICIV_DB_PASSWORD is not set.
import os
from getpass import getpass

sqluser = os.environ.get('MIMICIV_DB_USER', 'postgres')
dbname = os.environ.get('MIMICIV_DB_NAME', 'mimiciv')
hostname = os.environ.get('MIMICIV_DB_HOST', 'localhost')
port = os.environ.get('MIMICIV_DB_PORT', '5432')
password = os.environ.get('MIMICIV_DB_PASSWORD') or getpass('MIMIC-IV database password: ')

mimiciv_config = m4.MIMICIVSQLConfig(host=hostname, port=port, user=sqluser, password=password, dbname=dbname)
mimiciv_interface = m4.MIMICIVSQL(mimiciv_config)
mimiciv_engine = mimiciv_interface.create_engine()


In [ ]:
mimiciv_interface.supported_obs_variables

In [ ]:
mimiciv_interface.supported_dx_discharge


In [ ]:
mimiciv_interface.supported_icu_input

In [ ]:
mimiciv_interface.supported_icu_procedure

In [ ]:
mimiciv_interface.supported_hosp_procedure

In [ ]:
obs_scheme = mimiciv_interface.register_obs_scheme('aki_study_obs', None)

In [3]:
dx_scheme = mimiciv_interface.register_dx_discharge_scheme('aki_study_dx', None)

In [4]:

icu_input_scheme = mimiciv_interface.register_icu_input_scheme('aki_study_icu_input', None)

In [5]:

icu_procedure_scheme = mimiciv_interface.register_icu_procedure_scheme('aki_study_icu_proc', None)

In [6]:

hosp_procedure_scheme = mimiciv_interface.register_hosp_procedure_scheme('aki_study_hosp_proc', None)


In [None]:
# Admission IDs with Length of Stay at least 12 hours.
hadm_id_los_geq_12h = hosp[hosp.hosp_los > 0.5].hadm_id
hadm_id_los_geq_12h

In [None]:
adm_df = pd.read_sql_query("""
SELECT subject_id, hadm_id, admittime, dischtime FROM mimiciv_hosp.admissions
""", con)
adm_df

In [None]:
renal_aki = ['aki_stage_smoothed']

renal_aki_query = f"""
select icu.hadm_id,
       max(aki_stage_smoothed) as aki_stage_smoothed
from mimiciv_derived.kdigo_stages as r
inner join mimiciv_icu.icustays as icu
 on icu.stay_id = r.stay_id
group by icu.hadm_id
"""
renal_aki_df = pd.read_sql_query(renal_aki_query, con)

renal_hadm_id = pd.Series(renal_aki_df.hadm_id.unique())
adm_df['adm_has_renal_data'] = adm_df.hadm_id.isin(renal_hadm_id)

subject_has_renal = adm_df.groupby('subject_id', as_index=False).agg(n_adm=('hadm_id', 'nunique'),
                                                                     adm_has_renal_data=('adm_has_renal_data', 'sum'))
subject_has_renal.head(5)

In [None]:
# Keep the subjects with at least one admission that has renal (AKI) data and
# compute, per subject, the fraction of admissions with such data.
# `.copy()` makes the filtered frame independent of `subject_has_renal`, so
# adding a column below does not raise pandas' SettingWithCopyWarning.
subjects_with_renal_info = subject_has_renal[subject_has_renal['adm_has_renal_data'] > 0].copy()
subjects_with_renal_info['p_adm_with_renal_info'] = (
    subjects_with_renal_info.adm_has_renal_data / subjects_with_renal_info.n_adm)
subject_id_with_renal_info = subjects_with_renal_info.subject_id

In [None]:
hosp2 = hosp1[hosp1.hadm_id.isin(hadm_id_los_geq_12h)]
hosp3 = hosp2[hosp2.subject_id.isin(subject_id_with_renal_info)]

In [None]:
hosp2_table = TableOne(hosp2, columns=hosp_columns, categorical=hosp_categorical,
                       order=hosp_order, limit=hosp_limit, rename=hosp_rename)
display(hosp2_table)

In [None]:
hosp3_table = TableOne(hosp3, columns=hosp_columns, categorical=hosp_categorical,
                       order=hosp_order, limit=hosp_limit, rename=hosp_rename)
display(hosp3_table)

In [None]:
cat1 = hosp1_table.cat_table
cont1 = hosp1_table.cont_table
cat1['Selection'] = '1'
cont1['Selection'] = '1'

cat2 = hosp2_table.cat_table
cont2 = hosp2_table.cont_table
cat2['Selection'] = '2'
cont2['Selection'] = '2'

cat3 = hosp3_table.cat_table
cont3 = hosp3_table.cont_table
cat3['Selection'] = '3'
cont3['Selection'] = '3'

In [None]:
import seaborn as sns

cat = pd.concat([cat1, cat2, cat3]).reset_index()
cat = cat[cat.value.isin(('F', 1, '1'))]
cat = cat[cat.variable != 'pat_count']
cat.loc[cat.variable == 'gender', 'variable'] = 'Females'
cat.loc[cat.variable == 'hospital_mortality', 'variable'] = 'Hospital Mortality'
cat.loc[cat.variable == 'one_year_mortality', 'variable'] = 'One-Year Mortality'

cat['n'] = cat.Overall.map(lambda s: s.split(' ')[0]).astype(int)
cat['%'] = cat.Overall.map(lambda s: s.split(' ')[1].replace('(', '').replace(')', '')).astype(float)

In [None]:
cont = pd.concat([cont1, cont2, cont3]).reset_index()
cont.loc[cont.variable == 'age', 'variable'] = 'Age'
cont.loc[cont.variable == 'hosp_los', 'variable'] = 'Hospital LoS'

cont['mean'] = cont.Overall.map(lambda s: s.split(' ')[0]).astype(float)
cont['SD'] = cont.Overall.map(lambda s: s.split(' ')[1].replace('(', '').replace(')', '')).astype(float)

In [None]:
cont

In [None]:
with sns.plotting_context(font_scale=2):
    plot = sns.catplot(data=cat, x='%', y='variable', hue='Selection', kind='bar')
    plot.savefig('consort_demo.pdf')

In [None]:
# NOTE(review): this cell is identical to the cell directly above it and
# re-writes the same 'consort_demo.pdf' file — confirm whether it is needed.
with sns.plotting_context(font_scale=2):
    plot = sns.catplot(data=cat, x='%', y='variable', hue='Selection', kind='bar')
    plot.savefig('consort_demo.pdf')

In [None]:
# Bar chart of the continuous summary variables with two y-axes: the
# 'Hospital LoS' means are rescaled onto the range of the other variables
# (left axis, labelled 'Years') and read back in original units on the right
# axis (labelled 'Days').
cont_scaled = cont.copy()

mask = cont_scaled.variable == 'Hospital LoS'
# Ratio between the average mean of the non-LoS rows and that of the LoS rows.
scale = cont_scaled[~mask]['mean'].mean() / cont_scaled[mask]['mean'].mean()
cont_scaled.loc[mask, 'mean'] = cont_scaled.loc[mask, 'mean'] * scale
# cont_scaled.loc[mask, 'SD'] = cont_scaled.loc[mask, 'SD'] * scale

# Plot
fig, ax1 = plt.subplots()
g = sns.barplot(x="variable", y="mean", hue="Selection",
                data=cont_scaled, ax=ax1)

# x_coords = [p.get_x() + 0.5 * p.get_width() for p in g.patches]
# y_coords = [p.get_height() for p in g.patches]
# g.errorbar(x=x_coords, y=y_coords,
#            yerr=cont_scaled["SD"],
#            fmt="none", c="k")

# Create a second y-axis with the scaled ticks
ax1.set_ylabel('Years')
ax2 = ax1.twinx()

# Pin the secondary ticks to the primary tick positions before relabelling:
# tick labels set without fixed tick positions are not guaranteed to line up
# with the ticks that are actually drawn.
primary_ticks = ax1.get_yticks()
ax2.set_yticks(primary_ticks)
ax2.set_yticklabels(np.round(primary_ticks / scale, 1))
# Limits are applied last because setting ticks may widen the view limits.
ax2.set_ylim(ax1.get_ylim())
ax2.set_ylabel('Days')

# Save before showing: on non-inline backends `plt.show()` can release the
# figure, which would leave an empty file.
fig.savefig("consort_demo2.pdf")
plt.show()

### Observable Measurements (Prediction Target)

In [None]:
## NUMERIC FEATURES
## TODO: add mergers across tables and within-tables.

blood_gas = ['so2', 'po2', 'pco2', 'fio2', 'fio2_chartevents', 'aado2', 'aado2_calc', 'pao2fio2ratio', 'ph',
             'baseexcess', 'bicarbonate', 'totalco2', 'hematocrit', 'hemoglobin', 'carboxyhemoglobin', 'methemoglobin',
             'chloride', 'calcium', 'temperature', 'potassium', 'sodium', 'lactate', 'glucose']

blood_chemistry = ['albumin', 'globulin', 'total_protein', 'aniongap', 'bicarbonate', 'bun', 'calcium', 'chloride',
                   'creatinine', 'glucose', 'sodium', 'potassium']

cardiac_markers = ['troponin_t', 'ntprobnp', 'ck_mb']

cbc = ['hematocrit', 'hemoglobin', 'mch', 'mchc', 'mcv', 'platelet', 'rbc', 'rdw', 'wbc']

vital_signs = ['heart_rate', 'sbp', 'dbp', 'mbp', 'sbp_ni', 'dbp_ni', 'mbp_ni', 'resp_rate', 'temperature', 'spo2',
               'glucose']

# Glasgow Coma Scale, a measure of neurological function
coma_signs = ['gcs', 'gcs_motor', 'gcs_verbal', 'gcs_eyes', 'gcs_unable']

renal_out = ['uo_rt_6hr', 'uo_rt_12hr', 'uo_rt_24hr']

renal_creat = ['creat']

renal_aki = ['aki_stage_smoothed']

### Numeric Feature Tables

In [None]:
renal_out_query = f"""
select icu.hadm_id,
       {', '.join(renal_out)},
       charttime time_bin
from mimiciv_derived.kdigo_uo t
inner join mimiciv_icu.icustays icu
on icu.stay_id = t.stay_id
"""

renal_creat_query = f"""
select hadm_id,
       {', '.join(renal_creat)},
       charttime time_bin
from mimiciv_derived.kdigo_creatinine
"""

renal_aki_query = f"""
select hadm_id,
       {', '.join(renal_aki)},
       charttime time_bin
from mimiciv_derived.kdigo_stages
"""

renal_out_df = pd.read_sql_query(renal_out_query, con)
renal_creat_df = pd.read_sql_query(renal_creat_query, con)
renal_aki_df = pd.read_sql_query(renal_aki_query, con)


In [None]:
renal_aki_df.aki_stage_smoothed.max()

In [None]:
renal_hadm_id = pd.Series(renal_aki_df.hadm_id.unique())

In [None]:
adm_df['adm_has_renal_data'] = adm_df.hadm_id.isin(renal_hadm_id)
adm_df




In [None]:
# Per subject: number of distinct admissions and number of admissions that
# have renal (AKI) data.
subject_has_renal = adm_df.groupby('subject_id', as_index=False).agg(n_adm=('hadm_id', 'nunique'),
                                                                     adm_has_renal_data=('adm_has_renal_data', 'sum'))
# Keep subjects with at least one such admission. `.copy()` detaches the
# filtered frame so the column assignment below does not raise pandas'
# SettingWithCopyWarning.
subjects_with_renal_info = subject_has_renal[subject_has_renal['adm_has_renal_data'] > 0].copy()
subjects_with_renal_info['p_adm_with_renal_info'] = (
    subjects_with_renal_info['adm_has_renal_data'] / subjects_with_renal_info['n_adm'])
subject_id_with_renal_info = subjects_with_renal_info.subject_id
# subjects_with_renal_info['p_adm_with_renal_info'].hist()

In [None]:
adm_df.subject_id.nunique()

In [None]:
subject_id_with_renal_info

In [None]:
renal_out_df = filter_measurements(renal_out_df, hadm_id_selection=hadm_id_los_geq_12h,
                                   subject_id_selection=subject_id_with_renal_info)
renal_creat_df = filter_measurements(renal_creat_df, hadm_id_selection=hadm_id_los_geq_12h,
                                     subject_id_selection=subject_id_with_renal_info)
renal_aki_df = filter_measurements(renal_aki_df, hadm_id_selection=hadm_id_los_geq_12h,
                                   subject_id_selection=subject_id_with_renal_info)

In [None]:
# SOFA score (`sofa_24hours`) per ICU stay, keyed by hospital admission and
# time-stamped with the end of the scoring window.
# `hadm_id` is qualified with the `icu` alias, as in the other stay-level
# joins of this notebook, so the column reference cannot be ambiguous.
# No placeholders are interpolated, so a plain string is used.
sofa_query = """
select icu.hadm_id,
      s.sofa_24hours as sofa ,
      s.endtime time_bin
from mimiciv_derived.sofa as s
inner join mimiciv_icu.icustays icu on s.stay_id = icu.stay_id
"""

sofa_df = pd.read_sql_query(sofa_query, con)

In [None]:
sofa_df = filter_measurements(sofa_df, hadm_id_selection=hadm_id_los_geq_12h,
                              subject_id_selection=subject_id_with_renal_info)

In [None]:
blood_gas_query = f"""
select hadm_id,
       {', '.join(blood_gas)},
       charttime time_bin
from mimiciv_derived.bg as bg
where hadm_id is not null
"""

bg_df = pd.read_sql_query(blood_gas_query, con)

In [None]:
bg_df = filter_measurements(bg_df, hadm_id_selection=hadm_id_los_geq_12h,
                            subject_id_selection=subject_id_with_renal_info)

In [None]:
blood_chemistry_query = f"""
select hadm_id,
       {', '.join(blood_chemistry)},
       charttime time_bin
from mimiciv_derived.chemistry as ch
where hadm_id is not null
"""

ch_df = pd.read_sql_query(blood_chemistry_query, con)

In [None]:
ch_df.dtypes

In [None]:
ch_df = filter_measurements(ch_df, hadm_id_selection=hadm_id_los_geq_12h,
                            subject_id_selection=subject_id_with_renal_info)

In [None]:
cardiac_marker_query = \
    f"""
WITH trop AS
(
    SELECT specimen_id, MAX(valuenum) AS troponin_t
    FROM mimiciv_hosp.labevents
    WHERE itemid = 51003
    GROUP BY specimen_id
)
SELECT
    c.hadm_id
    , charttime time_bin
    , trop.troponin_t
    , c.ntprobnp
    , c.ck_mb
FROM mimiciv_hosp.admissions a
LEFT JOIN mimiciv_derived.cardiac_marker c
  ON a.hadm_id = c.hadm_id
LEFT JOIN trop
  ON c.specimen_id = trop.specimen_id
WHERE c.hadm_id is not null
"""

cardiac_df = pd.read_sql_query(cardiac_marker_query, con)

In [None]:
cardiac_df.dtypes

In [None]:
def filter_measurements(df, hadm_id_selection=None, subject_id_selection=None, admissions=None):
    """Restrict a table of time-stamped measurements to the selected cohort.

    Three filters are applied in order, and the remaining row count (with its
    percentage of the input size) is printed after each step:

    1. keep rows whose ``hadm_id`` is in ``hadm_id_selection`` (if given);
    2. keep rows whose subject, looked up through the admissions table, is in
       ``subject_id_selection`` (if given);
    3. keep rows whose ``time_bin`` lies within ``[admittime, dischtime]`` of
       the admission.

    Parameters
    ----------
    df : pandas.DataFrame
        Measurements with at least the columns ``hadm_id`` and ``time_bin``.
    hadm_id_selection : iterable, optional
        Admission IDs to keep; ``None`` skips filter (1).
    subject_id_selection : iterable, optional
        Subject IDs to keep; ``None`` skips filter (2).
    admissions : pandas.DataFrame, optional
        Admissions table with the columns ``hadm_id``, ``subject_id``,
        ``admittime`` and ``dischtime``. Defaults to the notebook-level
        ``adm_df``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows, with the same columns as
        ``df``.
    """
    if admissions is None:
        # Backward-compatible default: the notebook-level admissions table.
        admissions = adm_df

    n = OrderedDict()
    n['n0'] = len(df)

    cols = df.columns

    # (1) Filter measurements based on admission_id selection.
    if hadm_id_selection is not None:
        df = df[df.hadm_id.isin(hadm_id_selection)]

    n['n adm. filter'] = len(df)

    # Attach only the admission columns needed below, so unrelated admission
    # columns cannot collide with measurement columns during the merge.
    lookup = admissions[['hadm_id', 'subject_id', 'admittime', 'dischtime']]
    df_ = df.merge(lookup, on='hadm_id', how='left')

    # (2) Filter measurements based on subject_id selection
    if subject_id_selection is not None:
        df_ = df_[df_.subject_id.isin(subject_id_selection)]

    n['n subj. filter'] = len(df_)

    # (3) Filter measurements with time_bin outside the hosp stay.
    in_stay = df_.time_bin.between(df_.admittime, df_.dischtime)

    # Explicit copy: callers add columns to the result afterwards.
    df = df_.loc[in_stay, cols].copy()
    n['n in-patient time'] = len(df)

    # Guard the percentage against an empty input table.
    total = n['n0']
    print('\n'.join(f'{filt}: {num} ({(100 * num / total if total else 0.0):.1f})' for filt, num in n.items()))
    print('\n========\n')
    return df

In [None]:
cardiac_df = filter_measurements(cardiac_df, hadm_id_selection=hadm_id_los_geq_12h,
                                 subject_id_selection=subject_id_with_renal_info)

In [None]:
weight_query = f"""
select icu.hadm_id,
     w.weight,
        w.time_bin
 from (
 (select stay_id, w.weight, w.starttime time_bin
  from mimiciv_derived.weight_durations as w)
 union all
 (select stay_id, w.weight, w.endtime time_bin
     from mimiciv_derived.weight_durations as w)
 ) w
inner join mimiciv_icu.icustays icu on w.stay_id = icu.stay_id
"""

weight_df = pd.read_sql_query(weight_query, con)

In [None]:
weight_df = filter_measurements(weight_df, hadm_id_selection=hadm_id_los_geq_12h,
                                subject_id_selection=subject_id_with_renal_info)

In [None]:
cbc_query = f"""
select hadm_id,
       {', '.join(cbc)},
       cbc.charttime time_bin
from mimiciv_derived.complete_blood_count as cbc
where hadm_id is not null
"""
cbc_df = pd.read_sql_query(cbc_query, con)

In [None]:
cbc_df.dtypes

In [None]:
cbc_df = filter_measurements(cbc_df, hadm_id_selection=hadm_id_los_geq_12h,
                             subject_id_selection=subject_id_with_renal_info)

In [None]:
vital_query = f"""
select icu.hadm_id,
       {', '.join(vital_signs)},
       v.charttime time_bin
from mimiciv_derived.vitalsign as v
inner join mimiciv_icu.icustays as icu
 on icu.stay_id = v.stay_id
"""
vital_df = pd.read_sql_query(vital_query, con)

In [None]:
vital_df = filter_measurements(vital_df, hadm_id_selection=hadm_id_los_geq_12h,
                               subject_id_selection=subject_id_with_renal_info)

In [None]:
gcs_query = f"""
select icu.hadm_id,
       {', '.join(coma_signs)},
       gcs.charttime time_bin
from mimiciv_derived.gcs as gcs
inner join mimiciv_icu.icustays as icu
 on icu.stay_id = gcs.stay_id
"""
gcs_df = pd.read_sql_query(gcs_query, con)

In [None]:
gcs_df = filter_measurements(gcs_df, hadm_id_selection=hadm_id_los_geq_12h,
                             subject_id_selection=subject_id_with_renal_info)

In [None]:
obs_tables = {'blood_gas': bg_df,
              'chemistry': ch_df,
              'cardiac_marker': cardiac_df,
              'weight': weight_df,
              'cbc': cbc_df,
              'vital': vital_df,
              'gcs': gcs_df,
              'renal_out': renal_out_df,
              'renal_creat': renal_creat_df,
              'renal_aki': renal_aki_df,
              'sofa': sofa_df}
obs_columns = {table_name: set(df.columns) - {'time_bin', 'hadm_id'} for table_name, df in obs_tables.items()}

In [None]:
obs_group = {col: group for group, cols in obs_columns.items() for col in cols}
obs_codes = pd.DataFrame({'label': sorted(obs_group), 'group': map(obs_group.get, sorted(obs_group))})
obs_codes = obs_codes.sort_values(by='group').reset_index(drop=True)
obs_codes['code'] = obs_codes.index.map(lambda i: f'o{i:02d}')
code_map = dict(zip(obs_codes.label, obs_codes.code))

In [None]:
# Reshape every wide observation table to long format: one row per
# (hadm_id, time_bin, code) holding a single value.
# - `var_name` is passed as a scalar, which is what `pd.melt` documents.
# - `value_vars` is a sorted list rather than a set, so the order of the
#   output rows is reproducible between runs.
obs_melted_tables = {k: pd.melt(df, id_vars=['hadm_id', 'time_bin'],
                                var_name='code',
                                value_vars=sorted(obs_columns[k])) for k, df in
                     obs_tables.items()}

# Drop the entries that carry no value.
obs_melted_tables = {k: df[df.value.notnull()] for k, df in obs_melted_tables.items()}

In [None]:
obs_df = pd.concat(obs_melted_tables.values(), axis=0)

In [None]:
obs_df['code'] = obs_df['code'].map(code_map)
obs_df

In [None]:
obs_df.to_csv('obs_df.csv.gz', compression='gzip')
obs_codes.to_csv('mimic4_obs_codes.csv.gz', compression='gzip')

In [None]:
obs_codes

### Interventions



In [None]:
## Inputs - Canonicalise

input_query = \
    """
    SELECT
        a.hadm_id
        , inp.starttime as start_time
        , inp.endtime as end_time
        , di.label
        , di.itemid
        , inp.rate 
        , inp.amount
        , inp.rateuom
        , inp.amountuom
    FROM mimiciv_hosp.admissions a
    INNER JOIN mimiciv_icu.icustays i
        ON a.hadm_id = i.hadm_id
    LEFT JOIN mimiciv_icu.inputevents inp
        ON i.stay_id = inp.stay_id
    LEFT JOIN mimiciv_icu.d_items di
        ON inp.itemid = di.itemid
    """

## Procedures - Canonicalise and Refine
icuproc_query = \
    """
    SELECT
        a.hadm_id
        , pe.starttime as start_time
        , pe.endtime as end_time
        , di.label
        , di.itemid
        , pe.value
    FROM mimiciv_hosp.admissions a
    INNER JOIN mimiciv_icu.icustays i
        ON a.hadm_id = i.hadm_id
    LEFT JOIN mimiciv_icu.procedureevents pe
        ON i.stay_id = pe.stay_id
    LEFT JOIN mimiciv_icu.d_items di
        ON pe.itemid = di.itemid
    """

hospicdproc_query = \
    """
    select pi.hadm_id
    , (pi.chartdate)::timestamp as start_time
    , (pi.chartdate + interval '1 hour')::timestamp as end_time
    , pi.icd_code
    , pi.icd_version
    , di.long_title
    FROM mimiciv_hosp.procedures_icd pi
    INNER JOIN mimiciv_hosp.d_icd_procedures di
      ON pi.icd_version = di.icd_version
      AND pi.icd_code = di.icd_code
    INNER JOIN mimiciv_hosp.admissions a
      ON pi.hadm_id = a.hadm_id
    """

# === provider order entry (poe) <----- very messy, high irrelevance, ignore

poe_query = f"""
SELECT
    a.hadm_id
    , mimiciv_derived.DATETIME_DIFF(p.ordertime, a.admittime, 'DAY') AS offset
    , p.poe_id
    , p.order_type, p.order_subtype
    , p.transaction_type
    , pd.field_name
    , pd.field_value
FROM mimiciv_hosp.admissions a
INNER JOIN mimiciv_hosp.poe p
    ON a.hadm_id = p.hadm_id
LEFT JOIN  mimiciv_hosp.poe_detail pd
    ON p.poe_id = pd.poe_id
"""

In [None]:
input_df = pd.read_sql_query(input_query, con)

In [None]:
# Only `itemid` is read as text. A blanket `dtype=str` would also stringify
# `hadm_id`, the start/end timestamps and `value`, which the admission filter,
# the merge with the admissions table and the interval/value arithmetic in the
# later cells need as numeric/datetime columns.
icuproc_df = pd.read_sql_query(icuproc_query, con, dtype={'itemid': str})

In [None]:
# Only the ICD identifiers are read as text. A blanket `dtype=str` would also
# stringify `hadm_id` and the start/end timestamps, which the admission filter
# and the merge with the admissions table in the later cells need as
# numeric/datetime columns.
hospicdproc_df = pd.read_sql_query(hospicdproc_query, con, dtype={'icd_code': str, 'icd_version': str})

In [None]:
icuproc_df

In [None]:
# Ignored entirely
# poe_df = pd.read_sql_query(poe_query,con)

In [None]:
def filter_interventions(df, hadm_id_selection=None, subject_id_selection=None, admissions=None):
    """Restrict a table of interval-valued interventions to the selected cohort.

    Three filters are applied in order, and the remaining row count (with its
    percentage of the input size) is printed after each step:

    1. keep rows whose ``hadm_id`` is in ``hadm_id_selection`` (if given);
    2. keep rows whose subject, looked up through the admissions table, is in
       ``subject_id_selection`` (if given);
    3. keep rows whose ``start_time`` and ``end_time`` both lie within
       ``[admittime, dischtime]`` of the admission.

    Parameters
    ----------
    df : pandas.DataFrame
        Interventions with at least the columns ``hadm_id``, ``start_time``
        and ``end_time``.
    hadm_id_selection : iterable, optional
        Admission IDs to keep; ``None`` skips filter (1).
    subject_id_selection : iterable, optional
        Subject IDs to keep; ``None`` skips filter (2).
    admissions : pandas.DataFrame, optional
        Admissions table with the columns ``hadm_id``, ``subject_id``,
        ``admittime`` and ``dischtime``. Defaults to the notebook-level
        ``adm_df``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows, with the same columns as
        ``df``.
    """
    if admissions is None:
        # Backward-compatible default: the notebook-level admissions table.
        admissions = adm_df

    n = OrderedDict()
    n['n0'] = len(df)

    cols = df.columns

    # (1) Filter interventions based on admission_id selection.
    if hadm_id_selection is not None:
        df = df[df.hadm_id.isin(hadm_id_selection)]

    n['n adm. filter'] = len(df)

    # Attach only the admission columns needed below, so unrelated admission
    # columns cannot collide with intervention columns during the merge.
    lookup = admissions[['hadm_id', 'subject_id', 'admittime', 'dischtime']]
    df_ = df.merge(lookup, on='hadm_id', how='left')

    # (2) Filter interventions based on subject_id selection
    if subject_id_selection is not None:
        df_ = df_[df_.subject_id.isin(subject_id_selection)]

    n['n subj. filter'] = len(df_)

    # (3) Filter interventions with intervals outside the hosp stay.
    starts_in_stay = df_.start_time.between(df_.admittime, df_.dischtime)
    ends_in_stay = df_.end_time.between(df_.admittime, df_.dischtime)
    in_stay = starts_in_stay & ends_in_stay
    # Vectorised counts instead of Python-level sum() over the Series.
    n['n start-time filter'] = int(starts_in_stay.sum())
    n['n end-time filter'] = int(in_stay.sum())

    # Explicit copy: callers add columns to the result afterwards.
    df = df_.loc[in_stay, cols].copy()
    n['n in-patient time'] = len(df)

    # Guard the percentage against an empty input table.
    total = n['n0']
    print('\n'.join(f'{filt}: {num} ({(100 * num / total if total else 0.0):.1f})' for filt, num in n.items()))
    print('\n========\n')
    return df

In [None]:
input_df = filter_interventions(input_df, hadm_id_selection=hadm_id_los_geq_12h,
                                subject_id_selection=subject_id_with_renal_info)

In [None]:
input_df.label.nunique()

In [None]:
icuproc_df = filter_interventions(icuproc_df, hadm_id_selection=hadm_id_los_geq_12h,
                                  subject_id_selection=subject_id_with_renal_info)

In [None]:
hospicdproc_df = filter_interventions(hospicdproc_df, hadm_id_selection=hadm_id_los_geq_12h,
                                      subject_id_selection=subject_id_with_renal_info)

In [None]:
hospicdproc_df

### `hospicdproc` + `icuproc`: further filtering and grouping

In [None]:
df = hospicdproc_df.merge(adm_df, on='hadm_id', how='left')

# timeperc_within_stay = (df['start_time'] - df['admittime']).dt.total_seconds() /(df['dischtime'] - df['admittime']).dt.total_seconds()
# timeperc_within_stay.plot.kde()

In [None]:
icd_n_subjects = df.groupby(['icd_code', 'long_title', 'icd_version'], as_index=False).agg(
    n_subjects=('subject_id', 'nunique'))
icd_n_subjects['p_subjects'] = icd_n_subjects['n_subjects'] / len(subject_id_with_renal_info)
# icd_n_subjects['p_subjects'].plot.kde()

In [None]:
# Consider ICD codes with a minimum coverage of 0.5% of the selected subjects.
min_coverage_mask = icd_n_subjects['p_subjects'] > 0.005
hospicd_R1 = icd_n_subjects[min_coverage_mask]

# Remove diagnostic procedures: any title containing one of these fragments
# (case-insensitive) is dropped.
patterns = ['diag', 'fluoro', 'biops', 'inspection', 'bronchoscop', 'monitor', 'ultrasonography']
diagnostic_regex = '|'.join(f'(.*{p}.*)' for p in patterns)

hospicd_R2_mask = hospicd_R1['long_title'].str.match(diagnostic_regex, case=False)
hospicd_R2 = hospicd_R1[~hospicd_R2_mask]

# Persist both selection stages for manual inspection.
for selection, csv_path in ((hospicd_R1, 'hospicd_R1.csv'), (hospicd_R2, 'hospicd_R2.csv')):
    selection.to_csv(csv_path)

In [None]:
# Apply filteration.
hospicdproc_df = hospicdproc_df[hospicdproc_df.icd_code.isin(hospicd_R2.icd_code)]

In [None]:
df = hospicdproc_df.merge(adm_df, on='hadm_id', how='left')

# timeperc_within_stay = (df['start_time'] - df['admittime']).dt.total_seconds() /(df['dischtime'] - df['admittime']).dt.total_seconds()
# timeperc_within_stay.plot.kde(bw_method=0.05)

In [None]:
icd_n_subjects = df.groupby(['icd_code', 'long_title', 'icd_version'], as_index=False).agg(
    n_subjects=('subject_id', 'nunique'))
icd_n_subjects['p_subjects'] = icd_n_subjects['n_subjects'] / len(subject_id_with_renal_info)
# icd_n_subjects['p_subjects'].plot.kde(bw_method=0.01)

In [None]:
hospicdproc_df

In [None]:
# Read the ICD identifiers as text: if the CSV parser infers integers, codes
# with leading zeros are altered and no longer compare equal to the string
# codes held in `hospicdproc_df` when the selection is applied below.
hospicdproc_grouper = pd.read_csv('int_grouper_hospicdproc.csv',
                                  dtype={'icd_code': str, 'icd_version': str})
hospicdproc_grouper

In [None]:
n1 = len(hospicdproc_df)
icd_selection = set(hospicdproc_grouper.icd_code)
print(hospicdproc_grouper.icd_code.nunique(), len(hospicdproc_grouper))



In [None]:
hospicdproc_df = hospicdproc_df[hospicdproc_df.icd_code.isin(icd_selection)]
n2 = len(hospicdproc_df)
n1, n2

#### Standard Procedure Table

In [None]:
def hospicd_coder(v, c):
    """Build the combined procedure code 'v<icd_version>:c<icd_code>'."""
    return f"v{v}:c{c}"


# `assign` returns new frames instead of writing a column into
# `hospicdproc_df`, which is a filtered slice at this point (in-place
# assignment raises pandas' SettingWithCopyWarning).
hospicdproc_df = hospicdproc_df.assign(
    code=list(map(hospicd_coder, hospicdproc_df["icd_version"], hospicdproc_df["icd_code"])))
hospicdproc_grouper = hospicdproc_grouper.assign(
    code=list(map(hospicd_coder, hospicdproc_grouper["icd_version"], hospicdproc_grouper["icd_code"])),
    label=hospicdproc_grouper["long_title"])

# Reduce both tables to the standard column layout.
hospicdproc_df = hospicdproc_df[["hadm_id", "start_time", "end_time", "code"]]
hospicdproc_grouper = hospicdproc_grouper[["code", "group", "label"]]

### `icuproc` refinement

In [None]:
icuproc_df

In [None]:
icuproc_df['total_interval_hrs'] = (icuproc_df['end_time'] - icuproc_df['start_time']).dt.total_seconds() / 3600
icuproc_df['total_interval_mins'] = (icuproc_df['end_time'] - icuproc_df['start_time']).dt.total_seconds() / 60
icuproc_df['value_per_hour'] = icuproc_df['value'] / icuproc_df['total_interval_hrs']

df = icuproc_df.merge(adm_df, on='hadm_id', how='left')
df['count'] = 1

In [None]:
icuproc_n_subjects = df.groupby('label', as_index=False).agg(
    n_subjects=('subject_id', 'nunique'),
    n=('count', 'sum'),
    mean_value=('value', 'mean'),
    std_value=('value', 'std'),
    mean_value_per_hour=('value_per_hour', 'mean'),
    std_value_per_hour=('value_per_hour', 'std'),
    mean_interval_minutes=('total_interval_mins', 'mean'),
    std_interval_minutes=('total_interval_mins', 'std'))


In [None]:
icuproc_n_subjects.to_csv('icuproc_n_subjects.csv')

In [None]:
int_icuproc_grouper = pd.read_csv('int_grouper_icuproc.csv')

In [None]:
n1 = len(icuproc_df)
exclude_icuproc = set(int_icuproc_grouper[int_icuproc_grouper.group == 'exclude'].label)

icuproc_df = icuproc_df[~icuproc_df.label.isin(exclude_icuproc)]
n2 = len(icuproc_df)
n1, n2

#### Standard Procedure Table

In [None]:
# For ICU procedures the item label doubles as the code.
# `assign` returns new frames instead of writing a column into `icuproc_df`,
# which is a filtered slice at this point (in-place assignment raises pandas'
# SettingWithCopyWarning). The former `label = label` self-assignment was a
# no-op and has been dropped.
icuproc_df = icuproc_df.assign(code=icuproc_df["label"])
int_icuproc_grouper = int_icuproc_grouper.assign(code=int_icuproc_grouper["label"])

# Reduce both tables to the standard column layout.
icuproc_df = icuproc_df[["hadm_id", "start_time", "end_time", "code"]]
int_icuproc_grouper = int_icuproc_grouper[["code", "group", "label"]]

In [None]:
int_proc_df = pd.concat([hospicdproc_df, icuproc_df], axis=0)
int_grouper_proc = pd.concat([hospicdproc_grouper, int_icuproc_grouper], axis=0)

int_proc_df.to_csv('int_proc.csv.gz', compression='gzip')
int_grouper_proc.to_csv('mimic4_int_grouper_proc.csv.gz', compression='gzip')

In [None]:
int_grouper_proc2 = int_grouper_proc.set_index(['group', 'code']).sort_index()
with pd.ExcelWriter('int_grouper_proc2.xlsx') as writer:
    int_grouper_proc2.to_excel(writer, sheet_name='Sheet1', merge_cells=True)

In [None]:
int_grouper_proc

### `input` filtering and refinement

In [None]:
input_df['total_interval_hrs'] = (input_df['end_time'] - input_df['start_time']).dt.total_seconds() / 3600
input_df['total_interval_mins'] = (input_df['end_time'] - input_df['start_time']).dt.total_seconds() / 60
input_df['amount_per_hour'] = input_df['amount'] / input_df['total_interval_hrs']

In [None]:
df = input_df.merge(adm_df, on='hadm_id', how='left')
df['count'] = 1

In [None]:
input_n_subjects = df.groupby(['label', 'amountuom'], as_index=False).agg(
    n_subjects=('subject_id', 'nunique'),
    n=('count', 'sum'),
    mean_amount=('amount', 'mean'),
    std_amount=('amount', 'std'),
    mean_amount_per_hour=('amount_per_hour', 'mean'),
    std_amount_per_hour=('amount_per_hour', 'std'),
    mean_interval_minutes=('total_interval_mins', 'mean'),
    std_interval_minutes=('total_interval_mins', 'std'))


In [None]:
input_n_subjects.to_csv('input_n_subjects.csv')


In [None]:
"""
from IPython.display import display

input_n_subjects_tom = pd.read_csv('input_n_subjects_tom.csv', index_col=[0])
# Change 'M' (maybe) to 'Y' (yes)
input_n_subjects_tom.loc[input_n_subjects_tom['dose_impact'] == 'M', 'dose_impact'] = 'Y'

# Add column for unit normalisation, default=1.0
input_n_subjects_tom['rate_normaliser'] = 1.0

# Add label groups
input_n_subjects_tom['group'] = input_n_subjects_tom.label.map(input_group_map)

# Default group decision:

init_decision = {}
for group, group_df in input_n_subjects_tom.groupby('group'):
    dose_impact = list(group_df['dose_impact'].unique())
    
    assert len(dose_impact) == 1, f"Incosistant dose impact decision {group}."
    dose_impact = dose_impact[0]
    if dose_impact == 'Y':
        if group_df.label.nunique() > 1:
            if group_df.amountuom.nunique() > 1:
                init_decision[group] = 'DS(or DH, norm)'
            else:
                init_decision[group] = 'DS(or DH)'
        else:
            if group_df.amountuom.nunique() > 1:
                init_decision[group] = 'DH(norm)'
            else:
                init_decision[group] = 'DH'
    elif dose_impact == 'N':
        
        if group_df.label.nunique() > 1:
            init_decision[group] = 'H(or S)'
        else:
            init_decision[group] = 'H'
    else:
        init_decision[group] = 'E'

input_n_subjects_tom['group_decision'] = input_n_subjects_tom['group'].map(init_decision)

# Change the order of rate_normaliser column
input_n_subjects_tom.insert(1, 'rate_normaliser', input_n_subjects_tom.pop('rate_normaliser'))

input_n_subjects_tom = input_n_subjects_tom.set_index(['group', 'dose_impact', 'group_decision', 'label', 'amountuom'])
"""

In [None]:
"""
with pd.ExcelWriter('input_label_groups_notes.xlsx') as writer:
    input_n_subjects_tom.to_excel(writer, sheet_name='Sheet1', merge_cells = True)
"""

In [None]:
# Load the curated input grouping decisions and forward-fill blank cells.
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')`.
# NOTE(review): the blanks are presumably left by merged cells in the
# spreadsheet — confirm against the file.
int_grouper_input_df = pd.read_excel('input_label_groups_decisions.xlsx').ffill()

In [None]:
int_grouper_input_df

In [None]:
int_grouper_input_df.group_decision.unique()

In [None]:
int_grouper_input_df.to_csv('mimic4_int_grouper_input.csv.gz', compression='gzip')

#### Normalise rates of different units for the same input item

In [None]:
# Lookup table from '<label>-<amountuom>' to the rate normaliser of that
# (item, unit) pair.
grouper_keys = [f'{lbl}-{uom}'
                for lbl, uom in zip(int_grouper_input_df.label, int_grouper_input_df.amountuom)]
rate_scaler = dict(zip(grouper_keys, int_grouper_input_df.rate_normaliser))

# Build the same composite key for every input event, then rescale its hourly
# amount by the matching normaliser.
input_df['label-uom'] = [f'{lbl}-{uom}' for lbl, uom in zip(input_df.label, input_df.amountuom)]
rate_factor = input_df['label-uom'].map(rate_scaler)
input_df['normalised_amount_per_hour'] = input_df['amount_per_hour'] * rate_factor

In [None]:
input_df = input_df[['hadm_id', 'start_time', 'end_time', 'label', 'normalised_amount_per_hour']]

In [None]:
input_df.to_csv('int_input.csv.gz', compression='gzip')

In [None]:
input_df

## Dx Codes Filtering

In [None]:
def filter_discharge_codes(df, subject_id_selection=None):
    """Restrict discharge diagnosis codes to multi-admission, selected subjects.

    Two filters are applied in order, and the remaining row count (with its
    percentage of the input size) is printed after each step:

    1. drop rows of subjects that appear with only one distinct ``hadm_id``
       in ``df``;
    2. keep rows whose ``subject_id`` is in ``subject_id_selection`` (if
       given).

    Parameters
    ----------
    df : pandas.DataFrame
        Diagnosis rows with at least the columns ``subject_id`` and
        ``hadm_id``.
    subject_id_selection : iterable, optional
        Subject IDs to keep; ``None`` skips filter (2).

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with the same columns as ``df``.
    """
    n = OrderedDict()
    n['n0'] = len(df)

    # (1) Drop discharge codes of subjects with a single admission.
    adm_counts = df.groupby('subject_id')['hadm_id'].nunique()
    multi_adm_subjects = set(adm_counts.index[adm_counts > 1])
    df = df[df.subject_id.isin(multi_adm_subjects)]

    n['n adm. > 1 filter'] = len(df)

    # (2) Filter discharge dx codes based on subject_id selection
    if subject_id_selection is not None:
        df = df[df.subject_id.isin(subject_id_selection)]

    n['n subj. filter'] = len(df)

    # Guard the percentage against an empty input table.
    total = n['n0']
    print('\n'.join(f'{filt}: {num} ({(100 * num / total if total else 0.0):.1f})' for filt, num in n.items()))
    print('\n========\n')
    return df

In [None]:
dx_query = f"""
SELECT
    hadm_id, subject_id, icd_code, icd_version
FROM mimiciv_hosp.diagnoses_icd 
"""
dx_df = pd.read_sql_query(dx_query, con)

In [None]:
dx_df = filter_discharge_codes(dx_df, subject_id_selection=subject_id_with_renal_info)

In [None]:
dx_df

In [None]:
dx_df.to_csv('dx_df.csv.gz', compression='gzip')

## Admissions & Static Tables 

In [None]:
def filter_admissions(df, subject_id_selection=None):
    """Restrict a subject-keyed table to the selected subjects.

    Prints the row count (with its percentage of the input size) before and
    after the filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least a ``subject_id`` column (used in this notebook for
        both the admissions and the static tables).
    subject_id_selection : iterable, optional
        Subject IDs to keep; ``None`` leaves ``df`` unfiltered.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with the same columns as ``df``.
    """
    n = OrderedDict()
    n['n0'] = len(df)

    # (1) Filter rows based on subject_id selection
    if subject_id_selection is not None:
        df = df[df.subject_id.isin(subject_id_selection)]

    n['n subj. filter'] = len(df)

    # Guard the percentage against an empty input table.
    total = n['n0']
    print('\n'.join(f'{filt}: {num} ({(100 * num / total if total else 0.0):.1f})' for filt, num in n.items()))
    print('\n========\n')
    return df

In [None]:
adm_query = f"""
SELECT
    hadm_id, subject_id, admittime, dischtime, race
FROM mimiciv_hosp.admissions 
"""
adm_df = pd.read_sql_query(adm_query, con)

In [None]:
static_query = f"""
SELECT 
    p.subject_id, p.gender, a.race, p.anchor_age, p.anchor_year
    from mimiciv_hosp.patients p
    left join 
    (select subject_id, max(race) as race
    from mimiciv_hosp.admissions
    group by subject_id) as a
    on p.subject_id = a.subject_id
"""
static_df = pd.read_sql_query(static_query, con)

In [None]:
adm_df = filter_admissions(adm_df, subject_id_selection=subject_id_with_renal_info)
static_df = filter_admissions(static_df, subject_id_selection=subject_id_with_renal_info)


In [None]:
adm_df.to_csv('adm_df.csv.gz', compression='gzip')
static_df.to_csv('static_df.csv.gz', compression='gzip')

In [None]:

# static_df.groupby('race', as_index=False).agg(n=('subject_id', 'count')).to_csv('race_groups.csv', compression='gzip')
race_groups_df = pd.read_csv('race_grouper.csv', index_col=0)

In [None]:
race_groups_df.to_csv('mimic4_race_grouper.csv.gz', compression='gzip')

In [None]:
import seaborn as sns
import sys

sys.path.append("../../..")

import matplotlib.pyplot as plt

from lib.ehr.coding_scheme import MIMIC4Observables

# For every observable code: plot the value distribution and mark two candidate
# outlier bounds — mean ± 2.5 standard deviations (solid red lines) and the
# 10th/90th percentiles (dashed red lines) — with summary statistics in a text
# box. One PDF is saved per code.
C = MIMIC4Observables()

z_min = -2.5
z_max = -z_min
q_min = 0.1
q_max = 1 - q_min

# Loop-invariant: the observations table and the text-box styling.
obs_values_df = m4inpatient_dataset.df['obs']
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

for code in sorted(obs_values_df.code.unique()):
    code_df = obs_values_df[obs_values_df.code == code]

    q1 = code_df.value.quantile(q_min)
    q2 = code_df.value.quantile(q_max)
    qin_mask = code_df.value.between(q1, q2)

    mu = code_df.value.mean()
    std = code_df.value.std()
    # The z-scores are kept in a local Series rather than written into
    # `code_df`, which is a filtered slice (in-place column assignment raises
    # pandas' SettingWithCopyWarning).
    z_scores = (code_df.value - mu) / std
    x1 = mu + z_min * std
    x2 = mu + z_max * std

    zin_mask = z_scores.between(z_min, z_max)

    textstr = '\n'.join((
        f'n={len(code_df)}',
        f'(min, max): ({code_df.value.min():0.2f}, {code_df.value.max():0.2f})',
        f'p(qin): {qin_mask.mean():0.3f}',
        f'p(zin): {zin_mask.mean():0.3f}',
        f'p(zin or qin): {(qin_mask | zin_mask).mean(): 0.3f}'))

    g = sns.displot(code_df, x="value", kde=True)

    # (position, line style, colour) for the four boundary lines.
    boundary_lines = list(zip([x1, x2, q1, q2],
                              ["solid", "solid", "dashed", "dashed"],
                              ["red", "red", "red", "red"]))
    for ax in g.axes.flat:
        ax.set_title(C.desc[code])
        # place a text box in upper left in axes coords
        ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=10,
                verticalalignment='top', bbox=props)
        for x, style, color in boundary_lines:
            ax.axvline(x=x, ymin=0, ymax=1.0, linestyle=style, color=color)
    g.savefig(f"obs_outlier__{code}_{C.desc[code]}.pdf", bbox_inches='tight')
    # Release the figure: one is created per code and would otherwise stay
    # open for the lifetime of the kernel.
    plt.close(g.fig)

In [None]:
# Distinct (label, itemid) pairs of the ICU procedures.
# NOTE(review): an earlier cell reduces `icuproc_df` to the columns
# hadm_id/start_time/end_time/code, while this cell needs `label` and
# `itemid` — confirm it is run on the unreduced frame.
icuproc_df_unique = icuproc_df[['label', 'itemid']].drop_duplicates()
# Drop the pairs whose itemid is null once converted to float.
icuproc_df_unique = icuproc_df_unique[icuproc_df_unique['itemid'].astype(float).notnull()]
# Normalise itemid through float -> int -> str, so the ids end up as integer
# strings.
icuproc_df_unique = icuproc_df_unique.astype({'itemid': float}).astype({'itemid': int}).astype({'itemid': str})
#.astype({'itemid': float}).astype({'itemid': int}).astype({'itemid': str})
icuproc_df_unique

In [None]:
icu_proc_selection = pd.read_csv('/home/asem/Downloads/mimic4_int_grouper_proc.csv.gz')
icuproc_df_unique = pd.merge(icuproc_df_unique, icu_proc_selection[['label', 'group']],
                             left_on='label', right_on='label', how='inner')
icuproc_df_unique = icuproc_df_unique[icuproc_df_unique['group'] != 'exclude']
icuproc_df_unique

In [None]:
icuproc_df_unique.to_csv(
    '/home/asem/GP/ICENODE/lib/ehr/resources/mimic4_aki_study/icu_procedures_itemid_selection.csv.gz',
    compression='gzip', index=False)

In [None]:
unique_dx_codes.to_csv('/home/asem/GP/ICENODE/lib/ehr/resources/mimic4_aki_study/dx_icd_version_selection.csv',
                       index=False)

In [None]:
q = """
select icd_code, icd_version, long_title as description from mimiciv_hosp.d_icd_procedures 
"""
hosp_proc_df = pd.read_sql_query(q, con, dtype=str)
hosp_proc_df = hosp_proc_df[hosp_proc_df.description.isin(hosp_proc_selection_labels)]
hosp_proc_df.to_csv('/home/asem/GP/ICENODE/lib/ehr/resources/mimic4_aki_study/hosp_procedures_icd_selection.csv',
                    index=False)

In [None]:
# Distinct input labels of the grouper table.
# NOTE(review): `int_input_grouper` is not defined in the cells visible here
# (the grouper loaded from Excel above is named `int_grouper_input_df`) —
# confirm which table is meant.
icu_input_labels_df = pd.DataFrame({'label': int_input_grouper.label.unique()})
# Distinct (itemid, label) pairs of the ICU inputs.
# NOTE(review): an earlier cell reduces `input_df` to a column set without
# `itemid` — confirm this cell is run on a frame that still has it.
icu_input_df = input_df[['itemid', 'label']].reset_index(drop=True).drop_duplicates()

In [None]:
icu_input_labels_df

In [None]:
icu_input_df

In [None]:
icu_input_labels_df = pd.merge(icu_input_labels_df, icu_input_df,
                               left_on='label', right_on='label', how='inner')

In [None]:
icu_input_labels_df = icu_input_labels_df.astype({'itemid': 'int64'}).astype({'itemid': str})

In [None]:
icu_input_labels_df

In [None]:
icu_input_labels_df.to_csv('mimic4_aki_icu_inputs_itemid_selection.csv.gz', compression='gzip')

In [None]:
icu_input_labels_df.itemid.nunique()