In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from collections import OrderedDict
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

sys.path.append("../../..")

import lib.ehr.example_datasets.mimiciv as m4
import lib.ehr.example_datasets.mimiciv_aki as m4aki

In [2]:
# Create a database connection to the MIMIC-IV PostgreSQL instance.
# Non-secret settings may be overridden through environment variables; the
# defaults are the values this notebook has always used.
sqluser = os.environ.get('MIMICIV_DB_USER', 'postgres')
dbname = os.environ.get('MIMICIV_DB_NAME', 'mimiciv')
hostname = os.environ.get('MIMICIV_DB_HOST', 'localhost')
port = os.environ.get('MIMICIV_DB_PORT', '5432')
# The password must never be committed with the notebook: take it from the
# environment, or prompt for it interactively when the variable is unset/empty.
password = os.environ.get('MIMICIV_DB_PASSWORD') or getpass('MIMIC-IV database password: ')

mimiciv_config = m4.MIMICIVSQLTablesConfig(host=hostname, port=port, user=sqluser, password=password, dbname=dbname)
mimiciv_interface = m4.MIMICIVSQLTablesInterface(mimiciv_config)
mimiciv_engine = mimiciv_interface.create_engine()


In [3]:
# Build the dataset configuration on top of the SQL tables configuration.
mimiciv_scheme_conf = m4aki.AKIMIMICIVDatasetConfig(tables=mimiciv_config)

# Instantiate the raw dataset, then run its pipeline; `dataset` is the pipeline's result
# and `dataset_raw` is kept so the untransformed object stays available.
dataset_raw = m4aki.AKIMIMICIVDataset(mimiciv_scheme_conf)
dataset = dataset_raw.execute_pipeline()


  df[self.config.date_of_birth_alias] = anchor_date + anchor_age


Transforming Dataset:   0%|          | 0/11 [00:00<?, ?transformations/s]

In [4]:
# Inspect the static (per-subject) table of the processed dataset.
dataset.tables.static

Unnamed: 0_level_0,gender,race,anchor_age,anchor_year,date_of_birth
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10000032,F,WHITE,52,2180,2128-01-01
10001884,F,BLACK/AFRICAN AMERICAN,68,2122,2054-01-01
10002760,M,UNABLE TO OBTAIN,56,2141,2085-01-01
10004764,M,WHITE,70,2168,2098-01-01
10005348,M,WHITE,76,2128,2052-01-01
...,...,...,...,...,...
19997448,F,WHITE,52,2121,2069-01-01
19998591,F,WHITE,52,2184,2132-01-01
19999287,F,WHITE,71,2191,2120-01-01
19999297,M,MULTIPLE RACE/ETHNICITY,42,2162,2120-01-01


In [5]:
# Persist the processed dataset to disk.
# NOTE(review): overwrite=True presumably replaces an existing file of the same name — confirm.
dataset.save("mimiciv_aki_dataset.h5", overwrite=True)

## Preprocessing

Preparation into Dataset:

- Inputs:
    - selection of race (optional).
    - selection of dx_codes (optional).
    - selection of icu_proc codes (optional).
    - selection of icu_inputs codes (optional).
    - selection of hosp_proc codes (optional).
    - selection of subjects via a function indicating the availability of AKI measurements.
    - icu_input unit-of-measurement conversion table.
- Pipeline:
    - Set Indices.
    - Select subjects with at least one aki measurement (synchronize all).
    - Cast Datetime.
    - Merge overlapping admissions (remap hadm_id in all).
    - Remove subjects having admissions with negative intervals (ensure the negative interval is handled correctly when merging overlaps.).
    - Filter timestamped measurements outside the admittime/dischtime.
    - Filter interventional intervals entirely outside admittime/dischtime.
    - Clip overlapping interventional intervals to admittime/dischtime.
    - Filter unsupported codes.
    - Unit-normalize amount_per_hour.
    - Remove subjects with invalid input rates.
    - Replace Alphanumeric Codes with Integer Indices.
    - Replace timestamps with relative times from admission time.

- Output: MIMICIVDataset (7 data tables + 7 scheme objects).
    - `static_df` + (race_scheme, gender_scheme) 
    - `adm_df`
    - `dx_df` + (dx_scheme: MixedICDScheme)
    - `obs_df` + (obs_scheme: MIMICObsScheme)
    - `icu_proc_df` + (icu_proc_scheme: ItemBasedScheme)
    - `icu_input_df` + (icu_input_scheme: ItemBasedScheme)
    - `hosp_proc_df` + (hosp_proc_scheme: MixedICDScheme)
 

Preparation into DynSys interface:

- Inputs:
    - Dataset
    - target::race_scheme, target::gender_scheme
    - target::proc_scheme (groups `icu_proc_df` + `hosp_proc_df` into `proc_df`).
    - target::input (group-aggregates `icu_input_df` items)
    - target::dx_scheme
    - outcome_extractor
    - leading_config
    - aggregation configuration:
        - conditional aggregation of measurements with small timescales, e.g. quantile75(dt) < 1 hour
        - hard aggregation, fixed time_bin interval.
    - splits
    - config outlier removal + rescaling.
- Pipeline:
    - Sample Subjects.
    - Outlier removal and rescaling.
    - Vectorize:
        - dx
        - dx_history
        - obs
        - input
        - proc
    - observations conditional aggregation OR hard aggregation.
    - Vectorize:
        - lead (extracted after aggregation of obs)
    - Exclude admissions with less than 12 hours of stay (this comes after vectorization so that dx_history does not miss preceding admissions).
- Output: DynEHRSystem (original dataset + VecPatients + 5 target_schemes + outcome_extractor + leading_config)  



### Interventions



### `hospicdproc` + `icuproc`: further filtering and grouping

In [None]:
# Attach admission columns to every hospital procedure row; the left join keeps
# procedure rows even when their hadm_id has no match in adm_df.
df = hospicdproc_df.merge(adm_df, on='hadm_id', how='left')

# Disabled exploration: relative position of each procedure within its stay
# (time since admission divided by length of stay), plotted as a KDE.
# timeperc_within_stay = (df['start_time'] - df['admittime']).dt.total_seconds() /(df['dischtime'] - df['admittime']).dt.total_seconds()
# timeperc_within_stay.plot.kde()

In [None]:
# Per ICD procedure code: number of distinct subjects, and that number as a
# fraction of the subjects in `subject_id_with_renal_info`.
icd_n_subjects = (
    df
    .groupby(['icd_code', 'long_title', 'icd_version'], as_index=False)
    .agg(n_subjects=('subject_id', 'nunique'))
    .assign(p_subjects=lambda counts: counts['n_subjects'] / len(subject_id_with_renal_info))
)
# icd_n_subjects['p_subjects'].plot.kde()

### `icuproc` refinement

#### Standard Procedure Table

### `input` filtering and refinement

#### Normalise rates of different units for the same input item

In [None]:
# Lookup from the composite key "<label>-<amountuom>" to the rate normaliser
# listed for that pair in the grouper table.
rate_scaler = {
    f'{label}-{uom}': normaliser
    for label, uom, normaliser in zip(
        int_grouper_input_df.label,
        int_grouper_input_df.amountuom,
        int_grouper_input_df.rate_normaliser,
    )
}

# Build the same composite key for every input row, then scale the hourly
# amount by the normaliser found for that key.
input_df['label-uom'] = [
    f'{label}-{uom}' for label, uom in zip(input_df.label, input_df.amountuom)
]
input_df['normalised_amount_per_hour'] = input_df['amount_per_hour'] * input_df['label-uom'].map(rate_scaler)

In [None]:
# Keep only the admission id, interval bounds, label, and normalised rate columns.
input_df = input_df[['hadm_id', 'start_time', 'end_time', 'label', 'normalised_amount_per_hour']]

In [None]:
# Export the trimmed input table as a gzip-compressed CSV.
input_df.to_csv('int_input.csv.gz', compression='gzip')

In [None]:
# Preview the input table that was just exported.
input_df

## Dx Codes Filtering

In [None]:
def filter_discharge_codes(df, subject_id_selection=None):
    """Keep the rows of multi-admission subjects, optionally restricted to a subject selection.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns `subject_id` and `hadm_id`.
    subject_id_selection : collection, optional
        When given, only rows whose `subject_id` is in this collection are kept.

    Returns
    -------
    pandas.DataFrame
        The filtered rows.

    Side effects
    ------------
    Prints the row count after each filter, followed by its percentage of the
    input row count (`nan` when the input is empty).
    """
    n = OrderedDict()
    n['n0'] = len(df)

    # (1) Drop subjects with a single admission: keep only those with more
    # than one distinct hadm_id.
    n_adms = df.groupby('subject_id')['hadm_id'].nunique()
    n_adms_subjects = set(n_adms.index[n_adms > 1])
    df = df[df.subject_id.isin(n_adms_subjects)]

    n['n adm. > 1 filter'] = len(df)

    # (2) Filter discharge dx codes based on subject_id selection
    if subject_id_selection is not None:
        df = df[df.subject_id.isin(subject_id_selection)]

    n['n subj. filter'] = len(df)

    # An empty input would make the percentage a division by zero; report nan instead.
    n0 = n['n0']
    report = []
    for filt, num in n.items():
        pct = 100 * num / n0 if n0 else float('nan')
        report.append(f'{filt}: {num} ({pct:.1f})')
    print('\n'.join(report))
    print('\n========\n')
    return df

In [None]:
# Load all diagnosis codes (one row per admission and ICD code).
# Plain string: the query has no placeholders, so no f-string is needed.
dx_query = """
SELECT
    hadm_id, subject_id, icd_code, icd_version
FROM mimiciv_hosp.diagnoses_icd 
"""
# NOTE(review): `con` is not defined in the visible cells — presumably a database
# connection/engine such as `mimiciv_engine`; confirm before running on a fresh kernel.
dx_df = pd.read_sql_query(dx_query, con)

In [None]:
# Keep diagnoses of multi-admission subjects that are also in `subject_id_with_renal_info`.
dx_df = filter_discharge_codes(dx_df, subject_id_selection=subject_id_with_renal_info)

## Admissions & Static Tables 

In [None]:
def filter_admissions(df, subject_id_selection=None):
    """Restrict a table to the rows of the selected subjects.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a `subject_id` column.
    subject_id_selection : collection, optional
        When given, only rows whose `subject_id` is in this collection are
        kept; when None, `df` is returned unchanged.

    Returns
    -------
    pandas.DataFrame
        The filtered rows.

    Side effects
    ------------
    Prints the row count before and after the filter, followed by its
    percentage of the input row count (`nan` when the input is empty).
    """
    n = OrderedDict()
    n['n0'] = len(df)

    # (1) Filter rows based on subject_id selection
    if subject_id_selection is not None:
        df = df[df.subject_id.isin(subject_id_selection)]

    n['n subj. filter'] = len(df)

    # An empty input would make the percentage a division by zero; report nan instead.
    n0 = n['n0']
    report = []
    for filt, num in n.items():
        pct = 100 * num / n0 if n0 else float('nan')
        report.append(f'{filt}: {num} ({pct:.1f})')
    print('\n'.join(report))
    print('\n========\n')
    return df

In [None]:
# Restrict both the admissions and the static tables to the subjects in `subject_id_with_renal_info`.
adm_df = filter_admissions(adm_df, subject_id_selection=subject_id_with_renal_info)
static_df = filter_admissions(static_df, subject_id_selection=subject_id_with_renal_info)


In [None]:
# Export the filtered admissions and static tables as gzip-compressed CSVs.
adm_df.to_csv('adm_df.csv.gz', compression='gzip')
static_df.to_csv('static_df.csv.gz', compression='gzip')

In [None]:

# Disabled one-off export of the number of rows per race value.
# NOTE(review): the export targets 'race_groups.csv' while the read below uses
# 'race_grouper.csv' — presumably the grouper file was curated separately; confirm.
# static_df.groupby('race', as_index=False).agg(n=('subject_id', 'count')).to_csv('race_groups.csv', compression='gzip')
# Load the race grouper table, using its first column as the index.
race_groups_df = pd.read_csv('race_grouper.csv', index_col=0)

In [None]:
# Re-export the race grouper table as a gzip-compressed CSV.
race_groups_df.to_csv('mimic4_race_grouper.csv.gz', compression='gzip')

In [None]:
import seaborn as sns
import sys

# Make the project root importable when this cell is run on its own, without
# appending the same entry to sys.path more than once.
if "../../.." not in sys.path:
    sys.path.append("../../..")

from lib.ehr.coding_scheme import MIMIC4Observables

C = MIMIC4Observables()

# Outlier bounds: symmetric z-score limits and symmetric quantile limits.
z_min = -2.5
z_max = -z_min
q_min = 0.1
q_max = 1 - q_min

# Loop-invariant: look up the observations table and the text-box style once.
# NOTE(review): `m4inpatient_dataset` is not defined in the visible cells — confirm
# it exists on a fresh kernel.
df = m4inpatient_dataset.df['obs']
props = dict(boxstyle='round', facecolor='wheat', alpha=0.5)

# One histogram per observable code, annotated with both candidate outlier bounds.
for code in sorted(df.code.unique()):
    # Copy the slice so that adding the `z` column below writes to an independent
    # frame instead of a filtered view (avoids SettingWithCopyWarning).
    code_df = df[df.code == code].copy()

    # Quantile-based bounds and the mask of values inside them.
    q1 = code_df.value.quantile(q_min)
    q2 = code_df.value.quantile(q_max)
    qin_mask = code_df.value.between(q1, q2)

    # z-score-based bounds, expressed back in the value's own units (x1, x2).
    mu = code_df.value.mean()
    std = code_df.value.std()
    code_df['z'] = (code_df.value - mu) / std
    x1 = mu + z_min * std
    x2 = mu + z_max * std

    zin_mask = code_df.z.between(z_min, z_max)

    textstr = '\n'.join((
        f'n={len(code_df)}',
        f'(min, max): ({code_df.value.min():0.2f}, {code_df.value.max():0.2f})',
        f'p(qin): {qin_mask.mean():0.3f}',
        f'p(zin): {zin_mask.mean():0.3f}',
        f'p(zin or qin): {(qin_mask | zin_mask).mean(): 0.3f}'))

    g = sns.displot(code_df, x="value", kde=True)

    # Vertical markers: solid lines for the z-score bounds, dashed for the quantile bounds.
    markers = [(x1, "solid", "red"),
               (x2, "solid", "red"),
               (q1, "dashed", "red"),
               (q2, "dashed", "red")]
    for ax in g.axes.flat:
        ax.set_title(C.desc[code])
        # place a text box in upper left in axes coords
        ax.text(0.05, 0.95, textstr, transform=ax.transAxes, fontsize=10,
                verticalalignment='top', bbox=props)
        for x, style, color in markers:
            ax.axvline(x=x, ymin=0, ymax=1.0, linestyle=style, color=color)
    g.savefig(f"obs_outlier__{code}_{C.desc[code]}.pdf", bbox_inches='tight')
    # Release the figure once it is saved; otherwise every figure created in
    # this loop stays open and memory grows with the number of codes.
    plt.close(g.figure)