In [1]:
import numpy as np
import pandas as pd
import json
import collections
from collections import defaultdict 
from functools import partial
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm


In [2]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

# Import the project-local `mimicnet.concept` module, then reload it so that
# edits made to its source are picked up without restarting the kernel.
import sys
import importlib
from mimicnet import concept

# The module is looked up in sys.modules under its fully qualified name.
importlib.reload(sys.modules['mimicnet.concept'])

In [3]:
# Filesystem locations used by this notebook (machine-specific absolute paths).
# multi_visit_mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
# Input directory: the pre-filtered *_2WKS tables are read from here.
multi_visit_mimic_dir = '/home/asem/GP/ehr-data/mimic3-multi-visit'
# Output directory: every transformed table in this notebook is written here.
transformed_mimic_dir = '/home/asem/GP/ehr-data/mimic3-transforms'
# mimic_dir = '/home/asem/GP/ehr-data/mimic3-v1.4/physionet.org/files/mimiciii/1.4'
# Raw MIMIC-III directory: the D_LABITEMS / D_ITEMS dictionaries are read from here.
mimic_dir = '/home/asem/GP/MIMIC-SNONET/RAW/mimic-iii-clinical-database-1.4'


In [4]:
# Item dictionaries from the raw MIMIC-III directory; used below to look up
# the LABEL and CATEGORY of each ITEMID.
D_LABITEMS = pd.read_csv(f'{mimic_dir}/D_LABITEMS.csv.gz')
D_ITEMS = pd.read_csv(f'{mimic_dir}/D_ITEMS.csv.gz')

In [5]:
# Preview the first rows of the lab-item dictionary.
D_LABITEMS.head()

In [6]:
# Preview the first rows of the chart-item dictionary.
D_ITEMS.head()

In [7]:
# Stack both item dictionaries row-wise; join='inner' keeps only the columns
# that the two tables have in common.
D_TEST = pd.concat(objs=[D_LABITEMS, D_ITEMS], axis=0, join='inner')

# ITEMID -> LABEL and ITEMID -> CATEGORY lookups. If an ITEMID appears more
# than once, the last occurrence wins.
test_label_dict = {
    item_id: label
    for item_id, label in zip(D_TEST['ITEMID'], D_TEST['LABEL'])
}
test_cat_dict = {
    item_id: category
    for item_id, category in zip(D_TEST['ITEMID'], D_TEST['CATEGORY'])
}

In [8]:
# Load the pre-filtered multi-visit tables.
PATIENTS = pd.read_csv(f'{multi_visit_mimic_dir}/PATIENTS_2WKS.csv.gz')
ADMISSIONS = pd.read_csv(f'{multi_visit_mimic_dir}/ADMISSIONS_2WKS.csv.gz')
# ICD9_CODE is forced to str so the codes are kept as text rather than parsed as numbers.
DIAGNOSES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/DIAGNOSES_ICD_2WKS.csv.gz', dtype = {'ICD9_CODE': str})
PROCEDURES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/PROCEDURES_ICD_2WKS.csv.gz', dtype = {'ICD9_CODE': str})
LABEVENTS = pd.read_csv(f'{multi_visit_mimic_dir}/LABEVENTS_Q5_UNITS_FIXED_2WKS.csv.gz')
CHARTEVENTS = pd.read_csv(f'{multi_visit_mimic_dir}/CHARTEVENTS_Q5_2WKS.csv.gz')

In [9]:
# Preview the lab events as loaded.
LABEVENTS.head()

In [10]:
# Preview the chart events as loaded.
CHARTEVENTS.head()

# Checklist

- (A) **Cast datetime to date**.
- (B) **Merge CHARTEVENTS and LABEVENTS**
- (C) **Remove outliers by IQR** (NOTE: this may leak information between training and testing, because the limits are computed on the full dataset. That is acceptable here, since the aim is only to assess the ability of neural ODEs to predict codes for next visits compared to GRAM methods. For a real clinical investigation, outlier removal should be fitted on the training subset only, and the resulting filtration limits then applied to the test subset.)
- (D) **Normalize to Z-scores**. (NOTE: see (C)).
- (E) **For repeated measurements in the same day, consider the average.**
    - Average is considered instead of median to be sensitive to edge (and sporadic) values which could be indicative of health conditions.
- (F) For DX/PR codes:
    1. Add to the middle day of the hospital admission.
    2. Add to all days of the hospital admission.


# (A) Cast datetime to date

## (A-1) PATIENTS table

In [11]:
# Preview PATIENTS before column selection and date casting.
PATIENTS.head()

In [12]:
# Keep only the columns used downstream and truncate DOB to midnight (date only).
# `.assign` builds a new frame instead of writing a column into a column-subset
# slice of the loaded table, which avoids pandas' SettingWithCopyWarning.
# `infer_datetime_format` is omitted: it is deprecated in pandas 2.0.
PATIENTS = PATIENTS[['SUBJECT_ID', 'GENDER', 'DOB']].assign(
    DOB=lambda df: pd.to_datetime(df['DOB']).dt.normalize()
)

In [13]:
# Preview PATIENTS after the DOB cast.
PATIENTS.head()

In [14]:
# Check the column dtypes after the DOB cast.
PATIENTS.dtypes

## (A-2) ADMISSIONS table

In [15]:
# Preview ADMISSIONS before column selection and date casting.
ADMISSIONS.head()

In [16]:
# Keep only the columns used downstream and truncate admission/discharge
# timestamps to midnight (date only).
# `.assign` builds a new frame instead of writing columns into a column-subset
# slice of the loaded table, which avoids pandas' SettingWithCopyWarning.
# `infer_datetime_format` is omitted: it is deprecated in pandas 2.0.
ADMISSIONS = ADMISSIONS[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'ETHNICITY', 'DIAGNOSIS', 'DAYS', 'MAX_DAYS']].assign(
    ADMITTIME=lambda df: pd.to_datetime(df['ADMITTIME']).dt.normalize(),
    DISCHTIME=lambda df: pd.to_datetime(df['DISCHTIME']).dt.normalize(),
)
ADMISSIONS.head()

In [17]:
# Check the column dtypes after the ADMITTIME / DISCHTIME cast.
ADMISSIONS.dtypes

## (A-3) DIAGNOSES  and PROCEDURES tables

In [18]:
# Preview DIAGNOSES_ICD as loaded.
DIAGNOSES_ICD.head()

In [19]:
# Keep only the subject / admission identifiers and the diagnosis code.
DIAGNOSES_ICD = DIAGNOSES_ICD.loc[:, ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]

In [20]:
# Preview PROCEDURES_ICD as loaded.
PROCEDURES_ICD.head()


In [21]:
# Keep only the subject / admission identifiers and the procedure code.
PROCEDURES_ICD = PROCEDURES_ICD.loc[:, ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]

## (A-4) LABEVENTS and CHARTEVENTS (parse CHARTTIME as datetime but keep the time of day; it is truncated to the date later, before the daily aggregation).

In [22]:
# Preview LABEVENTS before column selection and datetime parsing.
LABEVENTS.head()

In [23]:
# Keep only the measurement columns and parse CHARTTIME as a full timestamp
# (the time of day is deliberately kept at this stage).
# `.assign` builds a new frame instead of writing a column into a column-subset
# slice of the loaded table, which avoids pandas' SettingWithCopyWarning.
# `infer_datetime_format` is omitted: it is deprecated in pandas 2.0.
LABEVENTS = LABEVENTS[['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'VALUEUOM']].assign(
    CHARTTIME=lambda df: pd.to_datetime(df['CHARTTIME'])
)
LABEVENTS.head()

In [24]:
# Preview CHARTEVENTS before column selection and datetime parsing.
CHARTEVENTS.head()

In [25]:
# Keep only the measurement columns and parse CHARTTIME as a full timestamp
# (the time of day is deliberately kept at this stage).
# `.assign` builds a new frame instead of writing a column into a column-subset
# slice of the loaded table, which avoids pandas' SettingWithCopyWarning.
# `infer_datetime_format` is omitted: it is deprecated in pandas 2.0.
CHARTEVENTS = CHARTEVENTS[['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'VALUEUOM']].assign(
    CHARTTIME=lambda df: pd.to_datetime(df['CHARTTIME'])
)
CHARTEVENTS.head()

# (B) Concatenate LABEVENTS and CHARTEVENTS into TESTS

In [26]:
# Stack lab and chart measurements row-wise into one long table. join="inner"
# keeps only the columns present in both inputs. Row labels are not reset, so
# index values from the two inputs can repeat.
TESTS = pd.concat(
    objs=[LABEVENTS, CHARTEVENTS],
    axis=0,
    join="inner",
)

In [27]:
# Display the combined measurements table.
TESTS

In [28]:
# Sanity check: True only if every measurement has a numeric value.
TESTS['VALUENUM'].notna().all()

## (C) Remove outliers in TESTS using IQR

In [29]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

# Reload `mimicnet.concept` so edits to the module are picked up, then bind
# `Subject` from the reloaded module.
import sys
import importlib
from mimicnet import concept

importlib.reload(sys.modules['mimicnet.concept'])

# Imported after the reload so that this name refers to the refreshed class.
from mimicnet.concept import Subject

In [30]:
# (rows, columns) of the combined table before any filtering.
TESTS.shape

In [31]:
# Build the outlier limits from the full TESTS table (all subjects).
# Later cells read the result as a mapping ITEMID -> pair, taking element 0 as
# MIN and element 1 as MAX.
# NOTE(review): presumably IQR-based limits per ITEMID; the implementation
# lives in mimicnet.concept - confirm there.
iqr_filter = Subject.make_iqr_concept_filter(TESTS)

In [32]:
# Number of distinct ITEMID values in TESTS.
len(set(TESTS.ITEMID))

In [33]:
# Number of entries in the filter (to compare with the distinct ITEMID count).
len(iqr_filter)

In [34]:
# Count the filter entries whose lower and upper limits coincide.
sum(limits[0] == limits[1] for limits in iqr_filter.values())

In [35]:
# Tabulate the filter: one row per ITEMID with its label, category and limits.
# Items that are missing from the dictionaries get None for LABEL / CATEGORY.
iqr_filter_df = pd.DataFrame({
    'ITEMID': list(iqr_filter.keys()),
    'LABEL': [test_label_dict.get(item_id) for item_id in iqr_filter.keys()],
    'CATEGORY': [test_cat_dict.get(item_id) for item_id in iqr_filter.keys()],
    'MIN': [limits[0] for limits in iqr_filter.values()],
    'MAX': [limits[1] for limits in iqr_filter.values()],
})

In [36]:
# Rows whose limits collapse to a single value (MIN == MAX).
constant_rows = iqr_filter_df[iqr_filter_df['MAX'] == iqr_filter_df['MIN']]

# Export the full limits table and the constant-only subset to the working directory.
iqr_filter_df.to_csv('iqr_filter.csv')
constant_rows.to_csv('iqr_filter2.csv')

# Split the item ids into constant items and everything else.
constant_tests = set(constant_rows['ITEMID'])
variable_tests = set(iqr_filter_df['ITEMID']).difference(constant_tests)

In [37]:
# Drop measurements of items whose limits are constant (MIN == MAX).
TESTS_NO_CONSTANTS = TESTS.loc[TESTS['ITEMID'].isin(variable_tests)]

In [38]:
# (rows, columns) after removing the constant items.
TESTS_NO_CONSTANTS.shape

In [39]:
# Apply the limits to the non-constant items.
# NOTE(review): presumably drops rows whose VALUENUM falls outside the limits
# of its ITEMID - confirm in mimicnet.concept.
TESTS_FILTERED = Subject.apply_iqr_concept_filter(TESTS_NO_CONSTANTS, iqr_filter)

In [40]:
# (rows, columns) after applying the outlier filter.
TESTS_FILTERED.shape

In [41]:
# Number of distinct ITEMID values that remain after filtering.
len(set(TESTS_FILTERED.ITEMID))

In [43]:
# Persist the outlier-filtered measurements (gzip CSV, without the index column).
TESTS_FILTERED.to_csv(f'{transformed_mimic_dir}/TESTS_FILTERED.csv.gz', compression='gzip', index=False)


In [44]:
# Display the outlier-filtered measurements.
TESTS_FILTERED

## (E) Z-Score Normalization

In [45]:
# Build the z-score scaler from the outlier-filtered table (all subjects).
# NOTE(review): presumably per-ITEMID mean / standard deviation - confirm in
# mimicnet.concept. This uses the `Subject` binding made before the next
# cell reloads the module.
zscore_scaler = Subject.make_zscore_concept_scaler(TESTS_FILTERED)

In [46]:
# Reload `mimicnet.concept` again so later calls see the latest module source.
import sys
import importlib
from mimicnet import concept

importlib.reload(sys.modules['mimicnet.concept'])

# The package itself is imported so that later cells can address the class as
# mimicnet.concept.Subject.
import mimicnet

In [47]:
# (rows, columns) of the table about to be scaled.
TESTS_FILTERED.shape

In [48]:
# Shape of the VALUENUM column as a NumPy array.
TESTS_FILTERED['VALUENUM'].to_numpy().shape

In [49]:
# Apply the scaler. The class is addressed through the package path
# (mimicnet.concept.Subject) rather than the earlier `Subject` binding.
TESTS_FILTERED_ZSCORES = mimicnet.concept.Subject.apply_zscore_concept_scaler(TESTS_FILTERED, zscore_scaler)

In [50]:
# Display the scaled measurements.
TESTS_FILTERED_ZSCORES

In [51]:
# Persist the scaled measurements (gzip CSV, without the index column).
TESTS_FILTERED_ZSCORES.to_csv(f'{transformed_mimic_dir}/TESTS_FILTERED_ZSCORES.csv.gz', compression='gzip', index=False)

## (D) Merge repeated measurements for the same day by taking the average

In [52]:
# Work on a separate copy in which CHARTTIME is truncated to midnight, so that
# all measurements taken on the same calendar day share one timestamp.
TESTS_FILTERED_ZSCORES_DTNORMALIZED = TESTS_FILTERED_ZSCORES.assign(
    CHARTTIME=TESTS_FILTERED_ZSCORES['CHARTTIME'].dt.normalize()
)

In [53]:
# Daily aggregation: one record per (subject, calendar day, item) holding the
# mean and the median of that day's VALUENUM readings.
#
# A single vectorized groupby replaces three nested Python-level groupby loops,
# which created one small DataFrame per subject, per day and per item.
# Grouping on (SUBJECT_ID, CHARTTIME, ITEMID) with sort=True reproduces the
# row order of the nested loops (subject, then day, then item).
daily_stats = (
    TESTS_FILTERED_ZSCORES_DTNORMALIZED
    .groupby(['SUBJECT_ID', 'CHARTTIME', 'ITEMID'], sort=True)['VALUENUM']
    .agg(['mean', 'median'])
    .reset_index()
)

# Keep the original output contract: a list of
# (subject_id, item_id, day, mean, median) tuples.
tests_filtered_day_agg = list(
    daily_stats[['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'mean', 'median']]
    .itertuples(index=False, name=None)
)
            

In [54]:
# Turn the aggregated records into a table; the day column is named CHARTDAY here.
TESTS_FILTERED_ZSCORES_AGG_DAY = pd.DataFrame(tests_filtered_day_agg, columns=['SUBJECT_ID', 'ITEMID', 'CHARTDAY', 'MEAN', 'MEDIAN'])

In [75]:
# Display the per-day aggregated measurements.
TESTS_FILTERED_ZSCORES_AGG_DAY

In [None]:
# Persist the per-day aggregates together with the cleaned PATIENTS and
# ADMISSIONS tables (gzip CSV, without the index column).
# NOTE(review): this cell has no execution count in the saved notebook -
# confirm that these files were actually written.
TESTS_FILTERED_ZSCORES_AGG_DAY.to_csv(f'{transformed_mimic_dir}/TESTS_FILTERED_ZSCORES_AGG_DAY.csv.gz', compression='gzip', index=False)
PATIENTS.to_csv(f'{transformed_mimic_dir}/PATIENTS.csv.gz', compression='gzip', index=False)
ADMISSIONS.to_csv(f'{transformed_mimic_dir}/ADMISSIONS.csv.gz', compression='gzip', index=False)

In [56]:
# Persist the column-reduced diagnosis and procedure tables (gzip CSV, without the index column).
DIAGNOSES_ICD.to_csv(f'{transformed_mimic_dir}/DIAGNOSES_ICD.csv.gz', compression='gzip', index=False)
PROCEDURES_ICD.to_csv(f'{transformed_mimic_dir}/PROCEDURES_ICD.csv.gz', compression='gzip', index=False)

In [57]:
# Display the per-day aggregated measurements.
TESTS_FILTERED_ZSCORES_AGG_DAY

In [58]:
# Display the cleaned PATIENTS table.
PATIENTS

In [59]:
# Distinct values found in the GENDER column.
set(PATIENTS.GENDER)

## (F) Ethnicity normalization (grouping)

In [60]:
# Display the cleaned ADMISSIONS table.
ADMISSIONS

In [61]:
# Number of distinct raw ETHNICITY labels recorded in ADMISSIONS.
len(set(ADMISSIONS.ETHNICITY))

In [62]:
# Coarse ethnic groups (keys), each listing the raw ETHNICITY labels it absorbs.
ethnicity_group_d = {
    'AMERICAN INDIAN/ALASKA NATIVE': [
        'AMERICAN INDIAN/ALASKA NATIVE',
        'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER',
    ],
    'ASIAN': [
        'ASIAN',
        'ASIAN - ASIAN INDIAN',
        'ASIAN - CAMBODIAN',
        'ASIAN - CHINESE',
        'ASIAN - FILIPINO',
        'ASIAN - KOREAN',
        'ASIAN - OTHER',
        'ASIAN - THAI',
        'ASIAN - VIETNAMESE',
    ],
    'BLACK/AFRICAN': [
        'BLACK/AFRICAN',
        'BLACK/AFRICAN AMERICAN',
        'BLACK/CAPE VERDEAN',
        'BLACK/HAITIAN',
    ],
    'HISPANIC OR LATINO': [
        'HISPANIC OR LATINO',
        'CARIBBEAN ISLAND',
        'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)',
        'HISPANIC/LATINO - COLOMBIAN',
        'HISPANIC/LATINO - CUBAN',
        'HISPANIC/LATINO - DOMINICAN',
        'HISPANIC/LATINO - GUATEMALAN',
        'HISPANIC/LATINO - MEXICAN',
        'HISPANIC/LATINO - PUERTO RICAN',
        'HISPANIC/LATINO - SALVADORAN',
    ],
    'MIDDLE EASTERN': [
        'MIDDLE EASTERN',
    ],
    'WHITE': [
        'WHITE',
        'WHITE - BRAZILIAN',
        'WHITE - EASTERN EUROPEAN',
        'WHITE - OTHER EUROPEAN',
        'WHITE - RUSSIAN',
        'PORTUGUESE',
    ],
    'UNKNOWN': [
        'MULTI RACE ETHNICITY',
        'OTHER',
        'PATIENT DECLINED TO ANSWER',
        'UNABLE TO OBTAIN',
        'UNKNOWN/NOT SPECIFIED',
    ],
}

# 37 ethnicities mapped to 7 groups

In [63]:
# Invert the group -> labels mapping into a flat raw-label -> group lookup.
ethnicity_d = {}
for ethnic_group, ethnic_labels in ethnicity_group_d.items():
    for eth_label in ethnic_labels:
        # Each raw label must belong to exactly one group. The message now
        # interpolates the second group's name; it used to print the literal
        # text "ethnic_group".
        assert eth_label not in ethnicity_d, f"{eth_label} is assigned to multiple groups: {ethnicity_d[eth_label]} and {ethnic_group}."

        ethnicity_d[eth_label] = ethnic_group

In [64]:
# Number of raw labels covered by the label -> group lookup.
len(ethnicity_d)

- When a patient has admissions with different ethnicities recorded:

1. map the recorded ethnicities to their groups (using the grouping above) and collect the groups into a set;
2. if the set contains at least one group besides 'UNKNOWN', remove 'UNKNOWN';
3. if the set still contains more than one group, assign 'UNKNOWN'.


In [65]:
# Resolve one ethnic group per subject from all of that subject's admissions.
subject_ethnicity_d = {}
for subject_id, subject_df in ADMISSIONS.groupby('SUBJECT_ID'):
    # Map each recorded label to its group. A label that is missing from the
    # lookup (or a NaN entry) falls back to 'UNKNOWN' instead of None, so None
    # can never be stored as a subject's group.
    ethnicity = {ethnicity_d.get(label, 'UNKNOWN') for label in subject_df['ETHNICITY']}

    # 'UNKNOWN' only counts when nothing more specific was recorded.
    if len(ethnicity) > 1:
        ethnicity.discard('UNKNOWN')

    # Exactly one remaining group is the answer; conflicting groups give 'UNKNOWN'.
    subject_ethnicity_d[subject_id] = ethnicity.pop() if len(ethnicity) == 1 else 'UNKNOWN'

In [66]:
# Number of subjects that received an ethnic group.
len(subject_ethnicity_d)

## (G) Static attributes table

In [67]:
# Static (time-invariant) attributes: PATIENTS plus the resolved ethnic group.
# `.assign` returns a new frame, so PATIENTS itself is left unchanged. Subjects
# that are absent from the mapping get NaN.
static_df = PATIENTS.assign(
    ETHNIC_GROUP=PATIENTS['SUBJECT_ID'].map(subject_ethnicity_d)
)

In [68]:
# Display the static attributes table.
static_df

In [69]:
# Persist the static attributes table (gzip CSV, without the index column).
static_df.to_csv(f'{transformed_mimic_dir}/static_df.csv.gz', compression='gzip', index=False)


## (H) Finalize admissions table

In [70]:
# Final admissions table: identifiers plus admission and discharge dates only.
adm_df = ADMISSIONS.loc[:, ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME']]
adm_df

In [71]:
# Persist the final admissions table (gzip CSV, without the index column).
adm_df.to_csv(f'{transformed_mimic_dir}/adm_df.csv.gz', compression='gzip', index=False)


## (I) Finalize tests table

In [72]:
# Final tests table: keep the daily mean and leave out the MEDIAN column.
test_df = TESTS_FILTERED_ZSCORES_AGG_DAY.loc[:, ['SUBJECT_ID', 'ITEMID', 'CHARTDAY', 'MEAN']]

In [73]:
# Give the day and value columns their final names; the other two are unchanged.
test_df = test_df.rename(columns={'CHARTDAY': 'DATE', 'MEAN': 'VALUE'})
test_df

In [74]:
# Persist the final tests table (gzip CSV, without the index column).
test_df.to_csv(f'{transformed_mimic_dir}/test_df.csv.gz', compression='gzip', index=False)


## (J) Finalize PROCEDURES/DIAGNOSES tables

In [75]:
# Keep only the diagnosis rows that actually carry an ICD-9 code.
diag_df = DIAGNOSES_ICD.dropna(subset=['ICD9_CODE'])
diag_df

In [76]:
# Keep only the procedure rows that actually carry an ICD-9 code.
proc_df = PROCEDURES_ICD.dropna(subset=['ICD9_CODE'])
proc_df

### Remove duplicate codes for the same patient for the same admission

In [77]:
# Collapse repeated rows and renumber the index from zero.
diag_df = diag_df.drop_duplicates().reset_index(drop=True)
diag_df

In [78]:
# Collapse repeated rows and renumber the index from zero.
proc_df = proc_df.drop_duplicates().reset_index(drop=True)
proc_df

In [79]:
# Persist the final diagnosis and procedure tables (gzip CSV, without the index column).
diag_df.to_csv(f'{transformed_mimic_dir}/diag_df.csv.gz', compression='gzip', index=False)
proc_df.to_csv(f'{transformed_mimic_dir}/proc_df.csv.gz', compression='gzip', index=False)


In [80]:
# Display the final diagnosis table.
diag_df

In [81]:
# Display the final procedure table.
proc_df

In [5]:
# Reload the saved diagnosis table. ICD9_CODE is read as str, matching the
# original DIAGNOSES_ICD read, so the codes stay text and are not parsed as
# numbers (which would strip leading zeros from all-digit codes).
diag_df = pd.read_csv(f'{transformed_mimic_dir}/diag_df.csv.gz', dtype={'ICD9_CODE': str})


In [6]:
# Average number of diagnosis rows per distinct admission.
n_diag_rows = len(diag_df)
n_admissions = diag_df['HADM_ID'].nunique()
print('Avg diag. ICD9 codes per admission=', n_diag_rows / n_admissions)