In [1]:
import numpy as np
import pandas as pd
import json
import collections
from collections import defaultdict 
from functools import partial
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm


In [2]:
# NOTE(review): these are absolute paths on the author's machine; adjust them
# before running the notebook elsewhere.
# Folder the `*_2wks.csv.gz` input tables are read from.
multi_visit_mimic_dir = '/home/asem/GP/ehr-data/mimic4-multi-visit'
# Folder the transformed tables produced by this notebook are written to.
transformed_mimic_dir = '/home/asem/GP/ehr-data/mimic4-transforms'
# Raw MIMIC-IV v1.0 files (not referenced in the visible part of the notebook).
mimic_dir = '/home/asem/GP/ehr-data/mimic4-v1.0/physionet.org/files/mimiciv/1.0'

In [3]:
# ICD codes are read as text so that codes with leading zeros are not
# turned into numbers.
icd_code_as_text = {'icd_code': str}

# Demographic and admission tables.
patients = pd.read_csv(f'{multi_visit_mimic_dir}/patients_2wks.csv.gz')
admissions = pd.read_csv(f'{multi_visit_mimic_dir}/admissions_2wks.csv.gz')

# Coded diagnoses and procedures attached to each admission.
diagnoses_icd = pd.read_csv(
    f'{multi_visit_mimic_dir}/diagnoses_icd_2wks.csv.gz',
    dtype=icd_code_as_text,
)
procedures_icd = pd.read_csv(
    f'{multi_visit_mimic_dir}/procedures_icd_2wks.csv.gz',
    dtype=icd_code_as_text,
)

# Checklist

- (A) **Cast datetime to date**.
- (B) **Merge CHARTEVENTS and LABEVENTS**
- (C) **Remove outliers by IQR** (NOTE: this may leak information between the training and testing subsets. The aim here is only to assess the ability of neural ODEs to predict the codes of subsequent visits compared to GRAM methods. For a real clinical investigation, outlier removal should be fitted on the training subset only, and the resulting filtering limits then applied to the test subset.)
- (D) **Normalize to Z-scores**. (NOTE: see (C)).
- (E) **For repeated measurements in the same day, consider the average.**
    - The average is used instead of the median so that the aggregate stays sensitive to extreme (and sporadic) values, which could be indicative of health conditions.
- (F) For DX/PR codes:
    1. Add to the middle day of the H. Adm.
    2. Add to all days of the H.Adm.


# (A) Cast datetime to date

## (A-1) PATIENTS table

In [4]:
patients.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,anchor_year_group,dod
0,10002723,F,0,2128,2017 - 2019,
1,10018928,F,31,2125,2008 - 2010,
2,10124191,F,27,2125,2011 - 2013,
3,10156486,F,75,2124,2017 - 2019,
4,10159585,M,59,2146,2008 - 2010,2154-04-16


In [5]:
patients.dtypes

subject_id            int64
gender               object
anchor_age            int64
anchor_year           int64
anchor_year_group    object
dod                  object
dtype: object

In [6]:
# Keep only the demographic columns needed downstream. `.copy()` turns the
# selection into an independent frame, so adding `dob` below does not write
# into a slice of the loaded table (avoids pandas' SettingWithCopyWarning).
patients = patients[['subject_id', 'gender', 'anchor_age', 'anchor_year']].copy()

# Approximate date of birth: January 1st of (anchor_year - anchor_age).
# The year is cast to text explicitly so `format='%Y'` parses a string
# instead of relying on an implicit integer-to-string conversion.
birth_year = patients['anchor_year'] - patients['anchor_age']
patients['dob'] = pd.to_datetime(birth_year.astype(str), format='%Y').dt.normalize()

In [7]:
patients.head()

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,dob
0,10002723,F,0,2128,2128-01-01
1,10018928,F,31,2125,2094-01-01
2,10124191,F,27,2125,2098-01-01
3,10156486,F,75,2124,2049-01-01
4,10159585,M,59,2146,2087-01-01


In [8]:
patients.dtypes

subject_id              int64
gender                 object
anchor_age              int64
anchor_year             int64
dob            datetime64[ns]
dtype: object

## (A-2) ADMISSIONS table

In [9]:
admissions.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,deathtime,admission_type,admission_location,discharge_location,insurance,language,marital_status,ethnicity,edregtime,edouttime,hospital_expire_flag,days,max_days
0,10292548,26653546,2120-01-07 05:51:00,2120-01-12 13:45:00,,URGENT,PHYSICIAN REFERRAL,HOME,Other,ENGLISH,MARRIED,ASIAN,,,0,5,5
1,11735820,24560424,2151-10-24 20:32:00,2151-10-25 12:25:00,,EU OBSERVATION,EMERGENCY ROOM,,Medicaid,?,MARRIED,HISPANIC/LATINO,2151-10-24 13:45:00,2151-10-25 12:25:00,0,1,1
2,16261811,26233676,2145-12-08 18:41:00,2145-12-09 19:40:00,,EU OBSERVATION,EMERGENCY ROOM,,Medicare,ENGLISH,SINGLE,WHITE,2145-12-08 14:44:00,2145-12-08 19:48:00,0,1,3
3,12988422,25192155,2132-05-24 07:10:00,2132-05-24 13:50:00,,EU OBSERVATION,EMERGENCY ROOM,,Medicare,ENGLISH,SINGLE,WHITE,2132-05-23 22:09:00,2132-05-24 13:50:00,0,0,1
4,10945838,20090853,2166-05-29 11:44:00,2166-05-30 12:00:00,,DIRECT EMER.,PHYSICIAN REFERRAL,HOME,Other,?,,WHITE,,,0,1,3


In [10]:
# Upper-case the column names, then keep the five columns used downstream.
# Renaming first makes the cell idempotent: on a re-run the names are already
# upper-case, `str.upper` is a no-op and the selection still succeeds (the
# previous select-then-overwrite-columns form raised KeyError on re-run).
# `.copy()` detaches the result from the loaded table so later cells can
# modify it without a SettingWithCopyWarning.
admissions = (
    admissions
    .rename(columns=str.upper)
    .loc[:, ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ETHNICITY']]
    .copy()
)

In [11]:
# Parse the admission/discharge timestamps and truncate them to midnight,
# i.e. keep the calendar date only.
# - `infer_datetime_format` is dropped: it is deprecated in pandas 2.x and is
#   not needed to parse these timestamps.
# - `assign` builds a new frame instead of writing attributes onto a frame
#   that may be a slice of the loaded table.
admissions = admissions.assign(
    ADMITTIME=pd.to_datetime(admissions['ADMITTIME']).dt.normalize(),
    DISCHTIME=pd.to_datetime(admissions['DISCHTIME']).dt.normalize(),
)
admissions.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ETHNICITY
0,10292548,26653546,2120-01-07,2120-01-12,ASIAN
1,11735820,24560424,2151-10-24,2151-10-25,HISPANIC/LATINO
2,16261811,26233676,2145-12-08,2145-12-09,WHITE
3,12988422,25192155,2132-05-24,2132-05-24,WHITE
4,10945838,20090853,2166-05-29,2166-05-30,WHITE


In [12]:
admissions.dtypes

SUBJECT_ID             int64
HADM_ID                int64
ADMITTIME     datetime64[ns]
DISCHTIME     datetime64[ns]
ETHNICITY             object
dtype: object

## (A-3) DIAGNOSES  and PROCEDURES tables

In [13]:
diagnoses_icd.head()

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version
0,15734973,20475282,3,2825,9
1,15734973,20475282,2,V0251,9
2,15734973,20475282,5,V270,9
3,15734973,20475282,1,64891,9
4,15734973,20475282,4,66481,9


In [14]:
diagnoses_icd = diagnoses_icd[['subject_id', 'hadm_id', 'icd_code', 'icd_version']]

In [15]:
procedures_icd.head()


Unnamed: 0,subject_id,hadm_id,seq_num,chartdate,icd_code,icd_version
0,10287061,27485182,1,2124-08-22,7569,9
1,13135573,22286210,3,2187-05-30,7309,9
2,13135573,22286210,1,2187-05-30,7569,9
3,13135573,22286210,2,2187-05-30,734,9
4,16902387,29170406,4,2139-12-07,7359,9


In [16]:
procedures_icd = procedures_icd[['subject_id', 'hadm_id', 'icd_code', 'icd_version']]

## Convert ICD-10 to ICD-9

In [17]:
icd_conv = pd.read_csv('icd10toicd9gem.csv', dtype = {'icd9cm': str, 'icd10cm': str})
icd_conv.head()


Unnamed: 0,icd10cm,icd9cm,flags,approximate,no_map,combination,scenario,choice_list
0,16070,231,10000,1,0,0,0,0
1,16071,231,10000,1,0,0,0,0
2,16072,232,10000,1,0,0,0,0
3,16073,232,10000,1,0,0,0,0
4,16074,233,10000,1,0,0,0,0


In [18]:
(icd_conv.no_map == 0).mean()

0.9958552497072654

In [19]:
# Map every ICD-10 code to the set of ICD-9 codes it converts to, using only
# the rows of the conversion table flagged as having a mapping (no_map == 0).
# `defaultdict` is already imported in the notebook's first cell, so the
# redundant in-cell import was removed.
mapped_codes = icd_conv[icd_conv.no_map == 0]

icd_conv_dict = defaultdict(set)
# Iterating over the two needed columns avoids materialising a full named
# tuple per row, as `itertuples()` would.
for icd10_code, icd9_code in zip(mapped_codes.icd10cm, mapped_codes.icd9cm):
    icd_conv_dict[icd10_code].add(icd9_code)

In [20]:
set(map(len, icd_conv_dict.values()))

{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}

In [21]:
diagnoses_icd_10 = diagnoses_icd[diagnoses_icd.icd_version == 10]
procedures_icd_10 = procedures_icd[procedures_icd.icd_version == 10]


In [22]:
def _icd10_rows_to_icd9(icd10_df, conv_dict):
    """Expand ICD-10 coded rows into their ICD-9 equivalents.

    Args:
        icd10_df: frame with `subject_id`, `hadm_id` and `icd_code` columns,
            where `icd_code` holds ICD-10 codes.
        conv_dict: mapping from an ICD-10 code to a collection of ICD-9 codes.

    Returns:
        A dict of equally long lists keyed by `subject_id`, `hadm_id`,
        `icd_code` and `icd_version`, with one entry per (input row, ICD-9
        target) pair. Rows whose code is absent from `conv_dict` contribute
        nothing.
    """
    converted = {'subject_id': [], 'hadm_id': [], 'icd_code': [], 'icd_version': []}
    rows = zip(icd10_df.subject_id, icd10_df.hadm_id, icd10_df.icd_code)
    for subject_id, hadm_id, icd10_code in rows:
        # Sets iterate in a hash-dependent order; sorting keeps the row order
        # of the generated table reproducible across kernel restarts.
        for icd9_code in sorted(conv_dict.get(icd10_code, ()), key=str):
            converted['subject_id'].append(subject_id)
            converted['hadm_id'].append(hadm_id)
            converted['icd_code'].append(icd9_code)
            converted['icd_version'].append(9)
    return converted


diagnoses_icd9_converted = _icd10_rows_to_icd9(diagnoses_icd_10, icd_conv_dict)
# NOTE(review): the lookup table is keyed by the `icd10cm` column of the
# conversion file; confirm that file also covers ICD-10 procedure codes.
procedures_icd9_converted = _icd10_rows_to_icd9(procedures_icd_10, icd_conv_dict)

In [23]:
# Turn the column-wise dicts of lists built in the previous cell into
# DataFrames. Note that both names are rebound from dict to DataFrame here.
diagnoses_icd9_converted = pd.DataFrame(diagnoses_icd9_converted)
procedures_icd9_converted = pd.DataFrame(procedures_icd9_converted)


In [24]:
# Number of ICD-9 rows generated per ICD-10 input row (above 1 means that,
# on average, a code expands into several ICD-9 codes).
diag_expansion_ratio = len(diagnoses_icd9_converted) / len(diagnoses_icd_10)
print('diag 10->9 expansion ratio:', diag_expansion_ratio)

proc_expansion_ratio = len(procedures_icd9_converted) / len(procedures_icd_10)
print('proc 10->9 expansion ratio:', proc_expansion_ratio)


diag 10->9 expansion ratio: 1.2668582827560269
proc 10->9 expansion ratio: 1.473002380490197


In [25]:
# Rows that were recorded in ICD-9 in the first place.
diagnoses_icd9_original = diagnoses_icd[diagnoses_icd.icd_version == 9]
procedures_icd9_original = procedures_icd[procedures_icd.icd_version == 9]

print('|diag_icd9_original|=', len(diagnoses_icd9_original))
print('|proc_icd9_original|=', len(procedures_icd9_original))

print('|diag_icd9_converted|=', len(diagnoses_icd9_converted))
print('|proc_icd9_converted|=', len(procedures_icd9_converted))


# Stack the original ICD-9 rows with the rows converted from ICD-10.
# `pd.concat` replaces `DataFrame.append`, which was deprecated in pandas 1.4
# and removed in pandas 2.0. `ignore_index=True` gives the combined table a
# fresh 0..n-1 index instead of repeated labels from the two inputs.
diagnoses_icd9 = pd.concat(
    [diagnoses_icd9_original, diagnoses_icd9_converted], ignore_index=True
)
procedures_icd9 = pd.concat(
    [procedures_icd9_original, procedures_icd9_converted], ignore_index=True
)

print('|diag_icd9_total|=', len(diagnoses_icd9))
print('|proc_icd9_total|=', len(procedures_icd9))


|diag_icd9_original|= 1670245
|proc_icd9_original|= 243161
|diag_icd9_converted|= 1374298
|proc_icd9_converted|= 144176
|diag_icd9_total|= 3044543
|proc_icd9_total|= 387337


In [26]:
# Persist the ICD-9 tables (original plus converted rows) as gzip-compressed
# CSV files, without the row index.
diagnoses_icd9.to_csv(f'{transformed_mimic_dir}/diagnoses_icd9.csv.gz', compression='gzip', index=False)
procedures_icd9.to_csv(f'{transformed_mimic_dir}/procedures_icd9.csv.gz', compression='gzip', index=False)

In [27]:
patients

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,dob
0,10002723,F,0,2128,2128-01-01
1,10018928,F,31,2125,2094-01-01
2,10124191,F,27,2125,2098-01-01
3,10156486,F,75,2124,2049-01-01
4,10159585,M,59,2146,2087-01-01
...,...,...,...,...,...
72620,19996968,M,32,2125,2093-01-01
72621,19997062,M,50,2115,2065-01-01
72622,19997448,F,52,2121,2069-01-01
72623,19997887,F,57,2112,2055-01-01


In [28]:
set(patients.gender)

{'F', 'M'}

## (F) Ethnicity normalization (grouping)

In [29]:
admissions

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ETHNICITY
0,10292548,26653546,2120-01-07,2120-01-12,ASIAN
1,11735820,24560424,2151-10-24,2151-10-25,HISPANIC/LATINO
2,16261811,26233676,2145-12-08,2145-12-09,WHITE
3,12988422,25192155,2132-05-24,2132-05-24,WHITE
4,10945838,20090853,2166-05-29,2166-05-30,WHITE
...,...,...,...,...,...
265632,19748558,29331244,2163-03-20,2163-03-23,WHITE
265633,18190942,20823019,2153-04-28,2153-05-01,WHITE
265634,17137572,20943099,2147-08-01,2147-08-02,HISPANIC/LATINO
265635,12298845,22347500,2138-05-31,2138-06-04,WHITE


In [30]:
set(admissions.ETHNICITY)

{'AMERICAN INDIAN/ALASKA NATIVE',
 'ASIAN',
 'BLACK/AFRICAN AMERICAN',
 'HISPANIC/LATINO',
 'OTHER',
 'UNABLE TO OBTAIN',
 'UNKNOWN',
 'WHITE'}

In [31]:
# Ethnic group -> list of raw ETHNICITY labels that belong to that group.
# The non-informative labels ('OTHER', 'UNABLE TO OBTAIN', 'UNKNOWN') are all
# folded into the single 'UNKNOWN' group.
ethnicity_group_d = {
    'AMERICAN INDIAN/ALASKA NATIVE': ['AMERICAN INDIAN/ALASKA NATIVE'],
    'ASIAN': ['ASIAN'],
    'BLACK/AFRICAN': ['BLACK/AFRICAN AMERICAN'],
    'HISPANIC OR LATINO': ['HISPANIC/LATINO'],
    'WHITE': ['WHITE'],
    'UNKNOWN': ['OTHER', 'UNABLE TO OBTAIN', 'UNKNOWN']
}

# 8 ethnicities mapped to 6 groups

In [32]:
# Invert `ethnicity_group_d`: raw ETHNICITY label -> ethnic group.
ethnicity_d = {}
for ethnic_group, ethnic_labels in ethnicity_group_d.items():
    for eth_label in ethnic_labels:
        # Each raw label may belong to one group only. The message now
        # interpolates the conflicting group; it used to print the literal
        # text "ethnic_group" because the braces were missing.
        assert eth_label not in ethnicity_d, (
            f"{eth_label} is assigned to multiple groups: "
            f"{ethnicity_d[eth_label]} and {ethnic_group}."
        )

        ethnicity_d[eth_label] = ethnic_group

In [33]:
len(ethnicity_d)

8

- When a patient has admissions with different recorded ethnicities:

1. map the recorded ethnicities to groups (using the grouping above) and collect them into a set
2. if the set has at least one group in addition to 'UNKNOWN', remove 'UNKNOWN'
3. if the set still has more than one group, assign 'UNKNOWN'


In [34]:
# Resolve one ethnic group per subject from all of their admissions.
subject_ethnicity_d = {}
for subject_id, subject_df in admissions.groupby('SUBJECT_ID'):
    # Labels missing from `ethnicity_d` (e.g. an empty ETHNICITY field) fall
    # back to 'UNKNOWN' instead of silently becoming None.
    ethnic_groups = {
        ethnicity_d.get(label, 'UNKNOWN') for label in subject_df.ETHNICITY
    }
    # 'UNKNOWN' carries no information, so it never competes with a known
    # group; it is also the result when nothing else is left.
    ethnic_groups.discard('UNKNOWN')
    # Exactly one known group -> use it. None, or several conflicting
    # groups -> 'UNKNOWN'.
    if len(ethnic_groups) == 1:
        subject_ethnicity_d[subject_id] = ethnic_groups.pop()
    else:
        subject_ethnicity_d[subject_id] = 'UNKNOWN'

In [35]:
len(subject_ethnicity_d)

72625

## (G) Static attributes table

In [36]:
# Look up each patient's resolved ethnic group, then attach it to an
# independent deep copy of the patients table.
ethnic_group_per_patient = patients['subject_id'].map(subject_ethnicity_d)

static_df = patients.copy(deep=True)
static_df['ETHNIC_GROUP'] = ethnic_group_per_patient

In [37]:
static_df

Unnamed: 0,subject_id,gender,anchor_age,anchor_year,dob,ETHNIC_GROUP
0,10002723,F,0,2128,2128-01-01,WHITE
1,10018928,F,31,2125,2094-01-01,BLACK/AFRICAN
2,10124191,F,27,2125,2098-01-01,BLACK/AFRICAN
3,10156486,F,75,2124,2049-01-01,WHITE
4,10159585,M,59,2146,2087-01-01,BLACK/AFRICAN
...,...,...,...,...,...,...
72620,19996968,M,32,2125,2093-01-01,BLACK/AFRICAN
72621,19997062,M,50,2115,2065-01-01,WHITE
72622,19997448,F,52,2121,2069-01-01,WHITE
72623,19997887,F,57,2112,2055-01-01,WHITE


In [38]:
# Rename to the upper-case schema, then keep the static attributes only.
# `rename` ignores keys that are no longer present, so the cell can be re-run
# safely (the previous select-then-overwrite-columns form raised KeyError on
# the second run). `.copy()` detaches the result from the wider frame.
static_df = (
    static_df
    .rename(columns={'subject_id': 'SUBJECT_ID', 'gender': 'GENDER', 'dob': 'DOB'})
    .loc[:, ['SUBJECT_ID', 'GENDER', 'DOB', 'ETHNIC_GROUP']]
    .copy()
)

In [39]:
# Persist the static (per-patient) attributes as a gzip-compressed CSV,
# without the row index.
static_df.to_csv(f'{transformed_mimic_dir}/static_df.csv.gz', compression='gzip', index=False)


## (H) Statistics of admissions table

In [45]:

# SUBJECT_ID -> that patient's admission dates in ascending order.
patient_ordered_admissions = {}

for patient_id, patient_df in admissions.groupby('SUBJECT_ID'):
    patient_ordered_admissions[patient_id] = list(
        patient_df['ADMITTIME'].sort_values()
    )

In [46]:
# Gaps, in weeks, between consecutive admissions: per patient and pooled.
patient_intervals = defaultdict(list)
all_intervals = []
for patient_id, admittimes in patient_ordered_admissions.items():
    # Pair every admission with the one that follows it.
    intervals = [
        (later - earlier).days / 7
        for earlier, later in zip(admittimes, admittimes[1:])
    ]
    patient_intervals[patient_id] = intervals
    all_intervals.extend(intervals)

# Summary statistics of all gaps, pooled over patients.
pd.DataFrame({'intervals(weeks)': all_intervals}).describe()

Unnamed: 0,intervals(weeks)
count,193012.0
mean,53.765978
std,85.753985
min,0.0
25%,3.285714
50%,15.142857
75%,65.857143
max,618.714286


## (J) Finalize PROCEDURES/DIAGNOSES tables

In [165]:
# Final schema for diagnoses: upper-case keys plus the ICD-9 code, ordered by
# patient and admission. Renaming through a mapping (instead of overwriting
# `.columns` after a lower-case selection) keeps the cell re-runnable: on a
# second run the rename is a no-op and the selection still succeeds.
diagnoses_icd9 = (
    diagnoses_icd9
    .rename(columns={'subject_id': 'SUBJECT_ID', 'hadm_id': 'HADM_ID', 'icd_code': 'ICD9_CODE'})
    .loc[:, ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]
    .sort_values(by=['SUBJECT_ID', 'HADM_ID'])
    .reset_index(drop=True)
)

In [166]:
# Keep only the diagnosis rows that actually carry an ICD-9 code.
diag_has_code = diagnoses_icd9['ICD9_CODE'].notna()
diag_df = diagnoses_icd9.loc[diag_has_code]
diag_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,10000032,22595853,496
1,10000032,22595853,07070
2,10000032,22595853,29680
3,10000032,22595853,V1582
4,10000032,22595853,5715
...,...,...,...
3044538,19999840,26071774,43820
3044539,19999840,26071774,4019
3044540,19999840,26071774,43811
3044541,19999840,26071774,2724


In [167]:
# Final schema for procedures: upper-case keys plus the ICD-9 code, ordered
# by patient and admission. Renaming through a mapping (instead of
# overwriting `.columns` after a lower-case selection) keeps the cell
# re-runnable: on a second run the rename is a no-op and the selection still
# succeeds.
procedures_icd9 = (
    procedures_icd9
    .rename(columns={'subject_id': 'SUBJECT_ID', 'hadm_id': 'HADM_ID', 'icd_code': 'ICD9_CODE'})
    .loc[:, ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]
    .sort_values(by=['SUBJECT_ID', 'HADM_ID'])
    .reset_index(drop=True)
)

In [168]:
# Keep only the procedure rows that actually carry an ICD-9 code.
proc_has_code = procedures_icd9['ICD9_CODE'].notna()
proc_df = procedures_icd9.loc[proc_has_code]
proc_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,10000032,22595853,5491
1,10000032,22841357,5491
2,10000032,25742920,5491
3,10000117,27988844,7915
4,10000730,24810836,9983
...,...,...,...
387332,19999840,21033226,9604
387333,19999840,21033226,0331
387334,19999840,21033226,9672
387335,19999840,26071774,8841


### Remove duplicate codes for the same patient for the same admission

In [169]:
# Drop fully identical (SUBJECT_ID, HADM_ID, ICD9_CODE) rows and renumber the
# remaining rows from zero.
diag_df = diag_df.drop_duplicates().reset_index(drop=True)
diag_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,10000032,22595853,496
1,10000032,22595853,07070
2,10000032,22595853,29680
3,10000032,22595853,V1582
4,10000032,22595853,5715
...,...,...,...
3030907,19999840,26071774,43820
3030908,19999840,26071774,4019
3030909,19999840,26071774,43811
3030910,19999840,26071774,2724


In [170]:
# Drop fully identical (SUBJECT_ID, HADM_ID, ICD9_CODE) rows and renumber the
# remaining rows from zero.
proc_df = proc_df.drop_duplicates().reset_index(drop=True)
proc_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,10000032,22595853,5491
1,10000032,22841357,5491
2,10000032,25742920,5491
3,10000117,27988844,7915
4,10000730,24810836,9983
...,...,...,...
367553,19999840,21033226,966
367554,19999840,21033226,9604
367555,19999840,21033226,9672
367556,19999840,26071774,8841


In [171]:
# Persist the de-duplicated diagnosis and procedure tables as gzip-compressed
# CSV files, without the row index.
diag_df.to_csv(f'{transformed_mimic_dir}/diag_df.csv.gz', compression='gzip', index=False)
proc_df.to_csv(f'{transformed_mimic_dir}/proc_df.csv.gz', compression='gzip', index=False)


In [172]:
diag_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,10000032,22595853,496
1,10000032,22595853,07070
2,10000032,22595853,29680
3,10000032,22595853,V1582
4,10000032,22595853,5715
...,...,...,...
3030907,19999840,26071774,43820
3030908,19999840,26071774,4019
3030909,19999840,26071774,43811
3030910,19999840,26071774,2724


In [173]:
proc_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,10000032,22595853,5491
1,10000032,22841357,5491
2,10000032,25742920,5491
3,10000117,27988844,7915
4,10000730,24810836,9983
...,...,...,...
367553,19999840,21033226,966
367554,19999840,21033226,9604
367555,19999840,21033226,9672
367556,19999840,26071774,8841


In [174]:
print('avg. icd9 codes per admission', len(diag_df)/diag_df['HADM_ID'].nunique())

avg. icd9 codes per admission 11.460281089420013


In [175]:
len(set(diag_df.HADM_ID))

264471

In [176]:
len(set(admissions.HADM_ID))

265637

In [177]:
len(set(diag_df.HADM_ID) - set(admissions.HADM_ID))

0

In [179]:
import os, sys
# Absolute path of the parent of the current working directory; it is added
# to the import path so the project-local `mimicnet` package can be imported.
parent_dir = os.path.abspath('..')
# the parent_dir could already be there if the kernel was not restarted,
# and we run this cell again
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
# Must come after the sys.path tweak above.
# NOTE(review): `concept` is not used in the visible part of the notebook.
from mimicnet.mimic3 import concept, dag

# NOTE(review): presumably builds the CCS/ICD-9 code hierarchy; confirm in
# `mimicnet.mimic3.dag`.
KG = dag.CCSDAG()
# HADM_ID -> number of distinct CCS / ICD-9 diagnosis codes in that admission
# (filled in by the following cell).
admission_n_ccs_codes = {}
admission_n_icd_codes = {}

In [186]:
# Count the distinct ICD-9 diagnosis codes of every admission and the
# distinct CCS codes they translate to.
for admission_id, diag_adm_df in diag_df.groupby('HADM_ID'):
    icd_codes = set(diag_adm_df.ICD9_CODE)
    # NOTE(review): an ICD-9 code missing from `KG.diag_icd2ccs` is looked up
    # as None, and None is then counted as one CCS code; confirm this is
    # intended.
    ccs_of = KG.diag_icd2ccs.get
    ccs_codes = {ccs_of(icd_code) for icd_code in icd_codes}
    admission_n_icd_codes[admission_id] = len(icd_codes)
    admission_n_ccs_codes[admission_id] = len(ccs_codes)

In [188]:
# One row per admission (indexed by HADM_ID) holding its CCS and ICD-9 code
# counts. Both dicts were filled in the same loop, so their entries line up.
code_counts = {
    'CCS': list(admission_n_ccs_codes.values()),
    'ICD': list(admission_n_icd_codes.values()),
}
admission_n_codes = pd.DataFrame(
    data=code_counts,
    index=list(admission_n_ccs_codes.keys()),
)

In [190]:
admission_n_codes.describe()

Unnamed: 0,CCS,ICD
count,264471.0,264471.0
mean,9.750048,11.460281
std,5.984254,7.553626
min,1.0,1.0
25%,5.0,6.0
50%,9.0,10.0
75%,13.0,16.0
max,43.0,61.0


In [185]:
dir(KG)

['CCS_DIR',
 'DIAG_MULTI_CCS_FILE',
 'DIAG_SINGLE_CCS_FILE',
 'DIR',
 'PROC_MULTI_CCS_FILE',
 'PROC_SINGLE_CCS_FILE',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'ancestors_linkage',
 'common_parent',
 'diag_ccs2icd',
 'diag_ccs_children_traversal',
 'diag_ccs_codes',
 'diag_ccs_df',
 'diag_ccs_pt2ch',
 'diag_flatccs2icd',
 'diag_flatccs_codes',
 'diag_flatccs_df',
 'diag_icd2ccs',
 'diag_icd2flatccs',
 'diag_icd_codes',
 'diag_icd_label',
 'find_diag_icd_name',
 'find_proc_icd_name',
 'get_ccs_parents',
 'get_diag_ccs',
 'get_diag_ccs_children',
 'get_proc_ccs',
 'get_proc_ccs_children',
 'make_diag_icd2ccs_dict',
 'make_diag_icd_dict',
 'make_diag_multi_dictionar