In [1]:
import numpy as np
import pandas as pd
import json
import collections
from collections import defaultdict 
from functools import partial
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm


In [2]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import sys
import importlib
from mimicnet import concept

importlib.reload(sys.modules['mimicnet.concept'])

ModuleNotFoundError: No module named 'mimicnet'

In [2]:
# multi_visit_mimic_dir = '/home/am8520/GP/ehr-data/mimic3-multi-visit'
multi_visit_mimic_dir = '/home/asem/GP/ehr-data/mimic3-multi-visit'
transformed_mimic_dir = '/home/asem/GP/ehr-data/mimic3-transforms'
# mimic_dir = '/home/asem/GP/ehr-data/mimic3-v1.4/physionet.org/files/mimiciii/1.4'
mimic_dir = '/home/asem/GP/MIMIC-SNONET/RAW/mimic-iii-clinical-database-1.4'


In [4]:
D_LABITEMS = pd.read_csv(f'{mimic_dir}/D_LABITEMS.csv.gz')
D_ITEMS = pd.read_csv(f'{mimic_dir}/D_ITEMS.csv.gz')

In [5]:
D_LABITEMS.head()

Unnamed: 0,ROW_ID,ITEMID,LABEL,FLUID,CATEGORY,LOINC_CODE
0,546,51346,Blasts,Cerebrospinal Fluid (CSF),Hematology,26447-3
1,547,51347,Eosinophils,Cerebrospinal Fluid (CSF),Hematology,26451-5
2,548,51348,"Hematocrit, CSF",Cerebrospinal Fluid (CSF),Hematology,30398-2
3,549,51349,Hypersegmented Neutrophils,Cerebrospinal Fluid (CSF),Hematology,26506-6
4,550,51350,Immunophenotyping,Cerebrospinal Fluid (CSF),Hematology,


In [6]:
D_ITEMS.head()

Unnamed: 0,ROW_ID,ITEMID,LABEL,ABBREVIATION,DBSOURCE,LINKSTO,CATEGORY,UNITNAME,PARAM_TYPE,CONCEPTID
0,457,497,Patient controlled analgesia (PCA) [Inject],,carevue,chartevents,,,,
1,458,498,PCA Lockout (Min),,carevue,chartevents,,,,
2,459,499,PCA Medication,,carevue,chartevents,,,,
3,460,500,PCA Total Dose,,carevue,chartevents,,,,
4,461,501,PCV Exh Vt (Obser),,carevue,chartevents,,,,


In [7]:
D_TEST = pd.concat([D_LABITEMS, D_ITEMS], join='inner')
test_label_dict = dict(zip(D_TEST.ITEMID, D_TEST.LABEL))
test_cat_dict = dict(zip(D_TEST.ITEMID, D_TEST.CATEGORY))

In [3]:
PATIENTS = pd.read_csv(f'{multi_visit_mimic_dir}/PATIENTS_2WKS.csv.gz')
ADMISSIONS = pd.read_csv(f'{multi_visit_mimic_dir}/ADMISSIONS_2WKS.csv.gz')
DIAGNOSES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/DIAGNOSES_ICD_2WKS.csv.gz', dtype = {'ICD9_CODE': str})
PROCEDURES_ICD = pd.read_csv(f'{multi_visit_mimic_dir}/PROCEDURES_ICD_2WKS.csv.gz', dtype = {'ICD9_CODE': str})

In [8]:
LABEVENTS = pd.read_csv(f'{multi_visit_mimic_dir}/LABEVENTS_Q5_UNITS_FIXED_2WKS.csv.gz')
CHARTEVENTS = pd.read_csv(f'{multi_visit_mimic_dir}/CHARTEVENTS_Q5_2WKS.csv.gz')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
LABEVENTS.head()

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM
0,0,17,50960,2134-12-29 03:18:00,2.1,2.1,mg/dL
1,1,17,50970,2134-12-29 03:18:00,2.0,2.0,mg/dL
2,2,17,50971,2134-12-29 03:18:00,4.7,4.7,mEq/L
3,3,17,50983,2134-12-29 03:18:00,136.0,136.0,mEq/L
4,4,17,51006,2134-12-29 03:18:00,11.0,11.0,mg/dL


In [10]:
CHARTEVENTS.head()

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM
0,36,223834,2134-05-12 12:00:00,15.0,15.0,L/min
1,36,223835,2134-05-12 12:00:00,100.0,100.0,
2,36,220224,2134-05-12 12:35:00,58.0,58.0,mmHg
3,36,220235,2134-05-12 12:35:00,60.0,60.0,mmHg
4,36,223830,2134-05-12 12:35:00,7.29,7.29,units


# Checklist

- (A) **Cast datetime to date**.
- (B) **Merge CHARTEVENTS and LABEVENTS**
- (C) **Remove outliers by IQR** (NOTE: possible information leakage between training and testing. So the aim just to assess the ability of neural ODEs to predict codes for next visits compared to GRAM methods. But for real clinical investigation, outlier removal should be applied on training subset then passing filtration limits to test subset.)
- (D) **Normalize to Z-scores**. (NOTE: see (C)).
- (E) **For repeated measurements in the same day, consider the average.**
    - Average is considered instead of median to be sensitive to edge (and sporadic) values which could be indicative of health conditions.
- (F) For DX/PR codes:
    1. Add to the middle day of the H. Adm.
    2. Add to all days of the H.Adm.


# (A) Cast datetime to date

## (A-1) PATIENTS table

In [4]:
PATIENTS.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,GENDER,DOB,DOD,DOD_HOSP,DOD_SSN,EXPIRE_FLAG
0,240,256,M,2086-07-31 00:00:00,,,,0
1,637,674,F,2113-12-14 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,2195-02-17 00:00:00,1
2,642,679,F,2059-11-04 00:00:00,2145-03-19 00:00:00,2145-03-19 00:00:00,2145-03-19 00:00:00,1
3,655,695,F,2093-05-14 00:00:00,2178-09-16 00:00:00,,2178-09-16 00:00:00,1
4,658,698,F,1864-11-16 00:00:00,2168-04-22 00:00:00,,2168-04-22 00:00:00,1


In [5]:
PATIENTS = PATIENTS[['SUBJECT_ID', 'GENDER', 'DOB']]
PATIENTS['DOB'] = pd.to_datetime(PATIENTS.DOB, infer_datetime_format=True).dt.normalize()

In [6]:
PATIENTS.head()

Unnamed: 0,SUBJECT_ID,GENDER,DOB
0,256,M,2086-07-31
1,674,F,2113-12-14
2,679,F,2059-11-04
3,695,F,2093-05-14
4,698,F,1864-11-16


In [7]:
PATIENTS.dtypes

SUBJECT_ID             int64
GENDER                object
DOB           datetime64[ns]
dtype: object

## (A-2) ADMISSIONS table

In [8]:
ADMISSIONS.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,...,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,DAYS,MAX_DAYS
0,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,...,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,5,7
1,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,...,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1,7,7
2,33,34,115799,2186-07-18 16:46:00,2186-07-20 16:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Medicare,...,CATHOLIC,MARRIED,WHITE,,,CHEST PAIN\CATH,0,1,2,2
3,34,34,144319,2191-02-23 05:23:00,2191-02-25 20:20:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Medicare,...,CATHOLIC,MARRIED,WHITE,2191-02-23 04:23:00,2191-02-23 07:25:00,BRADYCARDIA,0,1,2,2
4,36,36,182104,2131-04-30 07:15:00,2131-05-08 14:00:00,,EMERGENCY,CLINIC REFERRAL/PREMATURE,HOME HEALTH CARE,Medicare,...,NOT SPECIFIED,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,8,13


In [9]:
ADMISSIONS = ADMISSIONS[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME', 'ADMISSION_TYPE', 'ADMISSION_LOCATION', 'ETHNICITY', 'DIAGNOSIS', 'DAYS', 'MAX_DAYS']]
ADMISSIONS['ADMITTIME'] = pd.to_datetime(ADMISSIONS.ADMITTIME, infer_datetime_format=True).dt.normalize()
ADMISSIONS['DISCHTIME'] = pd.to_datetime(ADMISSIONS.DISCHTIME, infer_datetime_format=True).dt.normalize()
ADMISSIONS.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,ETHNICITY,DIAGNOSIS,DAYS,MAX_DAYS
0,23,152223,2153-09-03,2153-09-08,ELECTIVE,PHYS REFERRAL/NORMAL DELI,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,5,7
1,23,124321,2157-10-18,2157-10-25,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,WHITE,BRAIN MASS,7,7
2,34,115799,2186-07-18,2186-07-20,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,WHITE,CHEST PAIN\CATH,2,2
3,34,144319,2191-02-23,2191-02-25,EMERGENCY,CLINIC REFERRAL/PREMATURE,WHITE,BRADYCARDIA,2,2
4,36,182104,2131-04-30,2131-05-08,EMERGENCY,CLINIC REFERRAL/PREMATURE,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,8,13


In [10]:
ADMISSIONS.dtypes

SUBJECT_ID                     int64
HADM_ID                        int64
ADMITTIME             datetime64[ns]
DISCHTIME             datetime64[ns]
ADMISSION_TYPE                object
ADMISSION_LOCATION            object
ETHNICITY                     object
DIAGNOSIS                     object
DAYS                           int64
MAX_DAYS                       int64
dtype: object

## (A-3) DIAGNOSES  and PROCEDURES tables

In [11]:
DIAGNOSES_ICD.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1488,112,174105,1.0,53100
1,1489,112,174105,2.0,41071
2,1490,112,174105,3.0,2859
3,1491,112,174105,4.0,41401
4,1492,112,174105,5.0,725


In [12]:
DIAGNOSES_ICD = DIAGNOSES_ICD[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]

In [13]:
PROCEDURES_ICD.head()


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,944,62641,154460,3,3404
1,951,28600,189217,1,3613
2,952,28600,189217,2,3615
3,953,28600,189217,3,3961
4,966,18395,101133,1,159


In [14]:
PROCEDURES_ICD = PROCEDURES_ICD[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]

## (A-4) LABEVENTS and CHARTEVENTS (don't normalize here).

In [22]:
LABEVENTS.head()

Unnamed: 0.1,Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM
0,0,17,50960,2134-12-29 03:18:00,2.1,2.1,mg/dL
1,1,17,50970,2134-12-29 03:18:00,2.0,2.0,mg/dL
2,2,17,50971,2134-12-29 03:18:00,4.7,4.7,mEq/L
3,3,17,50983,2134-12-29 03:18:00,136.0,136.0,mEq/L
4,4,17,51006,2134-12-29 03:18:00,11.0,11.0,mg/dL


In [23]:
LABEVENTS = LABEVENTS[['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'VALUEUOM']]
LABEVENTS['CHARTTIME'] = pd.to_datetime(LABEVENTS.CHARTTIME, infer_datetime_format=True)
LABEVENTS.head()

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUENUM,VALUEUOM
0,17,50960,2134-12-29 03:18:00,2.1,mg/dL
1,17,50970,2134-12-29 03:18:00,2.0,mg/dL
2,17,50971,2134-12-29 03:18:00,4.7,mEq/L
3,17,50983,2134-12-29 03:18:00,136.0,mEq/L
4,17,51006,2134-12-29 03:18:00,11.0,mg/dL


In [24]:
CHARTEVENTS.head()

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUE,VALUENUM,VALUEUOM
0,36,223834,2134-05-12 12:00:00,15.0,15.0,L/min
1,36,223835,2134-05-12 12:00:00,100.0,100.0,
2,36,220224,2134-05-12 12:35:00,58.0,58.0,mmHg
3,36,220235,2134-05-12 12:35:00,60.0,60.0,mmHg
4,36,223830,2134-05-12 12:35:00,7.29,7.29,units


In [25]:
CHARTEVENTS = CHARTEVENTS[['SUBJECT_ID', 'ITEMID', 'CHARTTIME', 'VALUENUM', 'VALUEUOM']]
CHARTEVENTS['CHARTTIME'] = pd.to_datetime(CHARTEVENTS.CHARTTIME, infer_datetime_format=True)
CHARTEVENTS.head()

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUENUM,VALUEUOM
0,36,223834,2134-05-12 12:00:00,15.0,L/min
1,36,223835,2134-05-12 12:00:00,100.0,
2,36,220224,2134-05-12 12:35:00,58.0,mmHg
3,36,220235,2134-05-12 12:35:00,60.0,mmHg
4,36,223830,2134-05-12 12:35:00,7.29,units


# (B) Concatenate LABEVENTS and CHARTEVENTS into TESTS

In [26]:
TESTS = pd.concat([LABEVENTS, CHARTEVENTS], join="inner")

In [27]:
TESTS

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUENUM,VALUEUOM
0,17,50960,2134-12-29 03:18:00,2.1,mg/dL
1,17,50970,2134-12-29 03:18:00,2.0,mg/dL
2,17,50971,2134-12-29 03:18:00,4.7,mEq/L
3,17,50983,2134-12-29 03:18:00,136.0,mEq/L
4,17,51006,2134-12-29 03:18:00,11.0,mg/dL
...,...,...,...,...,...
12568265,99781,223901,2133-07-28 08:00:00,6.0,
12568266,99781,227343,2133-08-02 08:29:00,0.0,
12568267,99781,227344,2133-08-02 08:29:00,20.0,
12568268,99781,227345,2133-08-02 08:29:00,0.0,


In [28]:
TESTS.VALUENUM.notnull().all()

True

## (C) Remove outliers in TESTS using IQR

In [29]:
# Good read: https://iq-inc.com/importerror-attempted-relative-import/

import sys
import importlib
from mimicnet import concept

importlib.reload(sys.modules['mimicnet.concept'])

from mimicnet.concept import Subject

In [30]:
TESTS.shape

(16150404, 5)

In [31]:
iqr_filter = Subject.make_iqr_concept_filter(TESTS)

In [32]:
len(set(TESTS.ITEMID))

667

In [33]:
len(iqr_filter)

667

In [34]:
sum(map(lambda t: t[0] == t[1], iqr_filter.values()))

117

In [35]:
iqr_filter_df = pd.DataFrame({'ITEMID': iqr_filter.keys(),
                              'LABEL': map(test_label_dict.get, iqr_filter.keys()),
                              'CATEGORY': map(test_cat_dict.get, iqr_filter.keys()),
                             'MIN': map(lambda t: t[0], iqr_filter.values()),
                             'MAX': map(lambda t: t[1], iqr_filter.values())})

In [36]:
iqr_filter_df.to_csv('iqr_filter.csv')
iqr_filter_df[iqr_filter_df.MAX == iqr_filter_df.MIN].to_csv('iqr_filter2.csv')
constant_tests = set(iqr_filter_df[iqr_filter_df.MAX == iqr_filter_df.MIN].ITEMID)
variable_tests = set(iqr_filter_df.ITEMID) - constant_tests

In [37]:
TESTS_NO_CONSTANTS = TESTS[TESTS.ITEMID.isin(variable_tests)]

In [38]:
TESTS_NO_CONSTANTS.shape

(13307385, 5)

In [39]:
TESTS_FILTERED = Subject.apply_iqr_concept_filter(TESTS_NO_CONSTANTS, iqr_filter)

In [40]:
TESTS_FILTERED.shape

(12479309, 5)

In [41]:
len(set(TESTS_FILTERED.ITEMID))

550

In [43]:
TESTS_FILTERED.to_csv(f'{transformed_mimic_dir}/TESTS_FILTERED.csv.gz', compression='gzip', index=False)


In [44]:
TESTS_FILTERED

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUENUM,VALUEUOM
2,17,50971,2134-12-29 03:18:00,4.7,mEq/L
3,17,50983,2134-12-29 03:18:00,136.0,mEq/L
4,17,51006,2134-12-29 03:18:00,11.0,mg/dL
5,17,51221,2134-12-29 03:18:00,29.5,%
6,17,51222,2134-12-29 03:18:00,10.7,g/dL
...,...,...,...,...,...
12568261,99781,224409,2133-07-28 05:25:00,5.0,
12568262,99781,220739,2133-07-28 08:00:00,4.0,
12568263,99781,223791,2133-07-28 08:00:00,0.0,
12568264,99781,223900,2133-07-28 08:00:00,5.0,


## (E) Z-Score Normalization

In [45]:
zscore_scaler = Subject.make_zscore_concept_scaler(TESTS_FILTERED)

In [46]:
import sys
import importlib
from mimicnet import concept

importlib.reload(sys.modules['mimicnet.concept'])

import mimicnet

In [47]:
TESTS_FILTERED.shape

(12479309, 5)

In [48]:
TESTS_FILTERED['VALUENUM'].to_numpy().shape

(12479309,)

In [49]:
TESTS_FILTERED_ZSCORES = mimicnet.concept.Subject.apply_zscore_concept_scaler(TESTS_FILTERED, zscore_scaler)

In [50]:
TESTS_FILTERED_ZSCORES

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTTIME,VALUENUM,VALUEUOM
2,17,50971,2134-12-29 03:18:00,1.091589,mEq/L
3,17,50983,2134-12-29 03:18:00,-0.695703,mEq/L
4,17,51006,2134-12-29 03:18:00,-0.923199,mg/dL
5,17,51221,2134-12-29 03:18:00,-0.458517,%
6,17,51222,2134-12-29 03:18:00,-0.015098,g/dL
...,...,...,...,...,...
12568261,99781,224409,2133-07-28 05:25:00,2.490322,
12568262,99781,220739,2133-07-28 08:00:00,0.493699,
12568263,99781,223791,2133-07-28 08:00:00,-0.799835,
12568264,99781,223900,2133-07-28 08:00:00,0.653657,


In [51]:
TESTS_FILTERED_ZSCORES.to_csv(f'{transformed_mimic_dir}/TESTS_FILTERED_ZSCORES.csv.gz', compression='gzip', index=False)

## (D) Merge repeated measurements for the same day by taking the average

In [52]:
TESTS_FILTERED_ZSCORES_DTNORMALIZED = TESTS_FILTERED_ZSCORES.copy(deep=True)
TESTS_FILTERED_ZSCORES_DTNORMALIZED['CHARTTIME'] = TESTS_FILTERED_ZSCORES_DTNORMALIZED['CHARTTIME'].dt.normalize()

In [53]:
tests_filtered_day_agg = []

for subject_id, subject_df in tqdm(TESTS_FILTERED_ZSCORES_DTNORMALIZED.groupby('SUBJECT_ID')):
    for day, day_df in subject_df.groupby('CHARTTIME'):
        for item_id, items_df in day_df.groupby('ITEMID'):
            mean = items_df['VALUENUM'].mean()
            median = items_df['VALUENUM'].median()
            tests_filtered_day_agg.append((subject_id, item_id, day, mean, median))
            

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4434/4434 [09:32<00:00,  7.75it/s]


In [54]:
TESTS_FILTERED_ZSCORES_AGG_DAY = pd.DataFrame(tests_filtered_day_agg, columns=['SUBJECT_ID', 'ITEMID', 'CHARTDAY', 'MEAN', 'MEDIAN'])

In [75]:
TESTS_FILTERED_ZSCORES_AGG_DAY

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTDAY,MEAN,MEDIAN
0,17,50852,2134-12-22,-1.201339,-1.201339
1,17,50861,2134-12-22,-0.809370,-0.809370
2,17,50862,2134-12-22,1.634410,1.634410
3,17,50863,2134-12-22,-1.452596,-1.452596
4,17,50867,2134-12-22,-0.608894,-0.608894
...,...,...,...,...,...
4546204,99982,227456,2157-02-22,1.794272,1.794272
4546205,99982,227457,2157-02-22,-0.723781,-0.723781
4546206,99982,227465,2157-02-22,2.247434,2.247434
4546207,99982,227466,2157-02-22,-0.029395,-0.029395


In [None]:
TESTS_FILTERED_ZSCORES_AGG_DAY.to_csv(f'{transformed_mimic_dir}/TESTS_FILTERED_ZSCORES_AGG_DAY.csv.gz', compression='gzip', index=False)
PATIENTS.to_csv(f'{transformed_mimic_dir}/PATIENTS.csv.gz', compression='gzip', index=False)
ADMISSIONS.to_csv(f'{transformed_mimic_dir}/ADMISSIONS.csv.gz', compression='gzip', index=False)

In [56]:
DIAGNOSES_ICD.to_csv(f'{transformed_mimic_dir}/DIAGNOSES_ICD.csv.gz', compression='gzip', index=False)
PROCEDURES_ICD.to_csv(f'{transformed_mimic_dir}/PROCEDURES_ICD.csv.gz', compression='gzip', index=False)

In [57]:
TESTS_FILTERED_ZSCORES_AGG_DAY

Unnamed: 0,SUBJECT_ID,ITEMID,CHARTDAY,MEAN,MEDIAN
0,17,50852,2134-12-22,-1.201339,-1.201339
1,17,50861,2134-12-22,-0.809370,-0.809370
2,17,50862,2134-12-22,1.634410,1.634410
3,17,50863,2134-12-22,-1.452596,-1.452596
4,17,50867,2134-12-22,-0.608894,-0.608894
...,...,...,...,...,...
4546204,99982,227456,2157-02-22,1.794272,1.794272
4546205,99982,227457,2157-02-22,-0.723781,-0.723781
4546206,99982,227465,2157-02-22,2.247434,2.247434
4546207,99982,227466,2157-02-22,-0.029395,-0.029395


In [58]:
PATIENTS

Unnamed: 0,SUBJECT_ID,GENDER,DOB
0,256,M,2086-07-31
1,674,F,2113-12-14
2,679,F,2059-11-04
3,695,F,2093-05-14
4,698,F,1864-11-16
...,...,...,...
4429,43943,F,2116-12-07
4430,43946,F,2061-07-31
4431,43991,M,2069-01-27
4432,44061,M,2073-09-29


In [59]:
set(PATIENTS.GENDER)

{'F', 'M'}

## (F) Ethnicity normalization (grouping)

In [15]:
ADMISSIONS

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,ETHNICITY,DIAGNOSIS,DAYS,MAX_DAYS
0,23,152223,2153-09-03,2153-09-08,ELECTIVE,PHYS REFERRAL/NORMAL DELI,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,5,7
1,23,124321,2157-10-18,2157-10-25,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,WHITE,BRAIN MASS,7,7
2,34,115799,2186-07-18,2186-07-20,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,WHITE,CHEST PAIN\CATH,2,2
3,34,144319,2191-02-23,2191-02-25,EMERGENCY,CLINIC REFERRAL/PREMATURE,WHITE,BRADYCARDIA,2,2
4,36,182104,2131-04-30,2131-05-08,EMERGENCY,CLINIC REFERRAL/PREMATURE,WHITE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,8,13
...,...,...,...,...,...,...,...,...,...,...
10949,98759,109836,2165-05-05,2165-05-08,EMERGENCY,CLINIC REFERRAL/PREMATURE,WHITE,BRAIN ANEURYSM,3,3
10950,98759,175386,2165-06-05,2165-06-07,ELECTIVE,PHYS REFERRAL/NORMAL DELI,WHITE,BRAIN ANEURYSM/SDA,2,3
10951,98761,184477,2186-01-16,2186-01-16,ELECTIVE,PHYS REFERRAL/NORMAL DELI,WHITE,GASTROPARESIS\PLACEMENT OF G-TUBE **REMOTE WES...,0,7
10952,98761,182540,2186-02-08,2186-02-08,ELECTIVE,PHYS REFERRAL/NORMAL DELI,WHITE,SHORT GUT SYNDROME/SDA,0,7


In [16]:
len(set(ADMISSIONS.ETHNICITY))

37

In [17]:
ethnicity_group_d = {
    'AMERICAN INDIAN/ALASKA NATIVE': ['AMERICAN INDIAN/ALASKA NATIVE', 'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER'],
    'ASIAN': ['ASIAN', 'ASIAN - ASIAN INDIAN',
                         'ASIAN - CAMBODIAN',
                         'ASIAN - CHINESE',
                         'ASIAN - FILIPINO',
                         'ASIAN - KOREAN',
                         'ASIAN - OTHER',
                         'ASIAN - THAI',
                         'ASIAN - VIETNAMESE'],
    'BLACK/AFRICAN': ['BLACK/AFRICAN',
                     'BLACK/AFRICAN AMERICAN',
                     'BLACK/CAPE VERDEAN',
                     'BLACK/HAITIAN'],
    'HISPANIC OR LATINO': ['HISPANIC OR LATINO', 
                           'CARIBBEAN ISLAND',
                          'HISPANIC/LATINO - CENTRAL AMERICAN (OTHER)',
                             'HISPANIC/LATINO - COLOMBIAN',
                             'HISPANIC/LATINO - CUBAN',
                             'HISPANIC/LATINO - DOMINICAN',
                             'HISPANIC/LATINO - GUATEMALAN',
                             'HISPANIC/LATINO - MEXICAN',
                             'HISPANIC/LATINO - PUERTO RICAN',
                             'HISPANIC/LATINO - SALVADORAN'],
    'MIDDLE EASTERN': ['MIDDLE EASTERN'],
    'WHITE': ['WHITE',
             'WHITE - BRAZILIAN',
             'WHITE - EASTERN EUROPEAN',
             'WHITE - OTHER EUROPEAN',
             'WHITE - RUSSIAN',
             'PORTUGUESE'],
    'UNKNOWN': ['MULTI RACE ETHNICITY', 'OTHER', 'PATIENT DECLINED TO ANSWER', 'UNABLE TO OBTAIN', 
               'UNKNOWN/NOT SPECIFIED']
}

# 37 ethnicities mapped to 7 groups

In [18]:
ethnicity_d = {}
for ethnic_group, ethnic_labels in ethnicity_group_d.items():
    for eth_label in ethnic_labels:
        assert eth_label not in ethnicity_d, f"{eth_label} is assigned to multiple groups: {ethnicity_d[eth_label]} and ethnic_group."
        
        ethnicity_d[eth_label] = ethnic_group

In [19]:
len(ethnicity_d)

37

- When patient has admissions with different ethnicities recorded,

1. group the different ethnicities using the grouping above into a new set
2. if the set has at least one group in addition to 'UNKNOWN', remove 'UNKNOWN'
3. if set still has more than a group, then assign 'UNKNOWN'


In [20]:
subject_ethnicity_d = {}
for subject_id, subject_df in ADMISSIONS.groupby('SUBJECT_ID'):
    ethnicity = set(map(ethnicity_d.get, subject_df.ETHNICITY))
    if len(ethnicity) > 1 and 'UNKNOWN' in ethnicity:
        ethnicity.remove('UNKNOWN')
    if len(ethnicity) == 1:
        ethnicity = ethnicity.pop()
    else:
        ethnicity = 'UNKNOWN'
    subject_ethnicity_d[subject_id] = ethnicity

In [21]:
len(subject_ethnicity_d)

4434

## (G) Static attributes table

In [22]:
static_df = PATIENTS.copy(deep=True)
static_df['ETHNIC_GROUP'] = static_df.SUBJECT_ID.map(subject_ethnicity_d)

In [23]:
static_df

Unnamed: 0,SUBJECT_ID,GENDER,DOB,ETHNIC_GROUP
0,256,M,2086-07-31,WHITE
1,674,F,2113-12-14,WHITE
2,679,F,2059-11-04,WHITE
3,695,F,2093-05-14,UNKNOWN
4,698,F,1864-11-16,WHITE
...,...,...,...,...
4429,43943,F,2116-12-07,WHITE
4430,43946,F,2061-07-31,BLACK/AFRICAN
4431,43991,M,2069-01-27,WHITE
4432,44061,M,2073-09-29,WHITE


In [24]:
static_df.to_csv(f'{transformed_mimic_dir}/static_df.csv.gz', compression='gzip', index=False)


## (H) Finalize admissions table

In [25]:
adm_df = ADMISSIONS[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME']]
adm_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME
0,23,152223,2153-09-03,2153-09-08
1,23,124321,2157-10-18,2157-10-25
2,34,115799,2186-07-18,2186-07-20
3,34,144319,2191-02-23,2191-02-25
4,36,182104,2131-04-30,2131-05-08
...,...,...,...,...
10949,98759,109836,2165-05-05,2165-05-08
10950,98759,175386,2165-06-05,2165-06-07
10951,98761,184477,2186-01-16,2186-01-16
10952,98761,182540,2186-02-08,2186-02-08


In [71]:
adm_df.to_csv(f'{transformed_mimic_dir}/adm_df.csv.gz', compression='gzip', index=False)


## (I) Finalize tests table

In [72]:
test_df = TESTS_FILTERED_ZSCORES_AGG_DAY[['SUBJECT_ID', 'ITEMID', 'CHARTDAY', 'MEAN']]

In [73]:
test_df.columns = ['SUBJECT_ID', 'ITEMID', 'DATE', 'VALUE']
test_df

Unnamed: 0,SUBJECT_ID,ITEMID,DATE,VALUE
0,17,50852,2134-12-22,-1.201339
1,17,50861,2134-12-22,-0.809370
2,17,50862,2134-12-22,1.634410
3,17,50863,2134-12-22,-1.452596
4,17,50867,2134-12-22,-0.608894
...,...,...,...,...
4546204,99982,227456,2157-02-22,1.794272
4546205,99982,227457,2157-02-22,-0.723781
4546206,99982,227465,2157-02-22,2.247434
4546207,99982,227466,2157-02-22,-0.029395


In [74]:
test_df.to_csv(f'{transformed_mimic_dir}/test_df.csv.gz', compression='gzip', index=False)


## (J) Finalize PROCEDURES/DIAGNOSES tables

In [26]:
diag_df = DIAGNOSES_ICD[DIAGNOSES_ICD.ICD9_CODE.notnull()]
diag_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,112,174105,53100
1,112,174105,41071
2,112,174105,2859
3,112,174105,41401
4,112,174105,725
...,...,...,...
127261,97488,161999,0414
127262,97488,161999,30391
127263,97488,161999,E8798
127264,97488,161999,78791


In [27]:
proc_df = PROCEDURES_ICD[PROCEDURES_ICD.ICD9_CODE.notnull()]
proc_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,62641,154460,3404
1,28600,189217,3613
2,28600,189217,3615
3,28600,189217,3961
4,18395,101133,0159
...,...,...,...
31433,41035,102460,3228
31434,41035,102460,3201
31435,42694,180323,0206
31436,42694,180323,0123


### Remove duplicate codes for the same patient for the same admission

In [28]:
diag_df = diag_df.drop_duplicates(ignore_index=True)
diag_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,112,174105,53100
1,112,174105,41071
2,112,174105,2859
3,112,174105,41401
4,112,174105,725
...,...,...,...
127213,97488,161999,0414
127214,97488,161999,30391
127215,97488,161999,E8798
127216,97488,161999,78791


In [29]:
proc_df = proc_df.drop_duplicates(ignore_index=True)
proc_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,62641,154460,3404
1,28600,189217,3613
2,28600,189217,3615
3,28600,189217,3961
4,18395,101133,0159
...,...,...,...
30383,41035,102460,3228
30384,41035,102460,3201
30385,42694,180323,0206
30386,42694,180323,0123


In [79]:
diag_df.to_csv(f'{transformed_mimic_dir}/diag_df.csv.gz', compression='gzip', index=False)
proc_df.to_csv(f'{transformed_mimic_dir}/proc_df.csv.gz', compression='gzip', index=False)


In [80]:
diag_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,112,174105,53100
1,112,174105,41071
2,112,174105,2859
3,112,174105,41401
4,112,174105,725
...,...,...,...
127213,97488,161999,0414
127214,97488,161999,30391
127215,97488,161999,E8798
127216,97488,161999,78791


In [81]:
proc_df

Unnamed: 0,SUBJECT_ID,HADM_ID,ICD9_CODE
0,62641,154460,3404
1,28600,189217,3613
2,28600,189217,3615
3,28600,189217,3961
4,18395,101133,0159
...,...,...,...
30383,41035,102460,3228
30384,41035,102460,3201
30385,42694,180323,0206
30386,42694,180323,0123


In [5]:
diag_df = pd.read_csv(f'{transformed_mimic_dir}/diag_df.csv.gz')


In [6]:
print('Avg diag. ICD9 codes per admission=', len(diag_df)/diag_df['HADM_ID'].nunique())

Avg diag. ICD9 codes per admission= 11.655336692624829


In [30]:
import os, sys
parent_dir = os.path.abspath('..')
# the parent_dir could already be there if the kernel was not restarted,
# and we run this cell again
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
from mimicnet.mimic3 import concept, dag

KG = dag.CCSDAG()
admission_n_ccs_codes = {}
admission_n_icd_codes = {}

In [31]:
for admission_id, diag_adm_df in diag_df.groupby('HADM_ID'):
    icd_codes = set(diag_adm_df.ICD9_CODE)
    ccs_codes = set(map(KG.diag_icd2ccs.get, icd_codes))
    admission_n_ccs_codes[admission_id] = len(ccs_codes)
    admission_n_icd_codes[admission_id] = len(icd_codes)

In [32]:
admission_n_codes = pd.DataFrame(index=admission_n_ccs_codes.keys(),
                                data = {'CCS': admission_n_ccs_codes.values(),
                                       'ICD': admission_n_icd_codes.values()})

In [33]:
admission_n_codes.describe()

Unnamed: 0,CCS,ICD
count,10915.0,10915.0
mean,10.84535,11.655337
std,5.423405,6.091353
min,1.0,1.0
25%,7.0,8.0
50%,10.0,10.0
75%,14.0,15.0
max,34.0,39.0
