In [1]:
import collections
import json
import os
from collections import defaultdict
from functools import partial

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm


In [2]:
# Root folder that holds every dataset folder used below. Export EHR_DATA_ROOT
# before starting the kernel to run the notebook on another machine; when the
# variable is unset the original location is used.
ehr_data_root = os.environ.get('EHR_DATA_ROOT', '/home/asem/GP/ehr-data')

# Input folder: the `*_2wks.csv.gz` cohort tables are read from here.
multi_visit_mimic_dir = f'{ehr_data_root}/mimic4-multi-visit'
# Output folder: every table produced by this notebook is written here.
transformed_mimic_dir = f'{ehr_data_root}/mimic4-transforms'
# Folder of the MIMIC-IV v1.0 files (path layout of a physionet.org download).
mimic_dir = f'{ehr_data_root}/mimic4-v1.0/physionet.org/files/mimiciv/1.0'

In [3]:
# Load the four cohort tables from the multi-visit folder.
# ICD codes are forced to `str` (presumably so codes that look numeric are not
# parsed as numbers -- TODO confirm).
icd_code_dtype = {'icd_code': str}

patients = pd.read_csv(f'{multi_visit_mimic_dir}/patients_2wks.csv.gz')
admissions = pd.read_csv(f'{multi_visit_mimic_dir}/admissions_2wks.csv.gz')
diagnoses_icd = pd.read_csv(
    f'{multi_visit_mimic_dir}/diagnoses_icd_2wks.csv.gz',
    dtype=icd_code_dtype,
)
procedures_icd = pd.read_csv(
    f'{multi_visit_mimic_dir}/procedures_icd_2wks.csv.gz',
    dtype=icd_code_dtype,
)

# Checklist

- (A) **Cast datetime to date**.
- (B) **Merge CHARTEVENTS and LABEVENTS**
- (C) **Remove outliers by IQR** (NOTE: this risks information leakage between training and testing. That is acceptable here because the aim is only to assess the ability of neural ODEs to predict codes for subsequent visits compared to GRAM methods. For a real clinical investigation, outlier removal should be fitted on the training subset only, and the resulting filtering limits then applied to the test subset.)
- (D) **Normalize to Z-scores**. (NOTE: see (C)).
- (E) **For repeated measurements in the same day, consider the average.**
    - Average is considered instead of median to be sensitive to edge (and sporadic) values which could be indicative of health conditions.
- (F) For DX/PR codes:
    1. Add to the middle day of the hospital admission.
    2. Add to all days of the hospital admission.


# (A) Cast datetime to date

## (A-1) PATIENTS table

In [4]:
patients.head()

In [5]:
patients.dtypes

In [6]:
# Keep the demographic columns and derive a date of birth.
# The birth year is taken as `anchor_year - anchor_age`; parsing it with
# format '%Y' yields a timestamp in that year, which is then normalized to midnight.
# `.assign` builds a new frame, so no column is written into a slice of the
# original table (which would raise SettingWithCopyWarning).
patients = (
    patients[['subject_id', 'gender', 'anchor_age', 'anchor_year']]
    .assign(
        dob=lambda df: pd.to_datetime(
            df['anchor_year'] - df['anchor_age'], format='%Y'
        ).dt.normalize()
    )
)

In [7]:
patients.head()

In [8]:
patients.dtypes

## (A-2) ADMISSIONS table

In [9]:
admissions.head()

In [10]:
# Keep the columns needed downstream and switch to upper-case column names.
# Renaming by name (instead of assigning a positional list to `.columns`) cannot
# silently mislabel a column if the selection order changes, and `rename`
# returns an independent frame, so later cells may assign columns on it without
# triggering SettingWithCopyWarning.
admissions = (
    admissions[['subject_id', 'hadm_id', 'admittime', 'dischtime', 'ethnicity']]
    .rename(columns={
        'subject_id': 'SUBJECT_ID',
        'hadm_id': 'HADM_ID',
        'admittime': 'ADMITTIME',
        'dischtime': 'DISCHTIME',
        'ethnicity': 'ETHNICITY',
    })
)

In [11]:
# (A) Cast admission/discharge timestamps to dates (time-of-day set to 00:00).
# `.assign` replaces the two columns on a new frame instead of writing through
# attribute access into a frame that was produced by slicing, and the cell can
# be re-run safely. `infer_datetime_format` is omitted: it is deprecated in
# pandas 2 and not needed to parse the timestamps.
admissions = admissions.assign(
    ADMITTIME=pd.to_datetime(admissions['ADMITTIME']).dt.normalize(),
    DISCHTIME=pd.to_datetime(admissions['DISCHTIME']).dt.normalize(),
)
admissions.head()

In [12]:
admissions.dtypes

## (A-3) DIAGNOSES  and PROCEDURES tables

In [13]:
diagnoses_icd.head()

In [14]:
# Keep only the patient/admission identifiers, the code and its ICD version.
diagnoses_icd = diagnoses_icd.loc[:, ['subject_id', 'hadm_id', 'icd_code', 'icd_version']]

In [15]:
procedures_icd.head()


In [16]:
# Keep only the patient/admission identifiers, the code and its ICD version.
procedures_icd = procedures_icd.loc[:, ['subject_id', 'hadm_id', 'icd_code', 'icd_version']]

## Convert ICD-10 to ICD-9

In [17]:
# ICD-10 -> ICD-9 mapping table, read from the notebook's working directory.
# Both code columns are read as strings.
gem_code_columns = ('icd9cm', 'icd10cm')
icd_conv = pd.read_csv(
    'icd10toicd9gem.csv',
    dtype={code_column: str for code_column in gem_code_columns},
)
icd_conv.head()


In [18]:
(icd_conv.no_map == 0).mean()

In [19]:
from collections import defaultdict

# ICD-10 code -> set of ICD-9 codes it maps to.
# Only mapping rows with `no_map == 0` are used.
icd_conv_dict = defaultdict(set)
mapped_rows = icd_conv[icd_conv.no_map == 0]
for icd10_code, icd9_code in zip(mapped_rows.icd10cm, mapped_rows.icd9cm):
    icd_conv_dict[icd10_code].add(icd9_code)

In [20]:
set(map(len, icd_conv_dict.values()))

In [21]:
# Rows coded in ICD-10; these are the ones that need conversion to ICD-9.
is_icd10_diagnosis = diagnoses_icd.icd_version == 10
is_icd10_procedure = procedures_icd.icd_version == 10
diagnoses_icd_10 = diagnoses_icd.loc[is_icd10_diagnosis]
procedures_icd_10 = procedures_icd.loc[is_icd10_procedure]


In [22]:
def _convert_icd10_rows(icd10_df, icd10_to_icd9):
    """Expand ICD-10 coded rows into ICD-9 coded rows.

    Parameters
    ----------
    icd10_df : DataFrame with `subject_id`, `hadm_id` and `icd_code` columns,
        one row per ICD-10 code.
    icd10_to_icd9 : mapping from an ICD-10 code to a collection of ICD-9 codes.

    Returns
    -------
    dict of lists with keys `subject_id`, `hadm_id`, `icd_code` and
    `icd_version`, holding one entry per (input row, mapped ICD-9 code) pair.
    Rows whose ICD-10 code has no mapping produce no entry.
    """
    converted = {'subject_id': [], 'hadm_id': [], 'icd_code': [], 'icd_version': []}
    for row in icd10_df.itertuples():
        # The mapped codes are kept in a set, whose iteration order for strings
        # changes between kernel sessions. Sorting makes the generated row
        # order (and the files written from it) reproducible.
        # `.get` is used so that a defaultdict mapping is not grown by lookups.
        for icd9 in sorted(icd10_to_icd9.get(row.icd_code, ()), key=str):
            converted['subject_id'].append(row.subject_id)
            converted['hadm_id'].append(row.hadm_id)
            converted['icd_code'].append(icd9)
            converted['icd_version'].append(9)
    return converted


diagnoses_icd9_converted = _convert_icd10_rows(diagnoses_icd_10, icd_conv_dict)
# NOTE(review): the mapping columns are named `icd10cm`/`icd9cm`; confirm the
# same table is valid for procedure codes (check the expansion ratio below).
procedures_icd9_converted = _convert_icd10_rows(procedures_icd_10, icd_conv_dict)

In [23]:
# Turn the column-wise accumulators into DataFrames (same variable names).
diagnoses_icd9_converted, procedures_icd9_converted = (
    pd.DataFrame(converted_columns)
    for converted_columns in (diagnoses_icd9_converted, procedures_icd9_converted)
)


In [24]:
# Number of ICD-9 rows produced per ICD-10 input row.
diag_expansion_ratio = len(diagnoses_icd9_converted) / len(diagnoses_icd_10)
print('diag 10->9 expansion ratio:', diag_expansion_ratio)
proc_expansion_ratio = len(procedures_icd9_converted) / len(procedures_icd_10)
print('proc 10->9 expansion ratio:', proc_expansion_ratio)


In [25]:
# The original rows with ICD9
# The original rows with ICD9
diagnoses_icd9 = diagnoses_icd[diagnoses_icd.icd_version == 9]
procedures_icd9 = procedures_icd[procedures_icd.icd_version == 9]

print('|diag_icd9_original|=', len(diagnoses_icd9))
print('|proc_icd9_original|=', len(procedures_icd9))

print('|diag_icd9_converted|=', len(diagnoses_icd9_converted))
print('|proc_icd9_converted|=', len(procedures_icd9_converted))


# Now with merging the converted ICD9
# `DataFrame.append` was removed in pandas 2.0; `pd.concat` stacks the rows
# the same way (original row labels are kept, as `append` did).
diagnoses_icd9 = pd.concat([diagnoses_icd9, diagnoses_icd9_converted])
procedures_icd9 = pd.concat([procedures_icd9, procedures_icd9_converted])

print('|diag_icd9_total|=', len(diagnoses_icd9))
print('|proc_icd9_total|=', len(procedures_icd9))


In [26]:
# Persist the ICD-9 tables (original + converted rows) as gzip CSV, no index column.
icd9_exports = (
    (diagnoses_icd9, f'{transformed_mimic_dir}/diagnoses_icd9.csv.gz'),
    (procedures_icd9, f'{transformed_mimic_dir}/procedures_icd9.csv.gz'),
)
for icd9_table, icd9_path in icd9_exports:
    icd9_table.to_csv(icd9_path, compression='gzip', index=False)

In [27]:
patients

In [28]:
set(patients.gender)

## (F) Ethnicity normalization (grouping)

In [29]:
admissions

In [30]:
set(admissions.ETHNICITY)

In [31]:
# Ethnic group -> raw ETHNICITY labels that belong to it.
# Labels that do not name a specific group are pooled under 'UNKNOWN'.
ethnicity_group_d = dict(
    (
        ('AMERICAN INDIAN/ALASKA NATIVE', ['AMERICAN INDIAN/ALASKA NATIVE']),
        ('ASIAN', ['ASIAN']),
        ('BLACK/AFRICAN', ['BLACK/AFRICAN AMERICAN']),
        ('HISPANIC OR LATINO', ['HISPANIC/LATINO']),
        ('WHITE', ['WHITE']),
        ('UNKNOWN', ['OTHER', 'UNABLE TO OBTAIN', 'UNKNOWN']),
    )
)

# 8 ethnicities mapped to 6 groups

In [32]:
# Invert the grouping: raw ETHNICITY label -> ethnic group.
ethnicity_d = {}
for ethnic_group, ethnic_labels in ethnicity_group_d.items():
    for eth_label in ethnic_labels:
        # Each raw label may belong to one group only; the message names both
        # conflicting groups (the second one used to be printed as the literal
        # text "ethnic_group").
        assert eth_label not in ethnicity_d, (
            f"{eth_label} is assigned to multiple groups: "
            f"{ethnicity_d[eth_label]} and {ethnic_group}."
        )

        ethnicity_d[eth_label] = ethnic_group

In [33]:
len(ethnicity_d)

- When a patient has admissions with different recorded ethnicities:

1. Map the recorded ethnicities to their groups (using the grouping above) and collect them into a set.
2. If the set contains at least one group other than 'UNKNOWN', remove 'UNKNOWN'.
3. If the set still contains more than one group, assign 'UNKNOWN'.


In [34]:
# SUBJECT_ID -> single ethnic group, resolved over all admissions of the subject.
subject_ethnicity_d = {}
for subject_id, subject_df in admissions.groupby('SUBJECT_ID'):
    # Labels missing from the grouping (including missing values) count as
    # 'UNKNOWN' rather than turning into None in the result.
    ethnic_groups = {
        ethnicity_d.get(label, 'UNKNOWN') for label in subject_df.ETHNICITY
    }
    # 'UNKNOWN' only matters when nothing more specific was recorded.
    if len(ethnic_groups) > 1:
        ethnic_groups.discard('UNKNOWN')
    # Conflicting specific groups cannot be resolved -> 'UNKNOWN'.
    if len(ethnic_groups) == 1:
        subject_ethnicity_d[subject_id] = ethnic_groups.pop()
    else:
        subject_ethnicity_d[subject_id] = 'UNKNOWN'

In [35]:
len(subject_ethnicity_d)

## (G) Static attributes table

In [36]:
# Static (per-patient) attributes: the patients table plus the resolved ethnic group.
# `.assign` returns a new frame, so `patients` itself is left unchanged.
static_df = patients.assign(
    ETHNIC_GROUP=patients.subject_id.map(subject_ethnicity_d)
)

In [37]:
static_df

In [38]:
# Final column selection and upper-case naming of the static table.
static_df = static_df[['subject_id', 'gender', 'dob', 'ETHNIC_GROUP']].rename(
    columns={'subject_id': 'SUBJECT_ID', 'gender': 'GENDER', 'dob': 'DOB'}
)

In [39]:
# Persist the static table as gzip CSV without the index column.
static_df_path = f'{transformed_mimic_dir}/static_df.csv.gz'
static_df.to_csv(static_df_path, compression='gzip', index=False)


## (H) Statistics of admissions table

In [45]:

# SUBJECT_ID -> list of that subject's admission dates in ascending order.
patient_ordered_admissions = {
    subject: list(admit_dates.sort_values())
    for subject, admit_dates in admissions.groupby('SUBJECT_ID')['ADMITTIME']
}

In [46]:
# Gaps between consecutive admissions, in weeks: per subject and pooled.
patient_intervals = defaultdict(list)
all_intervals = []
for subject, visit_dates in patient_ordered_admissions.items():
    # Pair every admission with the next one of the same subject.
    gaps_in_weeks = [
        (later - earlier).days / 7
        for earlier, later in zip(visit_dates, visit_dates[1:])
    ]
    all_intervals.extend(gaps_in_weeks)
    patient_intervals[subject] = gaps_in_weeks

pd.DataFrame({'intervals(weeks)': all_intervals}).describe()

## (J) Finalize PROCEDURES/DIAGNOSES tables

In [165]:
# Drop the version column, switch to upper-case names, and order the rows by
# patient and admission with a fresh 0..n-1 index.
diagnoses_icd9 = (
    diagnoses_icd9[['subject_id', 'hadm_id', 'icd_code']]
    .rename(columns={'subject_id': 'SUBJECT_ID', 'hadm_id': 'HADM_ID', 'icd_code': 'ICD9_CODE'})
    .sort_values(by=['SUBJECT_ID', 'HADM_ID'])
    .reset_index(drop=True)
)

In [166]:
# Discard rows that carry no ICD-9 code.
diag_df = diagnoses_icd9.dropna(subset=['ICD9_CODE'])
diag_df

In [167]:
# Drop the version column, switch to upper-case names, and order the rows by
# patient and admission with a fresh 0..n-1 index.
procedures_icd9 = (
    procedures_icd9[['subject_id', 'hadm_id', 'icd_code']]
    .rename(columns={'subject_id': 'SUBJECT_ID', 'hadm_id': 'HADM_ID', 'icd_code': 'ICD9_CODE'})
    .sort_values(by=['SUBJECT_ID', 'HADM_ID'])
    .reset_index(drop=True)
)

In [168]:
# Discard rows that carry no ICD-9 code.
proc_df = procedures_icd9.dropna(subset=['ICD9_CODE'])
proc_df

### Remove duplicate codes for the same patient for the same admission

In [169]:
# Identical (patient, admission, code) rows collapse to one; the index is renumbered.
diag_df = diag_df.drop_duplicates().reset_index(drop=True)
diag_df

In [170]:
# Identical (patient, admission, code) rows collapse to one; the index is renumbered.
proc_df = proc_df.drop_duplicates().reset_index(drop=True)
proc_df

In [171]:
# Persist the final diagnosis/procedure tables as gzip CSV, no index column.
final_exports = (
    (diag_df, f'{transformed_mimic_dir}/diag_df.csv.gz'),
    (proc_df, f'{transformed_mimic_dir}/proc_df.csv.gz'),
)
for final_table, final_path in final_exports:
    final_table.to_csv(final_path, compression='gzip', index=False)


In [172]:
diag_df

In [173]:
proc_df

In [174]:
# Rows in diag_df divided by the number of distinct admissions that have a diagnosis.
n_diag_admissions = diag_df['HADM_ID'].nunique()
print('avg. icd9 codes per admission', len(diag_df)/n_diag_admissions)

In [175]:
len(set(diag_df.HADM_ID))

In [176]:
len(set(admissions.HADM_ID))

In [177]:
len(set(diag_df.HADM_ID) - set(admissions.HADM_ID))

In [179]:
import os, sys
# Absolute path of the directory one level above the current working directory.
# It is put on sys.path so that the `mimicnet` import below can resolve
# (presumably the package lives there -- TODO confirm).
parent_dir = os.path.abspath('..')
# the parent_dir could already be there if the kernel was not restarted,
# and we run this cell again
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
    
# Project-local modules; this import must stay after the sys.path change above.
# NOTE(review): `concept` is not referenced in the visible cells -- confirm
# whether it is still needed.
from mimicnet.mimic3 import concept, dag

# Only `KG.diag_icd2ccs` (ICD diagnosis code -> CCS code lookup) is used below.
KG = dag.CCSDAG()
# HADM_ID -> number of distinct CCS codes / ICD codes; filled by the next cell.
admission_n_ccs_codes = {}
admission_n_icd_codes = {}

In [186]:
# Count the distinct ICD-9 and CCS diagnosis codes of every admission.
for admission_id, diag_adm_df in diag_df.groupby('HADM_ID'):
    icd_codes = set(diag_adm_df.ICD9_CODE)
    ccs_codes = set(map(KG.diag_icd2ccs.get, icd_codes))
    # `.get` yields None for an ICD code the lookup does not contain (assuming
    # dict-like semantics); None is not a CCS code and must not be counted.
    ccs_codes.discard(None)
    admission_n_ccs_codes[admission_id] = len(ccs_codes)
    admission_n_icd_codes[admission_id] = len(icd_codes)

In [188]:
# One row per admission (index = HADM_ID) with its CCS and ICD code counts.
# Both dicts were filled in the same loop, so their entries line up by position.
code_count_columns = {
    'CCS': list(admission_n_ccs_codes.values()),
    'ICD': list(admission_n_icd_codes.values()),
}
admission_n_codes = pd.DataFrame(
    data=code_count_columns,
    index=list(admission_n_ccs_codes.keys()),
)

In [190]:
admission_n_codes.describe()

In [185]:
dir(KG)