### MIMIC III clinical records prep

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from icd9 import *

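The data path below points to the project data directory. Its `restricted_mimic_iii` subdirectory must contain the MIMIC-III "DIAGNOSES_ICD", "NOTEEVENTS", and "PROCEDURES_ICD" CSV files. The ICD-9 tree files (`node_desc.csv`, `node_parent.csv`) and the train/dev/test `hadm_id` split files are read directly from the data directory itself.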

In [2]:
# Root data directory. Keep the trailing slash: several later cells build
# paths by appending a file name directly (e.g. f'{data_path}node_desc.csv').
data_path = '../data/'

First, I need to standardize the ICD-9 codes and group the results by hospital admission ID.

### ICD-9 Codes

In [3]:
keep_cols = ['hadm_id', 'icd9_code']


def load_icd_events(csv_path, columns):
    """Load one MIMIC-III ICD table and keep the rows that have a code.

    Parameters
    ----------
    csv_path : str
        Path to a DIAGNOSES_ICD or PROCEDURES_ICD CSV file.
    columns : list of str
        Lower-cased column names to keep; must include 'icd9_code'.

    Returns
    -------
    pandas.DataFrame
        A new frame with lower-cased column names, restricted to
        `columns`, with rows lacking an ICD-9 code removed.
    """
    return (
        # Force the code column to text. With dtype inference, codes that
        # look numeric can be parsed as integers (dropping leading zeros)
        # or as floats (so str() yields e.g. '3893.0'), which corrupts
        # the reformatting done later. The key is upper-case because the
        # header is only lower-cased after loading.
        pd.read_csv(csv_path, dtype={'ICD9_CODE': str})
          .rename(columns=str.lower)
          .loc[:, columns]
          # Chained (not in-place) so the result is an independent frame
          # that later cells can add columns to without copy warnings.
          .dropna(subset=['icd9_code'])
    )


diag_icd = load_icd_events(
    f'{data_path}/restricted_mimic_iii/DIAGNOSES_ICD.csv', keep_cols)
proc_icd = load_icd_events(
    f'{data_path}/restricted_mimic_iii/PROCEDURES_ICD.csv', keep_cols)

The MIMIC III documentation provides the following clarification:

"The code field for the ICD-9-CM Principal and Other Diagnosis Codes is six characters in length, with the decimal point implied between the third and fourth digit for all diagnosis codes other than the V codes. The decimal is implied for V codes between the second and third digit."

I will reformat the ICD-9 codes as strings with an explicit decimal point. The format needs to be consistent with the naming convention used by the ICD-9 tree object.

In [4]:
# This could be vectorized using Series op's but dataset is small
# enough and apply method is more readable for this reformatting
# Credit: https://github.com/jamesmullenbach/caml-mimic
def format_codes(code, is_diag):
    """Reformat a raw ICD-9 code to match the ICD-9 tree naming.

    The raw codes carry no decimal point. This inserts it after the
    fourth character for diagnosis codes starting with 'E', after the
    third character for every other diagnosis code, and after the second
    character for procedure codes. A code that is too short to have
    anything after the decimal point is returned without one.

    Parameters
    ----------
    code : str, int or float
        Raw code value. Surrounding whitespace and any decimal point
        already present are ignored, so formatting an already formatted
        code returns it unchanged.
    is_diag : bool
        True for a diagnosis code, False for a procedure code.

    Returns
    -------
    str
        The code with the decimal point in place.
    """
    # A numeric column that contains missing values is parsed as float,
    # so an integral code can arrive as 3893.0; drop the spurious '.0'
    # before it is mistaken for part of the code.
    if isinstance(code, float) and code.is_integer():
        code = int(code)
    # Remove padding and any existing decimal point so the split position
    # below is always counted on the bare code characters.
    str_code = str(code).strip().replace('.', '')
    if is_diag:
        # startswith (rather than indexing) also tolerates an empty string
        split_at = 4 if str_code.startswith('E') else 3
    else:
        split_at = 2
    if len(str_code) > split_at:
        str_code = str_code[:split_at] + '.' + str_code[split_at:]
    return str_code

In [5]:
# Add an 'fcode' column holding each code in the ICD-9 tree's format;
# diagnosis and procedure codes are formatted with different rules
diag_icd['fcode'] = diag_icd['icd9_code'].map(
    lambda raw: format_codes(raw, is_diag=True))
proc_icd['fcode'] = proc_icd['icd9_code'].map(
    lambda raw: format_codes(raw, is_diag=False))

Now, I will instantiate an ICD-9 tree object. 

In [6]:
# Both tree definition files are read directly from the data directory
node_desc_path = f'{data_path}node_desc.csv'
node_parent_path = f'{data_path}node_parent.csv'
tree = ICD9Tree(node_desc_path, node_parent_path)

In [7]:
# Stack diagnosis and procedure events (without the raw code column),
# keep the events whose formatted code is a node of the tree, and
# report the share of events that had no matching node
all_icd = pd.concat([diag_icd, proc_icd], axis=0).drop(columns='icd9_code')
in_tree = all_icd['fcode'].apply(lambda fcode: fcode in tree.nodes)
match_icd = all_icd.loc[in_tree, :]
perc_drop = 100 * (1 - len(match_icd) / len(all_icd))
print('{:.2f}% of events dropped.'.format(perc_drop))

0.24% of events dropped.


I checked some of the dropped codes and they do not appear to exist in the ICD-9 diagnosis or procedure hierarchies. This makes me think that they are erroneous. Rather than trying to infer the intended codes, I will simply drop the small number of affected events.

Finally, I need to group the ICD-9 codes by hospital admission ID. 

In [8]:
# One row per admission: its matched codes joined into a single
# ';'-separated string
labels_df = (
    match_icd.groupby('hadm_id')['fcode']
             .apply(';'.join)
             .reset_index()
)

### Clinical Text

In [9]:
# Load every note event and lower-case the column names
notes_df = pd.read_csv(
    f'{data_path}/restricted_mimic_iii/NOTEEVENTS.csv',
    parse_dates=['CHARTDATE'],
    low_memory=False,
).rename(columns=str.lower)

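After checking the category and description values, keep only the notes whose category is "Discharge summary", along with the two columns needed downstream (`hadm_id` and `text`).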

In [10]:
# Keep discharge summaries only, and only the admission ID and note text
keep_cols = ['hadm_id', 'text']
is_discharge = notes_df['category'] == 'Discharge summary'
notes_df = notes_df.loc[is_discharge, keep_cols]

In [11]:
# One row per admission: all of its discharge notes joined by a space
notes_df = (
    notes_df.groupby('hadm_id')['text']
            .apply(' '.join)
            .reset_index()
)

### Join Notes to Labels and Assign Splits

In [12]:
# The split files are read without a header row; the column is named
# 'hadm_id' so it can be merged against the notes
split_kwargs = {'header': None, 'names': ['hadm_id']}
train_ids = pd.read_csv(f'{data_path}train_full_hadm_ids.csv', **split_kwargs)
val_ids = pd.read_csv(f'{data_path}dev_full_hadm_ids.csv', **split_kwargs)
test_ids = pd.read_csv(f'{data_path}test_full_hadm_ids.csv', **split_kwargs)

In [13]:
# Keep the notes belonging to each split and tag them with its name
train_notes = (
    notes_df.merge(train_ids, on='hadm_id', how='inner')
            .assign(split='train')
)
val_notes = (
    notes_df.merge(val_ids, on='hadm_id', how='inner')
            .assign(split='val')
)
test_notes = (
    notes_df.merge(test_ids, on='hadm_id', how='inner')
            .assign(split='test')
)

In [14]:
# Stack the three splits, then attach the label string of each admission;
# the inner join keeps only admissions present in labels_df
split_notes = pd.concat([train_notes, val_notes, test_notes], axis=0)
labeled_notes = split_notes.merge(labels_df, on='hadm_id', how='inner')

In [15]:
# Write the labeled notes next to the source tables, without the index
out_path = f'{data_path}/restricted_mimic_iii/labeled_notes.csv'
labeled_notes.to_csv(out_path, index=False)