In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from icd9 import *
from collections import defaultdict
# Base data directory. Written WITH a trailing slash because later cells
# append file names directly to it (e.g. f'{data_path}node_desc.csv').
data_path = '../data/'
# Directory the {split}_full.csv datasets and the code description tables are
# read from. Written WITHOUT a trailing slash because later cells insert the
# '/' separator themselves (e.g. f'{mimic_path}/{split}_full.csv').
mimic_path = '../data/restricted_mimic_iii'

This notebook maps labels in CAML-formatted datasets to different maximum depths and confirms that the mapped-to codes exist in the format expected by CAML description lookups. To produce the CAML-formatted datasets required to run the code below, please see the CAML repo.
Credit: https://github.com/jamesmullenbach/caml-mimic

In [2]:
# Collect every distinct label code that appears in the CAML train/dev/test
# datasets, so that they can be compared against the codes in the ICD-9 tree.
caml_ds_codes = set()
for split in ['train', 'dev', 'test']:
    df = pd.read_csv(f'{mimic_path}/{split}_full.csv')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df['LABELS'] selection: the in-place form on a selected column is
    # deprecated chained assignment and leaves df unchanged under Copy-on-Write.
    # Rows without labels become '', which splits to the single code ''.
    df['LABELS'] = df['LABELS'].fillna('')
    code_series = df['LABELS'].str.split(';').apply(set)
    for code_set in code_series:
        # update() grows the accumulator in place; rebuilding it with union()
        # would copy the whole accumulated set once per row.
        caml_ds_codes.update(code_set)

In [3]:
# Build the ICD-9 tree used for mapping codes to their ancestors.
# It is constructed from two CSV files located under data_path:
# node_desc.csv and node_parent.csv.
# NOTE(review): ICD9Tree is not defined in this notebook; presumably it is
# provided by the `from icd9 import *` in the first cell — confirm.
tree = ICD9Tree(f'{data_path}node_desc.csv', f'{data_path}node_parent.csv')

In [4]:
# Codes that occur in the CAML datasets but have no key in tree.nodes.
caml_ds_codes - tree.nodes.keys()

{'',
 '11.',
 '11.8',
 '12.',
 '13.',
 '14.',
 '15.',
 '16.',
 '17.',
 '17.0',
 '22.',
 '23.9',
 '29.6',
 '30.9',
 '32.',
 '34.',
 '36.01',
 '36.02',
 '36.05',
 '40.7',
 '43.',
 '45.',
 '47.4',
 '50.',
 '51.',
 '52.',
 '54.',
 '55.',
 '61.',
 '63.',
 '64.',
 '719.70',
 '93.'}

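The 'XX.' pattern is handled in the max_depth_labels helper below: the tree stores these codes without the trailing '.', so the helper strips it before looking the code up. The empty string '' comes from rows that have no labels (missing values are filled with '' before splitting). I am ignoring the remaining omissions for now; they appear to be missing from the tree because they are the codes affected by the CAML pipeline issue that erroneously drops leading zeros.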

In [5]:
# Private helper for max_depth_labels: resolves a single label.
def _ancestor_code(label, max_depth, tree):
    """Return the code of the ancestor of `label` at depth <= max_depth.

    Returns None when `tree.get_node` has no node for the label.
    """
    # Dataset labels may look like 'XX.' while the tree is keyed without the
    # trailing '.', so strip a single trailing dot before the lookup.
    lookup_code = label[:-1] if label.endswith('.') else label
    node = tree.get_node(lookup_code)
    if node is None:
        return None
    # Walk towards the root until the node is no deeper than max_depth.
    while node.depth > max_depth:
        node = node.parent
    return node.code


# This helper takes a series of sets of labels and maps to labels at
# a maximum depth in the ICD-9 ontology
def max_depth_labels(labels, max_depth, tree):
    """Helper for remapping labels to a maximum depth.

    Each label is replaced by the code of its closest ancestor whose depth
    is at most `max_depth` (the label's own node if it is already shallow
    enough). Labels for which `tree.get_node` returns None are dropped from
    the output sets.

    Arguments
    *********
    labels : pandas Series of sets
        A series containing sets of labels. Any other iterable of sets is
        accepted as well.
    max_depth : int
        The maximum depth to map labels to.
    tree : ICD9Tree object
        A tree object used for mapping nodes to ancestors. Only its
        `get_node(code)` method and the `depth`, `parent` and `code`
        attributes of the returned nodes are used.

    Returns
    *******
    A series with remapped labels (one set per input entry). When `labels`
    is a pandas Series the result carries the same index, so assigning it
    back to the originating DataFrame aligns row by row; otherwise it has
    the default integer index.
    """
    # Builds a label -> ancestor-code map for efficiency. Misses are stored
    # as None so an unknown label is only looked up in the tree once.
    ancestor_cache = {}
    results = []
    for label_set in labels:
        remapped = set()
        for label in label_set:
            if label not in ancestor_cache:
                ancestor_cache[label] = _ancestor_code(label, max_depth, tree)
            ancestor = ancestor_cache[label]
            if ancestor is not None:
                remapped.add(ancestor)
        results.append(remapped)

    # Keep the caller's index: a fresh 0..n-1 index would misalign (and yield
    # NaN) when assigned to a DataFrame whose index is not the default one.
    index = labels.index if isinstance(labels, pd.Series) else None
    # dtype=object keeps the result well-defined for empty input as well.
    return pd.Series(results, index=index, dtype=object)

In [6]:
# The following files are used for indexing codes in CAML.
# I will load them into a set to confirm codes are mapped to new codes that will be recognized by CAML
# ICD9_CODE is read as str explicitly: with dtype inference a column made up
# only of digits is parsed as integers, which silently drops leading zeros
# and makes the values unusable for the string-based reformat helper.
diag_desc_df = pd.read_csv(mimic_path+'/D_ICD_DIAGNOSES.csv', dtype={'ICD9_CODE': str})
proc_desc_df = pd.read_csv(mimic_path+'/D_ICD_PROCEDURES.csv', dtype={'ICD9_CODE': str})
# This file has no header row and is tab-separated, so column names are supplied here.
icd_desc_df = pd.read_csv(mimic_path+'/ICD9_descriptions', header=None,
                          delimiter='\t', names=['ICD9_CODE', 'description'],
                          dtype={'ICD9_CODE': str})

In [7]:
# This helper reproduces the behavior of the one in the CAML code
def reformat(code, is_diag):
    """
        Insert the period that the MIMIC-3 data files leave out of ICD-9 codes.

        Any period already present is removed first. Procedure codes always
        get a period after the first two characters (even when nothing
        follows it). Diagnosis codes get a period after the first three
        characters, or after the first four when the code starts with 'E',
        but only if at least one character follows that position.
    """
    bare = code.replace('.', '')
    if not is_diag:
        return bare[:2] + '.' + bare[2:]
    # E-codes carry one extra leading character before the period.
    prefix_len = 4 if bare.startswith('E') else 3
    if len(bare) <= prefix_len:
        return bare
    return bare[:prefix_len] + '.' + bare[prefix_len:]

In [8]:
# Reformat codes from mimic-iii to CAML format
diag_desc_df['ICD9_CODE'] = diag_desc_df['ICD9_CODE'].apply(reformat, is_diag=True)
# The procedure table must be reformatted from its OWN codes. Reading them
# from diag_desc_df would overwrite the procedure codes with diagnosis codes,
# so no real procedure code would ever be found in the CAML code set.
# astype(str) guards against pandas having inferred an integer dtype for the
# all-digit procedure codes; note that leading zeros are already lost in that
# case unless the CSV was read with ICD9_CODE as str.
proc_desc_df['ICD9_CODE'] = proc_desc_df['ICD9_CODE'].astype(str).apply(reformat, is_diag=False)

In [9]:
# Gather the ICD9_CODE values of all three description tables into one set
caml_codes = {
    icd_code
    for desc_df in (diag_desc_df, proc_desc_df, icd_desc_df)
    for icd_code in desc_df['ICD9_CODE'].tolist()
}

In [10]:
# Load in CAML-formatted datasets with all codes, remap their labels to each
# maximum depth and write one {split}_md{depth}.csv file per split and depth.
# Remapped codes that are absent from caml_codes are collected in not_found.
not_found = set()
for split in ['train', 'dev', 'test']:
    df = pd.read_csv(f'{mimic_path}/{split}_full.csv')
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the df['LABELS'] selection: the in-place form is deprecated chained
    # assignment and is a no-op under Copy-on-Write, which would leave NaN in
    # the column and make .apply(set) fail on rows without labels.
    df['LABELS'] = df['LABELS'].fillna('').str.split(';').apply(set)
    # Depths 1 through 7 are generated.
    for md in range(1,8):
        df['md'] = max_depth_labels(df['LABELS'], md, tree)
        for new_set in df['md']:
            not_found |= new_set - caml_codes
        # sorted() makes the label order in the written file reproducible;
        # iteration order of a set of strings can differ between runs.
        df['md'] = df['md'].apply(lambda x : ';'.join(sorted(x)))
        # The temporary 'md' column is written out under the name 'LABELS'.
        df.to_csv(f'{mimic_path}/{split}_md{md}.csv', columns=['SUBJECT_ID', 'HADM_ID', 'TEXT', 'md', 'length'],
                  header=['SUBJECT_ID', 'HADM_ID', 'TEXT', 'LABELS', 'length'], index=False)
# Only warn when there is something to report; an unconditional message would
# print "Warning: set() not found ..." even when every code was recognized.
if not_found:
    print(f'Warning: {not_found} not found in CAML codes. This may cause problems if using DR-CAML.')

