In [43]:
import numpy as np
import pandas as pd
import sys
import os
from collections import defaultdict

In [2]:
# Directory that holds the icd9 module and its codes.json; it is appended to
# sys.path so the `from icd9 import ICD9` below can be resolved.
icd9_tree_path = '../icd9-tree/'
sys.path.append(icd9_tree_path)
from icd9 import ICD9
# Construct the ICD9 object from codes.json in that directory.
# Feel free to replace this with your own path to the json file.
tree = ICD9(os.path.join(icd9_tree_path,'codes.json'))

This builds an index from ICD-9 codes to the records that contain them. The index is then used with the ICD-9 tree to return the indices of all rows that contain the leaves corresponding to any node in the tree.

First, I load the processed discharge records (see the CAML repo for the method used to build notes_labeled.csv from the MIMIC-III tables).

In [48]:
# Relative path to the processed discharge-records CSV.
file = '../mimicdata/mimic3/notes_labeled.csv'
# Load the records into a dataframe; later cells read its LABELS column.
df = pd.read_csv(file)

Convert each ';'-delimited string of ICD-9 codes into a set of codes.

In [54]:
# Turn each ';'-delimited LABELS string into a set of ICD-9 codes.
# - fillna() is chained and assigned back instead of being called with
#   inplace=True on df['LABELS']: the chained in-place form operates on an
#   intermediate object and does not update df under pandas Copy-on-Write.
# - Empty strings (produced by missing labels or stray delimiters) are
#   dropped so that '' is never treated as a code.
df['LABELS'] = (
    df['LABELS']
    .fillna('')
    .str.split(';')
    .apply(lambda codes: {code for code in codes if code})
)

Build index from ICD9 codes to rows in the dataframe.

In [58]:
# Maps each ICD-9 code to the list of dataframe index labels of the rows
# whose LABELS contain that code.
code_to_idx = defaultdict(list)

def build_code_to_idx(row):
    """Add one row's codes to ``code_to_idx`` and return the row unchanged.

    Appends ``row.name`` to the entry of every code in ``row.LABELS``.
    NOTE(review): presumably meant for ``df.apply(build_code_to_idx, axis=1)``;
    it is not called in this notebook section. Do not combine it with the
    loop below, or every row would be indexed twice.
    """
    for code in row.LABELS:
        code_to_idx[code].append(row.name)
    return row

# Iterate over the LABELS column only: iterrows() would build a full Series
# for every row just to read this one field.
for index, labels in df['LABELS'].items():
    for code in labels:
        code_to_idx[code].append(index)

Finally, define a helper that returns a list of indices for the records containing any descendants of a given node in the tree. The arguments are the ICD-9 tree object and a string with the node's name in the tree.

In [128]:
def get_idx_for_node(tree, node):
    """Return the row indices of records labeled with any leaf code under a node.

    Args:
        tree: ICD-9 tree object; must provide ``find(name)`` returning a node
            (or ``None``) whose ``leaves`` yield objects with a ``code``
            attribute.
        node: Name of the node to look up in ``tree``.

    Returns:
        A list (unordered, without duplicates) of the row indices stored in
        the module-level ``code_to_idx`` index for the node's leaf codes, or
        ``None`` (after printing a message) if ``tree.find(node)`` returns
        ``None``.
    """
    root = tree.find(node)
    if root is None:
        print("Node {} not found.".format(node))
        return None

    idx = set()
    for leaf in root.leaves:
        # Use .get() rather than indexing: code_to_idx is a defaultdict, so
        # indexing a code that never occurs in the data would insert an empty
        # entry and grow the index on every lookup.
        idx.update(code_to_idx.get(leaf.code, ()))
    return list(idx)