In [1]:
import pandas as pd
import numpy as np
import sys
import re
# Add the parent directory to the module search path — presumably so the
# project-local `icd9` module imported below can be found (TODO confirm its location).
sys.path.append('../')
# Path prefixes, relative to this notebook, used to build data and model file names.
data_path = '../data/'
model_path = '../models/'
# NOTE(review): a wildcard import hides where names come from; the only name
# this notebook appears to take from `icd9` is ICD9Tree.
from icd9 import *


In [2]:
# Load the labeled notes. Later cells read the 'text', 'split' and 'fcode'
# columns of this frame (or of the per-split frames derived from it).
data = pd.read_csv(f'{data_path}restricted_mimic_iii/labeled_notes.csv')

In [3]:
# Use basic text cleaning functions from reference book
# Citation: Python Machine Learning 2nd Edition, Raschka
def preprocessor(text):
    """Normalize one raw document into lowercase, space-separated words.

    Markup tags of the form ``<...>`` are removed, emoticons such as ``:)``
    or ``;-P`` are collected from the tag-free text, every run of non-word
    characters is collapsed into a single space, and the collected emoticons
    (with any ``-`` removed) are appended to the end of the result.

    text : String
        The raw document.

    Returns the cleaned String.
    """
    # The patterns are raw strings: '\W', '\)' and '\(' are invalid escape
    # sequences in an ordinary string literal and trigger a warning on
    # recent Python versions.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    # The emoticons are appended without a separator, so they follow the last
    # word directly unless the text ended in a non-word character. This is
    # kept as-is so the extracted features do not change.
    return cleaned + ' '.join(emoticons).replace('-', '')

def tokenize(text):
    """Break ``text`` into a list of tokens on runs of whitespace."""
    tokens = text.split()
    return tokens

In [4]:
# Preprocess all text. The raw 'text' column is overwritten in place with the
# cleaned version, so the original text is no longer available in `data`.
data['text'] = data['text'].apply(preprocessor)

For now, I will follow Perotte 2014 and use only the top 10,000 most frequent unigrams. 

In [5]:
# Next, fit tfidf to training data and transform other splits
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram tf-idf over whitespace tokens, capped at 10,000 vocabulary entries
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000,
)

# One frame per split, selected on the 'split' column and renumbered from 0
train_data, val_data, test_data = (
    data.loc[data['split'] == split_name, :].reset_index()
    for split_name in ('train', 'val', 'test')
)

# The vectorizer is fitted on the training notes only; the validation and
# test notes are transformed with that already-fitted vectorizer.
train_X = tfidf_vectorizer.fit_transform(train_data['text'].values)
val_X, test_X = (
    tfidf_vectorizer.transform(split_frame['text'].values)
    for split_frame in (val_data, test_data)
)

Below is a simple demo of fitting hierarchical SVMs and then making predictions.

In [6]:
# Build the ICD-9 tree from the node_desc.csv and node_parent.csv files,
# then index the training frame using its 'fcode' column
tree = ICD9Tree(data_path + 'node_desc.csv', data_path + 'node_parent.csv')
tree.index_df(train_data, codes='fcode')

In [7]:
# Choose the node that the fit/predict calls in the following cells receive.
# Using the tree's root targets the entire ICD-9 tree; nothing is fitted or
# predicted in this cell itself.
test_node = tree.root

To retrain and save the models, flip the comments in the two cells below. Otherwise, simply load the saved models from pickle files.

In [8]:
# Retraining path (disabled): uncomment the next two lines and comment out the
# load_models call to refit and re-save the 'h' (hierarchical) models.
#tree.fit_hmodel(train_X, test_node, max_depth=None)
#tree.save_models(model_path+'hsvm_C1.obj', model_type='h')
# Default path: load the previously saved models.
# NOTE(review): the notebook text describes these as pickle files — only load
# model files from a trusted source.
tree.load_models(model_path+'hsvm_C1.obj', model_type='h')

In [9]:
# Retraining path (disabled): uncomment the next two lines and comment out the
# load_models call to refit and re-save the 'f' (flat) models.
#tree.fit_fmodel(train_X, test_node, max_depth=None)
#tree.save_models(model_path+'fsvm_C1.obj', model_type='f')
# Default path: load the previously saved models.
# NOTE(review): the notebook text describes these as pickle files — only load
# model files from a trusted source.
tree.load_models(model_path+'fsvm_C1.obj', model_type='f')

In [10]:
# Hierarchical-model predictions for the train, validation and test feature
# matrices, computed in that order
h_preds_train, h_preds_val, h_preds_test = (
    tree.predict_hmodel(features, test_node)
    for features in (train_X, val_X, test_X)
)

In [11]:
# Flat-model predictions for the train, validation and test feature matrices,
# computed in that order
f_preds_train, f_preds_val, f_preds_test = (
    tree.predict_fmodel(features, test_node)
    for features in (train_X, val_X, test_X)
)

In [12]:
# Function for comparing flat and hierarchical models
def flat_v_hier_eval(df, h_preds, f_preds, codes='fcode'):
    """Helper for evaluating flat v. hierarchical models.

    Prints micro-averaged precision, recall and f1 for both models. Matching
    is strict: a predicted code is a true positive only when it is exactly
    equal to one of the row's true codes.

    df : pandas DataFrame
        The evaluation set features and true labels, one row per note.
    h_preds : list of lists of code Strings
        Labels predicted by the hierarchical model, one inner list per row
        of `df`, in row order.
    f_preds : list of lists of code Strings
        Labels predicted by the flat model, one inner list per row of `df`,
        in row order.
    codes : String
        The name of the column containing the codes as ";" delimited
        String. Missing values and empty Strings mean "no codes".

    Returns None; the metrics are printed.

    Raises ValueError if `h_preds` or `f_preds` does not have exactly one
    entry per row of `df`.

    Side effects: `df` is modified in place. The `codes` column is replaced
    by sets of codes, and the columns 'h_preds', 'f_preds' (sets of predicted
    codes) and 'h_match', 'f_match' (number of exact matches per row) are
    added or overwritten. Calling the function again on the same frame is
    safe.
    """
    def to_code_set(value):
        """Turn one cell of the codes column into a set of non-empty codes."""
        if isinstance(value, str):
            # ''.split(';') is [''], so drop empty pieces: a note without
            # codes must not count as having one (empty) true code.
            return {code for code in value.split(';') if code}
        if isinstance(value, (set, frozenset, list, tuple)):
            # Already parsed, e.g. this frame was evaluated before.
            return {code for code in value if code}
        # NaN / None: the note has no codes.
        return set()

    def ratio(numerator, denominator):
        """Divide, giving 0.0 instead of failing when the denominator is 0."""
        return numerator / denominator if denominator else 0.0

    def f1_score(prec, recall):
        """Compute f1 score

        Source: https://en.wikipedia.org/wiki/F1_score
        """
        return ratio(2 * (prec * recall), prec + recall)

    for name, preds in (('h_preds', h_preds), ('f_preds', f_preds)):
        if len(preds) != len(df):
            raise ValueError(
                f'{name} has {len(preds)} entries but df has {len(df)} rows')

    true_sets = [to_code_set(value) for value in df[codes]]
    h_sets = [set(preds) for preds in h_preds]
    f_sets = [set(preds) for preds in f_preds]

    h_match = [len(true & pred) for true, pred in zip(true_sets, h_sets)]
    f_match = [len(true & pred) for true, pred in zip(true_sets, f_sets)]

    # Write the per-row results back by position; reusing df.index avoids any
    # label-based realignment when df does not have a default 0..n-1 index.
    df[codes] = pd.Series(true_sets, index=df.index, dtype=object)
    df['h_preds'] = pd.Series(h_sets, index=df.index, dtype=object)
    df['f_preds'] = pd.Series(f_sets, index=df.index, dtype=object)
    df['h_match'] = h_match
    df['f_match'] = f_match

    # Micro-averaging: pool the counts over all rows before dividing.
    n_true = sum(len(true) for true in true_sets)
    h_hits = sum(h_match)
    f_hits = sum(f_match)

    h_recall = ratio(h_hits, n_true)
    h_prec = ratio(h_hits, sum(len(pred) for pred in h_sets))

    f_recall = ratio(f_hits, n_true)
    f_prec = ratio(f_hits, sum(len(pred) for pred in f_sets))

    print(f'Flat micro-avg metrics: precision = {f_prec:.2f}, recall = {f_recall:.2f}, f1 = {f1_score(f_prec, f_recall):.2f}')
    print(f'Hierarchical micro-avg metrics: precision = {h_prec:.2f}, recall = {h_recall:.2f}, f1 = {f1_score(h_prec, h_recall):.2f}')

    return None

In [13]:
# Evaluate models on training set.
# Note: the call also modifies train_data in place (the 'fcode' column is
# turned into sets and prediction/match columns are added).
flat_v_hier_eval(train_data, h_preds_train, f_preds_train)

Flat micro-avg metrics: precision = 0.79, recall = 0.49, f1 = 0.6014615492289305
Hierarchical micro-avg metrics: precision = 0.72, recall = 0.64, f1 = 0.6800467863386154


In [14]:
# Evaluate models on validation set.
# Note: the call also modifies val_data in place (the 'fcode' column is
# turned into sets and prediction/match columns are added).
flat_v_hier_eval(val_data, h_preds_val, f_preds_val)

Flat micro-avg metrics: precision = 0.62, recall = 0.23, f1 = 0.3400204185809086
Hierarchical micro-avg metrics: precision = 0.50, recall = 0.34, f1 = 0.4062975072367188


In [15]:
# Evaluate models on test set.
# Note: the call also modifies test_data in place (the 'fcode' column is
# turned into sets and prediction/match columns are added).
flat_v_hier_eval(test_data, h_preds_test, f_preds_test)

Flat micro-avg metrics: precision = 0.61, recall = 0.23, f1 = 0.33221391687430035
Hierarchical micro-avg metrics: precision = 0.49, recall = 0.33, f1 = 0.3938021874632483


I cannot compare directly to Perotte 2014, as so far I have only run this on MIMIC III using the CAML paper splits. However, these numbers are similar so far. Perotte found f1/precision/recall of 0.21/0.56/0.13 and 0.29/0.39/0.23 for the flat and hierarchical SVM models, respectively. 

In comparison, on the test set I am finding f1/precision/recall of 0.33/0.61/0.23 and 0.39/0.49/0.33 for the flat and hierarchical SVM models, respectively.

A MIMIC III comparison point is the micro-f1 score reported by Mullenbach et al. (2018). Mullenbach et al. found 0.272 with a flat logistic regression model (a very similar model to a linear SVC), while I found 0.332 with a flat linear SVC and 0.394 with a hierarchical SVC.

Note: I am using the strict version of precision/recall, where only exact code matches are considered. Perotte also presents a less strict version of precision/recall, where ancestors/descendants are considered for a true positive match and descendants are considered for overriding an otherwise false negative.