In [1]:
import pandas as pd
import numpy as np
import sys
import re
# Add the parent directory to the module search path before importing icd9
# (presumably icd9.py lives one level up — confirm against the repo layout).
sys.path.append('../')
# Base directory for all data files; note that the value ends with a slash.
data_path = '../data/'
# NOTE(review): the star import hides which names come from icd9; ICD9Tree
# (used later in this notebook) presumably comes from here.
from icd9 import *

In [2]:
# Load the labeled notes. data_path already ends with '/', so no extra
# separator is added (the original produced '../data//restricted_mimic_iii/...').
data = pd.read_csv(f'{data_path}restricted_mimic_iii/labeled_notes.csv')

In [3]:
# Use basic text cleaning functions from reference book
# Citation: Python Machine Learning 2nd Edition, Raschka
def preprocessor(text):
    """Clean a raw text string for bag-of-words tokenization.

    Steps:
      1. Strip HTML/XML-style tags (anything between '<' and '>').
      2. Collect emoticons such as ':)', ';-(' or '=D' from the tag-free text.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the '-' nose removed) to the
         end of the cleaned text, separated by spaces.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. Leading/trailing spaces may remain; they are
        harmless for whitespace tokenization.
    """
    # Raw strings avoid invalid-escape warnings for '\)', '\(' and '\W'.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    if emoticons:
        # Separate the emoticons from the preceding word; without the space
        # the last word and the first emoticon fuse into one token.
        cleaned = cleaned + ' ' + ' '.join(emoticons).replace('-', '')
    return cleaned

def tokenize(text):
    """Split text on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

In [4]:
# Clean every note: the 'text' column is replaced by its preprocessed version
data['text'] = data['text'].map(preprocessor)

For now, I will follow Perotte et al. (2014) and use only the 10,000 most frequent unigrams.

In [5]:
# Build unigram TF-IDF features: the vectorizer is fitted on the training
# split only and then reused, unchanged, to encode the validation split.
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000,
)

# reset_index() gives each split a fresh 0..n-1 index (the old index is kept
# as an 'index' column).
train_data = data[data['split'].eq('train')].reset_index()
val_data = data[data['split'].eq('val')].reset_index()

train_X = tfidf_vectorizer.fit_transform(train_data['text'].values)
val_X = tfidf_vectorizer.transform(val_data['text'].values)

Below is a simple demo of fitting hierarchical SVMs and then making predictions.

In [6]:
# Create the ICD-9 tree from the two CSV files under data_path, then index
# the training notes against it using their 'fcode' column.
tree = ICD9Tree(
    f'{data_path}node_desc.csv',
    f'{data_path}node_parent.csv',
)
tree.index_df(train_data, codes='fcode')

In [7]:
# Fit models and make predictions for entire ICD-9 tree
# (test_node is the starting node passed to both fit_hsvm and predict_hsvm below;
# using the root covers the whole tree).
test_node = tree.root

In [8]:
# Fit the hierarchical SVMs on the training TF-IDF matrix, starting at test_node.
# max_depth=None presumably means "no depth limit" — confirm in icd9.py.
tree.fit_hsvm(train_X, test_node, max_depth=None)

22406 model fits attempted.


In [9]:
# Predict for the validation TF-IDF matrix, starting from the same node used
# for fitting. The result is later converted to one set of predictions per note.
preds = tree.predict_hsvm(val_X, test_node)

In [10]:
# Compare predictions to actual, only looking at recall for now.
# Per-note recall = |true codes ∩ predicted codes| / |true codes|.
val_data['preds'] = pd.Series(preds).apply(set)


def _as_code_set(raw_codes):
    """Convert one 'fcode' cell into a set of codes.

    * A ';'-separated string becomes the set of its parts.
    * A value that is already a set is returned as a set, so this cell can be
      re-run without failing.
    * A missing value becomes {''}, i.e. it is treated like an empty string.
      Such notes therefore get recall 0 unless '' is predicted.
    * Anything else raises TypeError rather than being silently accepted.
    """
    if isinstance(raw_codes, str):
        return set(raw_codes.split(';'))
    if isinstance(raw_codes, (set, frozenset)):
        return set(raw_codes)
    if pd.api.types.is_scalar(raw_codes) and pd.isna(raw_codes):
        return {''}
    raise TypeError(f'Unexpected fcode value: {raw_codes!r}')


# Plain column assignment instead of a chained fillna(inplace=True), which
# does not modify the frame when pandas Copy-on-Write is active.
val_data['fcode'] = val_data['fcode'].apply(_as_code_set)

# Build the recall column in one pass as floats, rather than creating an
# integer column and overwriting it row by row.
val_data['recall'] = pd.Series(
    [
        len(true_codes.intersection(pred_codes)) / len(true_codes)
        for true_codes, pred_codes in zip(val_data['fcode'], val_data['preds'])
    ],
    index=val_data.index,
    dtype=float,
)

print(f'Average recall: {val_data.recall.mean()}')

Average recall: 0.3667077935714987


In [11]:
# As a crude baseline, I will repeat using shuffled data which should be even worse than majority voting.
# The validation rows are permuted before prediction, so each note is scored
# against predictions made for a different note.
# A fixed seed makes the baseline number reproducible across runs.
shuffle_rng = np.random.default_rng(0)
shuffled_rows = shuffle_rng.permutation(val_X.shape[0])
preds2 = tree.predict_hsvm(val_X[shuffled_rows, :], test_node)

# Compare predictions to actual, only looking at recall for now.
# NOTE: this overwrites the 'preds' and 'recall' columns of val_data with the
# shuffled-baseline values.
val_data['preds'] = pd.Series(preds2).apply(set)

# Build the recall column in one pass as floats, rather than creating an
# integer column and overwriting it row by row.
val_data['recall'] = pd.Series(
    [
        len(true_codes.intersection(pred_codes)) / len(true_codes)
        for true_codes, pred_codes in zip(val_data['fcode'], val_data['preds'])
    ],
    index=val_data.index,
    dtype=float,
)

print(f'Average recall: {val_data.recall.mean()}')

Average recall: 0.06534520358534794


Great, the hierarchical SVM shows a clear lift in recall over the shuffled baseline. The next steps are to compare against a majority-class baseline (a more realistic one), look at other metrics such as precision and F1, and compare to a flat SVM.