In [1]:
import pandas as pd
%run utils.ipynb

In [2]:
df = pd.read_csv('data.csv')

In [6]:
df['tokens'] = df['text'].apply(process_text)
df['umls_tokens'] = df['text'].apply(getUMLSTokens)

In [7]:
# df.to_csv('data_tokenized.csv', index=False)

In [293]:
value_distribution = df.loc[:,"dyslipidemia":"readmission"].apply(pd.value_counts).transpose()
value_distribution

Unnamed: 0,Maybe,No,Yes
dyslipidemia,3.0,1948.0,1011.0
fluid_electrolyte_disorder,,2256.0,706.0
obesity,4.0,2240.0,718.0
cancer,11.0,2078.0,873.0
peptic_ulcer,6.0,1988.0,968.0
hypertension,2.0,1512.0,1448.0
readmission,,2199.0,763.0


In [1]:
import pandas as pd

'''Features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

'''Display'''
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

# load utility functions
%run utils.ipynb

Starting LanguageProcessor...
Initializing Tokenizer...
Initializing EmrSectionizer...
Initializing NegationMatcher...
Initializing MedicalPhraseMatcher...
Initializing EmrPhraseNormalizer...
Initializing EmrLanguagePostProcessor...
LanguageProcessor ready: ['tagger', 'custom_sentencizer', 'emr_sectionizer', 'medical_phrase_matcher', 'emr_phrase_normalizer', 'negation_matcher', 'emr_post_processor']


In [197]:
def get_train_test_data(feature_col, target_col, target_mapper):
    X = get_vectorized_data(df[feature_col].astype('str'))
    y = df[target_col].map(target_mapper).values
    #Train test split with stratified sampling for evaluation
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,shuffle=True,random_state=3,stratify = y,)
    
    return X_train, y_train, X_test, y_test

In [218]:
def get_vectorized_data(text_array):
    
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), 
                                   min_df = 2, 
                                   max_df = .95)

    X = tfidf_vectorizer.fit_transform(text_array) #features
    
    return reduce_vector_dimension(X)

In [219]:
def reduce_vector_dimension(X):
    lsa = TruncatedSVD(n_components=100, n_iter=10, random_state=3)
    X = lsa.fit_transform(X)
    return X

In [247]:
def test_multiple_classifiers(classifiers_dict, X_train, y_train, X_test, y_test, scenario_id='' ):
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    if scenario_id:
        scenario_id += ": "
    for k,v in classifiers_dict.items():   
        model_name.append(k)
        v.fit(X_train, y_train)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', scenario_id+'accuracy', scenario_id+'precision', scenario_id+'recall', scenario_id+'f1']
        model_comparison_df = model_comparison_df.sort_values(by=scenario_id+'f1', ascending=False)

    return model_comparison_df.set_index('model_name')

In [248]:
def test_multiple_features(feature_cols, target_col, target_mapper):
    results = None
    for feature_col in feature_cols:
        X_train, y_train, X_test, y_test = get_train_test_data(feature_col, target_col, target_mapper )
        results = pd.concat([test_multiple_classifiers(classifier_dict, X_train, y_train, X_test, y_test, feature_col), results], axis=1)
                             
    return results

In [249]:
df = pd.read_csv('data_tokenized.csv')

In [250]:
classifier_dict = { 
    'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='hinge', penalty='l2', alpha=1e-3,  max_iter=5),
    'Logistic Regression': LogisticRegression(n_jobs=1, C=1e5),
    'Random Forest': RandomForestClassifier(random_state=3),
    'Decsision Tree': DecisionTreeClassifier(random_state=3),
    'AdaBoost': AdaBoostClassifier(random_state=3),
    'Gaussian Naive Bayes': GaussianNB(),
    'K Nearest Neighbor': KNeighborsClassifier(),
}

In [252]:
feature_col = 'tokens'
target_col = 'hypertension'
target_mapper = {"Yes": 1, "Maybe": 1, "No": 0}

X_train, y_train, X_test, y_test = get_train_test_data(feature_col, target_col, target_mapper )

test_multiple_classifiers(classifier_dict, X_train, y_train, X_test, y_test)

Unnamed: 0_level_0,accuracy,precision,recall,f1
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Stochastic Gradient Descent,0.7,0.7,0.7,0.7
Random Forest,0.66,0.66,0.66,0.66
Logistic Regression,0.66,0.66,0.66,0.66
AdaBoost,0.64,0.64,0.64,0.64
Gaussian Naive Bayes,0.63,0.63,0.63,0.63
K Nearest Neighbor,0.62,0.62,0.62,0.62
Decsision Tree,0.55,0.55,0.55,0.55


In [253]:
target_col = 'hypertension'
feature_cols = ['tokens', 'umls_tokens']
test_multiple_features(feature_cols, target_col, target_mapper)

Unnamed: 0,umls_tokens: accuracy,umls_tokens: precision,umls_tokens: recall,umls_tokens: f1,tokens: accuracy,tokens: precision,tokens: recall,tokens: f1
Logistic Regression,0.76,0.76,0.76,0.76,0.66,0.66,0.66,0.66
Stochastic Gradient Descent,0.75,0.75,0.75,0.75,0.7,0.7,0.7,0.7
AdaBoost,0.68,0.68,0.68,0.68,0.64,0.64,0.64,0.64
Random Forest,0.68,0.68,0.68,0.68,0.66,0.66,0.66,0.66
Gaussian Naive Bayes,0.67,0.67,0.67,0.67,0.63,0.63,0.63,0.63
Decsision Tree,0.6,0.6,0.6,0.6,0.55,0.55,0.55,0.55
K Nearest Neighbor,0.58,0.58,0.58,0.58,0.62,0.62,0.62,0.62


In [255]:
target_col = 'peptic_ulcer'
test_multiple_features(feature_cols, target_col, target_mapper)

Unnamed: 0,umls_tokens: accuracy,umls_tokens: precision,umls_tokens: recall,umls_tokens: f1,tokens: accuracy,tokens: precision,tokens: recall,tokens: f1
Gaussian Naive Bayes,0.65,0.6,0.6,0.6,0.61,0.6,0.61,0.6
Logistic Regression,0.7,0.65,0.6,0.6,0.7,0.66,0.61,0.62
AdaBoost,0.66,0.58,0.56,0.56,0.66,0.6,0.58,0.58
Decsision Tree,0.57,0.52,0.52,0.52,0.63,0.58,0.59,0.59
K Nearest Neighbor,0.61,0.51,0.51,0.5,0.66,0.59,0.57,0.57
Stochastic Gradient Descent,0.68,0.65,0.53,0.49,0.68,0.68,0.53,0.47
Random Forest,0.67,0.58,0.52,0.47,0.69,0.64,0.57,0.55


In [256]:
target_col = 'cancer'
test_multiple_features(feature_cols, target_col, target_mapper)

Unnamed: 0,umls_tokens: accuracy,umls_tokens: precision,umls_tokens: recall,umls_tokens: f1,tokens: accuracy,tokens: precision,tokens: recall,tokens: f1
Logistic Regression,0.73,0.67,0.61,0.62,0.82,0.81,0.73,0.75
K Nearest Neighbor,0.72,0.65,0.6,0.61,0.77,0.73,0.68,0.7
Gaussian Naive Bayes,0.69,0.62,0.6,0.61,0.76,0.72,0.68,0.69
AdaBoost,0.7,0.61,0.58,0.58,0.77,0.72,0.69,0.7
Decsision Tree,0.63,0.57,0.57,0.57,0.69,0.63,0.63,0.63
Stochastic Gradient Descent,0.74,0.74,0.58,0.56,0.8,0.84,0.67,0.69
Random Forest,0.73,0.72,0.57,0.56,0.79,0.8,0.68,0.7


In [257]:
target_col = 'obesity'
test_multiple_features(feature_cols, target_col, target_mapper)

Unnamed: 0,umls_tokens: accuracy,umls_tokens: precision,umls_tokens: recall,umls_tokens: f1,tokens: accuracy,tokens: precision,tokens: recall,tokens: f1
Gaussian Naive Bayes,0.69,0.59,0.59,0.59,0.7,0.6,0.6,0.6
AdaBoost,0.74,0.6,0.55,0.55,0.74,0.59,0.55,0.55
K Nearest Neighbor,0.72,0.57,0.54,0.53,0.72,0.59,0.56,0.56
Decsision Tree,0.65,0.53,0.53,0.53,0.67,0.56,0.57,0.56
Logistic Regression,0.75,0.62,0.54,0.53,0.75,0.63,0.58,0.58
Random Forest,0.75,0.61,0.52,0.48,0.75,0.64,0.55,0.55
Stochastic Gradient Descent,0.76,0.38,0.5,0.43,0.76,0.69,0.51,0.45


In [258]:
target_col = 'fluid_electrolyte_disorder'
test_multiple_features(feature_cols, target_col, target_mapper)

Unnamed: 0,umls_tokens: accuracy,umls_tokens: precision,umls_tokens: recall,umls_tokens: f1,tokens: accuracy,tokens: precision,tokens: recall,tokens: f1
Logistic Regression,0.78,0.69,0.64,0.65,0.81,0.73,0.69,0.71
Gaussian Naive Bayes,0.7,0.62,0.65,0.63,0.59,0.62,0.67,0.57
AdaBoost,0.76,0.65,0.61,0.62,0.77,0.68,0.65,0.66
K Nearest Neighbor,0.75,0.64,0.59,0.6,0.76,0.65,0.6,0.61
Decsision Tree,0.7,0.59,0.6,0.59,0.71,0.61,0.62,0.62
Random Forest,0.78,0.76,0.55,0.53,0.78,0.7,0.6,0.61
Stochastic Gradient Descent,0.77,0.82,0.53,0.49,0.78,0.75,0.54,0.51


In [259]:
target_col = 'dyslipidemia'
test_multiple_features(feature_cols, target_col, target_mapper)

Unnamed: 0,umls_tokens: accuracy,umls_tokens: precision,umls_tokens: recall,umls_tokens: f1,tokens: accuracy,tokens: precision,tokens: recall,tokens: f1
Gaussian Naive Bayes,0.7,0.66,0.65,0.66,0.71,0.68,0.68,0.68
AdaBoost,0.71,0.68,0.65,0.65,0.73,0.7,0.66,0.67
Logistic Regression,0.72,0.7,0.64,0.65,0.76,0.76,0.69,0.7
Stochastic Gradient Descent,0.73,0.76,0.62,0.62,0.75,0.81,0.64,0.64
K Nearest Neighbor,0.69,0.65,0.61,0.61,0.71,0.67,0.64,0.65
Random Forest,0.71,0.71,0.61,0.6,0.73,0.73,0.64,0.64
Decsision Tree,0.6,0.56,0.56,0.56,0.62,0.58,0.58,0.58


In [260]:
target_col = 'readmission'
test_multiple_features(feature_cols, target_col, target_mapper)

Unnamed: 0,umls_tokens: accuracy,umls_tokens: precision,umls_tokens: recall,umls_tokens: f1,tokens: accuracy,tokens: precision,tokens: recall,tokens: f1
Gaussian Naive Bayes,0.61,0.54,0.55,0.54,0.59,0.6,0.62,0.56
K Nearest Neighbor,0.71,0.55,0.53,0.52,0.71,0.57,0.54,0.54
Logistic Regression,0.74,0.64,0.53,0.51,0.74,0.61,0.53,0.51
Decsision Tree,0.61,0.5,0.5,0.5,0.62,0.52,0.52,0.52
AdaBoost,0.71,0.54,0.52,0.49,0.72,0.57,0.53,0.52
Random Forest,0.74,0.54,0.5,0.43,0.74,0.64,0.51,0.46
Stochastic Gradient Descent,0.74,0.37,0.5,0.43,0.74,0.37,0.5,0.43


In [277]:
#Hyperparameter tuning

#SGD
loss =  ['hinge', 'log']
penalty = ['l2','l1']
alpha = [1e-6, 1e-3, 1e-1, 1e0]
max_iter = [1000, 10000]
tol = [None, 1e-3]
eta0 = [0, 0.1, 0.001]

random_state = [0]

X = get_vectorized_data(df['umls_tokens'].astype('str'))
y = df['hypertension'].map({"Yes": 1, "Maybe": 1, "No": 0}).values

clf = SGDClassifier()

params = dict(loss=loss,
              penalty=penalty,
              alpha=alpha,
              max_iter=max_iter,
              tol=tol,
              random_state=random_state)

gridsearch = GridSearchCV(clf,
                          params,
                          cv = 5,
                          verbose = 1, 
                          n_jobs = -1)

sgd_best_model = gridsearch.fit(X, y)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   11.3s
[Parallel(n_jobs=-1)]: Done 264 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 514 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:  7.0min finished


In [278]:
pd.DataFrame(gridsearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_loss,param_max_iter,param_penalty,param_random_state,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01,0.01,0.00,0.00,0.00,hinge,5,l2,0,,"{'alpha': 1e-06, 'loss': 'hinge', 'max_iter': ...",0.50,0.72,0.71,0.70,0.66,0.66,0.08,47
1,0.01,0.00,0.00,0.00,0.00,hinge,5,l2,0,0.00,"{'alpha': 1e-06, 'loss': 'hinge', 'max_iter': ...",0.50,0.72,0.71,0.70,0.66,0.66,0.08,47
2,0.01,0.01,0.00,0.00,0.00,hinge,5,l1,0,,"{'alpha': 1e-06, 'loss': 'hinge', 'max_iter': ...",0.52,0.52,0.73,0.72,0.67,0.63,0.09,56
3,0.03,0.01,0.00,0.00,0.00,hinge,5,l1,0,0.00,"{'alpha': 1e-06, 'loss': 'hinge', 'max_iter': ...",0.52,0.52,0.73,0.72,0.67,0.63,0.09,56
4,0.07,0.00,0.00,0.00,0.00,hinge,100,l2,0,,"{'alpha': 1e-06, 'loss': 'hinge', 'max_iter': ...",0.65,0.75,0.57,0.66,0.64,0.65,0.06,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,0.03,0.00,0.00,0.00,1.00,log,1000,l1,0,0.00,"{'alpha': 1.0, 'loss': 'log', 'max_iter': 1000...",0.51,0.51,0.51,0.51,0.51,0.51,0.00,73
124,10.62,0.11,0.00,0.00,1.00,log,10000,l2,0,,"{'alpha': 1.0, 'loss': 'log', 'max_iter': 1000...",0.51,0.51,0.51,0.51,0.51,0.51,0.00,73
125,0.01,0.00,0.00,0.00,1.00,log,10000,l2,0,0.00,"{'alpha': 1.0, 'loss': 'log', 'max_iter': 1000...",0.51,0.51,0.51,0.51,0.51,0.51,0.00,73
126,32.01,2.21,0.00,0.00,1.00,log,10000,l1,0,,"{'alpha': 1.0, 'loss': 'log', 'max_iter': 1000...",0.51,0.51,0.51,0.51,0.51,0.51,0.00,73


In [279]:
sgd_best_model.best_score_

0.7555729000501344

In [280]:
sgd_best_model.best_estimator_

SGDClassifier(alpha=1e-06, max_iter=10000, random_state=0, tol=None)

In [272]:
#Hyperparameter tuning

#LR

penalty = ['l2','l1', 'elasticnet']
C=[1, 100, 1000, 1e5]
max_iter = [100, 1000, 10000]
random_state = [3]

X = get_vectorized_data(df['umls_tokens'].astype('str'))
y = df['hypertension'].map({"Yes": 1, "Maybe": 1, "No": 0}).values

clf = LogisticRegression()

params = dict(
              penalty=penalty,
              max_iter=max_iter,
              tol=tol,
              random_state=random_state)

gridsearch = GridSearchCV(clf,
                          params,
                          cv = 5,
                          verbose = 1, 
                          n_jobs = -1)

lr_best_model = gridsearch.fit(X, y)

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 312 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:    1.0s finished


In [273]:
pd.DataFrame(gridsearch.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_iter,param_penalty,param_random_state,param_tol,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.00,0.00,0.00,0.00,5,l2,0,,"{'max_iter': 5, 'penalty': 'l2', 'random_state...",,,,,,,,52
1,0.03,0.01,0.00,0.00,5,l2,0,0.00,"{'max_iter': 5, 'penalty': 'l2', 'random_state...",0.64,0.68,0.71,0.69,0.66,0.67,0.02,13
2,0.00,0.00,0.00,0.00,5,l2,3,,"{'max_iter': 5, 'penalty': 'l2', 'random_state...",,,,,,,,72
3,0.04,0.01,0.00,0.00,5,l2,3,0.00,"{'max_iter': 5, 'penalty': 'l2', 'random_state...",0.64,0.68,0.71,0.69,0.66,0.67,0.02,13
4,0.00,0.00,0.00,0.00,5,l2,50,,"{'max_iter': 5, 'penalty': 'l2', 'random_state...",,,,,,,,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,0.00,0.00,0.00,0.00,10000,elasticnet,3,0.00,"{'max_iter': 10000, 'penalty': 'elasticnet', '...",,,,,,,,47
92,0.00,0.00,0.00,0.00,10000,elasticnet,50,,"{'max_iter': 10000, 'penalty': 'elasticnet', '...",,,,,,,,49
93,0.00,0.00,0.00,0.00,10000,elasticnet,50,0.00,"{'max_iter': 10000, 'penalty': 'elasticnet', '...",,,,,,,,29
94,0.00,0.00,0.00,0.00,10000,elasticnet,99,,"{'max_iter': 10000, 'penalty': 'elasticnet', '...",,,,,,,,78


In [276]:
lr_best_model.best_score_

0.730935235404038