This notebook attempts to find optimal hyperparameters for classifiers trained on chest imaging reports from the Hospital A (2013) data. It was inspired by the following post, which also contributed the list of hyperparameters that should be optimized for random forest: https://towardsdatascience.com/hyperparameters-optimization-526348bb8e2d

The idea is that if one wants to compare models, they should be compared after each has been tuned. Otherwise the comparison is not a fair one.

In [None]:
# Generic imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
from joblib import Parallel, delayed
from pathlib import Path

In [None]:
# Notebook-specific imports
from custom_functions import tokenizer_better

In [None]:
# Data locations
basedir = Path("../..")
analysis_location = basedir / 'Analysis_Data'
training_path = analysis_location / 'train_ML'
train_data1a = training_path / 'hospital_a_2013_bi_data_processed_minus_history_plus_conclusion-03-2021.csv'
train_data1b = training_path / 'hospital_a_2013_cxr_annotated.csv'

### Hospital A (2013) data read in and processing

In [None]:
# Load the processed Hospital A (2013) file holding the segmented reports
segmented = pd.read_csv(train_data1b)

# Strip the remaining punctuation marks (single quotes, square brackets and
# commas) from the report text in one regex pass over a character class
segmented["seg_cxr_text"] = segmented["seg_cxr_text"].str.replace(
    r"[',\[\]]", r"", regex=True
    )

# Distinct encounter identifiers present in the data
encounters = segmented['encounter_id'].unique()

### Importing machine learning libraries

In [None]:
# Models/algorithms/classifiers
from sklearn import tree
from sklearn.linear_model import LogisticRegression as logit
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Performance metrics
from sklearn.metrics import roc_auc_score, brier_score_loss, log_loss

# Cross-validation
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
from hyperopt.pyll.base import scope 

# Text vectorizer
from sklearn.feature_extraction.text import CountVectorizer

### Evaluating performance of models with default hyperparameters

In [None]:
# Per-fold score accumulators, one group per classifier.
# The generator yields a brand-new list for every name, so no two names
# share the same underlying list.
DT_train_auc, DT_test_auc, DT_train_brier, DT_test_brier = ([] for _ in range(4))

LR_train_auc, LR_test_auc, LR_train_brier, LR_test_brier = ([] for _ in range(4))

RF_train_auc, RF_test_auc, RF_train_brier, RF_test_brier = ([] for _ in range(4))

# XGBoost additionally tracks the test log loss
XG_train_auc, XG_test_auc, XG_train_brier, XG_test_brier, XG_test_ll = (
    [] for _ in range(5)
    )

### Use this loop to ensure the split is done at the encounter level (avoiding potential data leakage)

In [None]:
# Default is to do 5-fold CV
# The folds are built over the unique encounter IDs (not over report rows), so
# every row of a given encounter ends up on the same side of the split.
cv = KFold()

for train_index, test_index in cv.split(encounters):
    # Encounter IDs assigned to the training / held-out side of this fold
    train_encounters = encounters[train_index]
    test_encounters = encounters[test_index]
    
    # Boolean row masks selecting the reports that belong to those encounters
    train = segmented['encounter_id'].isin(train_encounters)
    test = segmented['encounter_id'].isin(test_encounters)
    
    # Report text is the input, cxr_score is the label
    X_train, X_test = segmented.loc[train, "seg_cxr_text"].to_numpy(), segmented.loc[test, "seg_cxr_text"].to_numpy()
    Y_train, Y_test = segmented.loc[train, "cxr_score"].to_numpy(), segmented.loc[test, "cxr_score"].to_numpy()
    
    
    #vectorize
    # Unigram + bigram counts, capped at 200 features; the vocabulary is fitted
    # on the training fold only and then applied to both folds.
    vect = CountVectorizer(
        tokenizer=tokenizer_better,
        ngram_range=(1,2),
        max_features=200
        )
    vect.fit(X_train)
    X_train_vect = vect.transform(X_train).toarray()
    X_test_vect = vect.transform(X_test).toarray()
    # Inverted vocabulary: column index -> n-gram (not used further in this cell)
    features = {value: key for key, value in vect.vocabulary_.items()}
    
    # Train models
    # All four classifiers use their default hyperparameters apart from the
    # fixed random_state (and max_iter for logistic regression).
    DT_model = tree.DecisionTreeClassifier(random_state=0)
    DT_model.fit(X_train_vect, Y_train)
    
    LR_model = logit(
        random_state=0,
        max_iter=10000   # Setting it up to this number to avoid error message about not converging
        )
    LR_model.fit(X_train_vect, Y_train)
    
    RF_model = RandomForestClassifier(random_state=0)
    RF_model.fit(X_train_vect, Y_train)

    XG_model = XGBClassifier(random_state=0)
    # NOTE(review): the held-out fold is passed as eval_set; no early stopping is
    # configured in this call, so this presumably only records per-round
    # evaluation metrics - confirm it is needed.
    XG_model.fit(
        X_train_vect,
        Y_train,
        eval_set=[(X_test_vect, Y_test)],
        verbose=False
        )
       
    
    # Predictions
    # [:,1] keeps the second predict_proba column - presumably the positive
    # class of a binary cxr_score; confirm against the label encoding.
    DT_train_preds = DT_model.predict_proba(X_train_vect)[:,1]
    DT_test_preds = DT_model.predict_proba(X_test_vect)[:,1]
    
    LR_train_preds = LR_model.predict_proba(X_train_vect)[:,1]
    LR_test_preds = LR_model.predict_proba(X_test_vect)[:,1]
    
    RF_train_preds = RF_model.predict_proba(X_train_vect)[:,1]
    RF_test_preds = RF_model.predict_proba(X_test_vect)[:,1]
    
    XG_train_preds = XG_model.predict_proba(X_train_vect)[:,1]
    XG_test_preds = XG_model.predict_proba(X_test_vect)[:,1]
    
    
    # Gathering AUCs and Brier scores
    # One value per fold is appended to each accumulator list.
    DT_train_auc.append(roc_auc_score(Y_train, DT_train_preds))
    DT_test_auc.append(roc_auc_score(Y_test, DT_test_preds))
    DT_train_brier.append(brier_score_loss(Y_train, DT_train_preds))
    DT_test_brier.append(brier_score_loss(Y_test, DT_test_preds))
    
    LR_train_auc.append(roc_auc_score(Y_train, LR_train_preds))
    LR_test_auc.append(roc_auc_score(Y_test, LR_test_preds))
    LR_train_brier.append(brier_score_loss(Y_train, LR_train_preds))
    LR_test_brier.append(brier_score_loss(Y_test, LR_test_preds))
    
    RF_train_auc.append(roc_auc_score(Y_train, RF_train_preds))
    RF_test_auc.append(roc_auc_score(Y_test, RF_test_preds))
    RF_train_brier.append(brier_score_loss(Y_train, RF_train_preds))
    RF_test_brier.append(brier_score_loss(Y_test, RF_test_preds))
    
    XG_train_auc.append(roc_auc_score(Y_train, XG_train_preds))
    XG_test_auc.append(roc_auc_score(Y_test, XG_test_preds))
    XG_train_brier.append(brier_score_loss(Y_train, XG_train_preds))
    XG_test_brier.append(brier_score_loss(Y_test, XG_test_preds))
    # Test log loss is tracked for XGBoost only
    XG_test_ll.append(log_loss(Y_test, XG_test_preds))

### Use this loop if assuming each CXR report is an independent sample

In [None]:
# # Default is to do 5-fold CV
# cv = KFold()

# for train_index, test_index in cv.split(X):
#     X_train, X_test = X[train_index], X[test_index]
#     Y_train, Y_test = Y[train_index], Y[test_index]
    
    
#     #vectorize
#     vect = CountVectorizer(tokenizer=tokenizer_better,
#                            ngram_range=(1,2),
#                            max_features=200)
#     vect.fit(X_train)
#     X_train_vect = vect.transform(X_train).toarray()
#     X_test_vect = vect.transform(X_test).toarray()
    
#     # Train models
#     DT_model = tree.DecisionTreeClassifier(random_state=0)
#     DT_model.fit(X_train_vect, Y_train)
    
#     LR_model = logit(random_state=0,
#                      max_iter=10000)  # Setting it up to this number to avoid error message about not converging
#     LR_model.fit(X_train_vect, Y_train)
    
#     RF_model = RandomForestClassifier(random_state=0,
#                                       n_jobs=-1)
#     RF_model.fit(X_train_vect, Y_train)
    
#     XG_model = XGBClassifier(random_state=0,
#                              eval_metric='logloss',
#                              n_jobs=-1)
#     XG_model.fit(X_train_vect, Y_train)
    
    
#     # Predictions
#     DT_train_preds = DT_model.predict_proba(X_train_vect)[:,1]
#     DT_test_preds = DT_model.predict_proba(X_test_vect)[:,1]
    
#     LR_train_preds = LR_model.predict_proba(X_train_vect)[:,1]
#     LR_test_preds = LR_model.predict_proba(X_test_vect)[:,1]
    
#     RF_train_preds = RF_model.predict_proba(X_train_vect)[:,1]
#     RF_test_preds = RF_model.predict_proba(X_test_vect)[:,1]
    
#     XG_train_preds = XG_model.predict_proba(X_train_vect)[:,1]
#     XG_test_preds = XG_model.predict_proba(X_test_vect)[:,1]
    
    
#     # Gathering AUCs and Brier scores
#     DT_train_auc.append(roc_auc_score(Y_train, DT_train_preds))
#     DT_test_auc.append(roc_auc_score(Y_test, DT_test_preds))
#     DT_train_brier.append(brier_score_loss(Y_train, DT_train_preds))
#     DT_test_brier.append(brier_score_loss(Y_test, DT_test_preds))
    
#     LR_train_auc.append(roc_auc_score(Y_train, LR_train_preds))
#     LR_test_auc.append(roc_auc_score(Y_test, LR_test_preds))
#     LR_train_brier.append(brier_score_loss(Y_train, LR_train_preds))
#     LR_test_brier.append(brier_score_loss(Y_test, LR_test_preds))
    
#     RF_train_auc.append(roc_auc_score(Y_train, RF_train_preds))
#     RF_test_auc.append(roc_auc_score(Y_test, RF_test_preds))
#     RF_train_brier.append(brier_score_loss(Y_train, RF_train_preds))
#     RF_test_brier.append(brier_score_loss(Y_test, RF_test_preds))
    
#     XG_train_auc.append(roc_auc_score(Y_train, XG_train_preds))
#     XG_test_auc.append(roc_auc_score(Y_test, XG_test_preds))
#     XG_train_brier.append(brier_score_loss(Y_train, XG_train_preds))
#     XG_test_brier.append(brier_score_loss(Y_test, XG_test_preds))

In [None]:
# 2x2 grid of box plots: AUROC on the top row, Brier score on the bottom row,
# training folds in the left column and held-out folds in the right column.
fig, ax = plt.subplots(2, 2, figsize=(7, 7))

# One entry per panel: (axes, per-model score lists, title, y-limits, y-label).
# Only the left-hand panels carry a y-axis label.
panel_specs = [
    (
        ax[0,0],
        [DT_train_auc, LR_train_auc, RF_train_auc, XG_train_auc],
        "Train AUC", [0.5, 1.05], "AUROC",
    ),
    (
        ax[0,1],
        [DT_test_auc, LR_test_auc, RF_test_auc, XG_test_auc],
        "Test AUC", [0.5, 1.05], None,
    ),
    (
        ax[1,0],
        [DT_train_brier, LR_train_brier, RF_train_brier, XG_train_brier],
        "Train Brier", [0.00, 0.25], "Brier score",
    ),
    (
        ax[1,1],
        [DT_test_brier, LR_test_brier, RF_test_brier, XG_test_brier],
        "Test Brier", [0.00, 0.25], None,
    ),
]

for panel, fold_scores, panel_title, y_limits, y_label in panel_specs:
    panel.boxplot(fold_scores, tick_labels=['DT', 'LR', 'RF', 'XG'])
    panel.set_title(panel_title)
    if y_label is not None:
        panel.set_ylabel(y_label)
    panel.tick_params(axis='x')
    panel.tick_params(axis='y')
    panel.set(ylim=y_limits)

plt.show()

### Now, hyperparameter tuning.

We use Bayesian optimization because it is more efficient (i.e. it takes less time to reach a solution).

In [None]:
def custom_cv_func(model, df, encounters, train_index, test_index, score='auc'):
    """Fit ``model`` on one encounter-level fold and score it on the held-out part.

    Parameters
    ----------
    model
        Classifier exposing ``fit(X, y)`` and ``predict_proba(X)``. It is
        fitted in place on the training part of the fold.
    df
        DataFrame with the columns ``encounter_id``, ``seg_cxr_text`` (report
        text) and ``cxr_score`` (label).
    encounters
        Array of encounter IDs that ``train_index`` and ``test_index`` index into.
    train_index, test_index
        Index arrays selecting the training and held-out encounters.
    score : {'auc', 'brier', 'log_loss'}, default 'auc'
        Metric computed on the held-out predictions.

    Returns
    -------
    The value of the requested metric on the held-out rows.

    Raises
    ------
    ValueError
        If ``score`` is not one of the supported metric names. This is checked
        up front so that a typo does not cost a full vectorise-and-fit cycle.
    """
    if score not in ('auc', 'brier', 'log_loss'):
        raise ValueError(
            "Invalid scoring scheme, enter either 'auc', 'brier' or 'log_loss'"
            )

    train_encounters = encounters[train_index]
    test_encounters = encounters[test_index]

    # Row masks: every report of an encounter goes to the same side of the split
    train = df['encounter_id'].isin(train_encounters)
    test = df['encounter_id'].isin(test_encounters)

    X_train = df.loc[train, "seg_cxr_text"].to_numpy()
    X_test = df.loc[test, "seg_cxr_text"].to_numpy()
    Y_train = df.loc[train, "cxr_score"].to_numpy()
    Y_test = df.loc[test, "cxr_score"].to_numpy()

    # Unigram + bigram counts capped at 200 features; the vocabulary is learned
    # from the training rows only and then applied to both parts.
    vect = CountVectorizer(
        tokenizer=tokenizer_better,
        ngram_range=(1, 2),
        max_features=200
        )

    vect.fit(X_train)
    X_train_vect = vect.transform(X_train).toarray()
    X_test_vect = vect.transform(X_test).toarray()

    model.fit(X_train_vect, Y_train)

    # Second predict_proba column is used as the predicted probability
    test_preds = model.predict_proba(X_test_vect)[:,1]

    if score == 'auc':
        test_score = roc_auc_score(Y_test, test_preds)
    elif score == 'brier':
        test_score = brier_score_loss(Y_test, test_preds)
    else:  # 'log_loss' - the only remaining validated option
        test_score = log_loss(Y_test, test_preds)

    return test_score

#### XGBoost

In [None]:
# Hyperopt search space for the XGBoost classifier.
# Each entry maps an XGBClassifier keyword to a hyperopt expression labelled
# with the same name; the two quniform entries step by 10 and are wrapped in
# scope.int so the classifier receives integers.
XG_param_grid = dict(
    base_score=hp.uniform('base_score', 0.0, 1.0),
    n_estimators=scope.int(hp.quniform("n_estimators", 10, 10000, 10)),
    max_depth=scope.int(hp.quniform("max_depth", 10, 10000, 10)),
    learning_rate=hp.uniform('learning_rate', 0.0, 1.0),
    gamma=hp.uniform('gamma', 0.0, 10.0),
    min_child_weight=hp.uniform('min_child_weight', 0.0, 100.0),
    max_delta_step=hp.uniform("max_delta_step", 0.0, 100.0),
    subsample=hp.uniform('subsample', 0.001, 1.0),
    )

In [None]:
# Bayesian Optimization
def objective(XG_param_grid):
    """Hyperopt objective: cross-validated log loss of one hyperparameter draw.

    Builds an XGBClassifier from the sampled values in ``XG_param_grid``,
    evaluates it with ``custom_cv_func`` on every encounter-level fold produced
    by a default ``KFold`` (folds run in parallel, five workers) and reports the
    result in the dictionary format hyperopt expects.

    Returns a dict with:
      - 'loss': mean log loss over the folds
      - 'loss_variance': sample variance (ddof=1) of the fold log losses
      - 'status': STATUS_OK
    """
    searched_names = (
        'base_score', 'n_estimators', 'max_depth', 'learning_rate',
        'gamma', 'min_child_weight', 'max_delta_step', 'subsample',
        )
    sampled = {name: XG_param_grid[name] for name in searched_names}

    # Fixed settings first, then the values sampled for this trial
    candidate = XGBClassifier(
        eval_metric='logloss',
        objective='binary:logistic',
        tree_method='hist',
        random_state=0,
        **sampled
        )

    splitter = KFold()
    run_fold = delayed(custom_cv_func)
    fold_losses = Parallel(n_jobs=5)(
        run_fold(
            candidate,
            segmented,
            encounters,
            train_index,
            test_index,
            score='log_loss'
            )
        for train_index, test_index in splitter.split(encounters)
        )

    return {
        'loss': np.mean(fold_losses),
        'loss_variance': np.var(fold_losses, ddof=1),
        'status': STATUS_OK,
        }

In [None]:
# Trials object handed to fmin below via its `trials` argument
xg_trials = Trials()

# max_evals = 20*(# of ordinal hyperparams) + 15*(# categorical choices)
#           = 20*8 + 15*0 = 160  (XG_param_grid has 8 ordinal and no categorical entries)
# stopping criteria, if needed: no improvement within window = 0.25*max_evals = 40
if __name__ == "__main__":
    best_XG = fmin(
        fn=objective,
        space=XG_param_grid,
        algo=tpe.suggest,
        max_evals=160,
        trials=xg_trials,
        early_stop_fn=no_progress_loss(40)
        )

In [None]:
# Baseline for comparison: mean over the CV folds of the held-out scores that
# were collected for XGBoost with default hyperparameters
print(f"Default AUROC for XG: {np.mean(XG_test_auc):.3f}")
print(f"Default Brier for XG: {np.mean(XG_test_brier):.3f}")
print(f"Default Log loss for XG: {np.mean(XG_test_ll):.3f}")

In [None]:
# Rebuild the classifier from the best hyperparameters returned by the search.
# The two integer-valued ones are cast with int() before being passed on.
XG_model_tuned = XGBClassifier(
    eval_metric='logloss',
    objective='binary:logistic',
    tree_method='hist',
    random_state=0,
    **{
        name: (
            int(best_XG[name])
            if name in ('n_estimators', 'max_depth')
            else best_XG[name]
            )
        for name in (
            'base_score', 'n_estimators', 'max_depth', 'learning_rate',
            'gamma', 'min_child_weight', 'max_delta_step', 'subsample',
            )
        }
    )

# Re-run the encounter-level cross-validation once per metric. Each pass
# refits the tuned model on every fold (five parallel workers) and returns
# one held-out score per fold.
cv = KFold()
auc, brier, Logloss = (
    Parallel(n_jobs=5)(
        delayed(custom_cv_func)(
            XG_model_tuned,
            segmented,
            encounters,
            train_index,
            test_index,
            score=metric
            )
        for train_index, test_index in cv.split(encounters)
        )
    for metric in ('auc', 'brier', 'log_loss')
    )

print(f"Tuned AUROC for XG: {np.mean(auc):.3f}")
print(f"Tuned Brier for XG: {np.mean(brier):.3f}")
print(f"Tuned Log loss for XG: {np.mean(Logloss):.3f}")

In [None]:
# Collect the tuned XGBoost settings as a plain dict of strings.
# The two fixed settings come first, followed by the searched hyperparameters
# in a fixed order; the integer-valued ones go through int() before str().
xg_hyperparam = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    **{
        name: (
            str(int(best_XG[name]))
            if name in ('n_estimators', 'max_depth')
            else str(best_XG[name])
            )
        for name in (
            'base_score', 'n_estimators', 'max_depth', 'learning_rate',
            'gamma', 'min_child_weight', 'max_delta_step', 'subsample',
            )
        },
    }

# Bare expression so the notebook displays the resulting dict
xg_hyperparam

In [None]:
# Persist the tuned hyperparameters as JSON.
# The target directory is created first (no-op if it already exists) so the
# write does not fail with FileNotFoundError when "hyperparameters" is missing.
xg_hyperparam_path = Path("hyperparameters/XG_hyperparams_hospital_a_2013.json")
xg_hyperparam_path.parent.mkdir(parents=True, exist_ok=True)
with open(xg_hyperparam_path, 'w', encoding='utf-8') as file_json:
    json.dump(xg_hyperparam, file_json)