This notebook attempts to find optimal hyperparameters for classifiers trained on chest imaging reports from Hospital System A (2013), Hospital System A (2016), and Hospital System B (2017-2018), which are collectively referred to as the whole training dataset. The approach was inspired by the following post, which also contributed a list of hyperparameters that should be optimized for random forest: https://towardsdatascience.com/hyperparameters-optimization-526348bb8e2d

In [None]:
# Generic imports
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import json
from joblib import Parallel, delayed
from pathlib import Path

In [None]:
# Notebook-specific imports
from custom_functions import tokenizer_better

In [None]:
# Locate and load the whole training dataset (path is relative to this notebook).
basedir = Path("../..")
training_dataset = basedir / "Analysis_Data" / "train_ML" / "cxr_whole_training_dataset.csv"
training_data = pd.read_csv(
    training_dataset,
    # Explicit dtypes: identifiers and text stay strings, the label is an integer.
    dtype={
        "encounter_id": str,
        "cxr_timestamp": str,
        "cxr_score": int,
        "cxr_findings": str,
        "seg_cxr_text": str,
    },
)

# Strip the leftover punctuation (apostrophes, square brackets and commas) from
# the segmented report text in a single pass.
training_data['seg_cxr_text'] = training_data['seg_cxr_text'].str.replace(
    r"[\[\]',]", "", regex=True
)

In [None]:
# Distinct encounter IDs. Cross-validation folds are drawn over this array
# (custom_cv_func indexes it with the fold index arrays) instead of over the
# individual rows, so all rows of one encounter land on the same side of a split.
encounters = training_data["encounter_id"].unique()

### Importing machine learning libraries

In [None]:
# Models/algorithms/classifiers
from xgboost import XGBClassifier

# Performance metrics
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss, accuracy_score

# Cross-validation
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
from hyperopt.pyll.base import scope 

# Text vectorizer
from sklearn.feature_extraction.text import CountVectorizer

### Now, hyperparameter tuning.

In [None]:
def custom_cv_func(model, df, encounters, train_index, test_index, score='auc'):
    """Fit `model` on one cross-validation fold and score it on the held-out fold.

    The split is made at the encounter level: `train_index` and `test_index`
    select entries of `encounters`, and every row of `df` whose
    ``encounter_id`` is among the selected IDs is assigned to that side of the
    split. The report text is converted to count features by a
    `CountVectorizer` that is fitted on the training rows only.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``; ``fit`` is
        called on the object that is passed in.
    df : pandas.DataFrame
        Must provide the columns ``encounter_id``, ``seg_cxr_text`` and
        ``cxr_score``.
    encounters : array
        Encounter IDs; must support indexing with `train_index` / `test_index`.
    train_index, test_index : index arrays
        Positions in `encounters` that form the training / held-out fold.
    score : {'auc', 'aucpr', 'log_loss', 'accuracy'}, default 'auc'
        Metric computed on the held-out fold. 'accuracy' is computed from
        ``model.predict``; the other metrics use the second column of
        ``model.predict_proba``.

    Returns
    -------
    The value of the requested metric on the held-out fold.

    Raises
    ------
    ValueError
        If `score` is not one of the supported metric names. This is checked
        before any vectorising or fitting so that a typo fails immediately.
    """
    valid_scores = ('auc', 'aucpr', 'log_loss', 'accuracy')
    if score not in valid_scores:
        raise ValueError(
            "Invalid scoring scheme, enter one of 'auc', 'aucpr', 'log_loss', or 'accuracy'"
        )

    train_encounters = encounters[train_index]
    test_encounters = encounters[test_index]

    # Boolean row masks: a row belongs to a fold if its encounter does.
    train = df['encounter_id'].isin(train_encounters)
    test = df['encounter_id'].isin(test_encounters)

    X_train = df.loc[train, "seg_cxr_text"].to_numpy()
    X_test = df.loc[test, "seg_cxr_text"].to_numpy()
    Y_train = df.loc[train, "cxr_score"].to_numpy()
    Y_test = df.loc[test, "cxr_score"].to_numpy()

    # Vectorize: the vocabulary is learned from the training fold only, so no
    # information from the held-out fold leaks into the features.
    vect = CountVectorizer(
        tokenizer=tokenizer_better,
        ngram_range=(1, 2),
        max_features=200
        )

    vect.fit(X_train)
    X_train_vect = vect.transform(X_train).toarray()
    X_test_vect = vect.transform(X_test).toarray()

    model.fit(X_train_vect, Y_train)

    if score == 'accuracy':
        # Accuracy is the only metric that needs hard class predictions.
        test_preds = model.predict(X_test_vect)
        return accuracy_score(Y_test, test_preds)

    # Remaining metrics are all computed from the second predict_proba column.
    test_preds = model.predict_proba(X_test_vect)[:, 1]
    if score == 'auc':
        return roc_auc_score(Y_test, test_preds)
    if score == 'aucpr':
        return average_precision_score(Y_test, test_preds)
    return log_loss(Y_test, test_preds)

In [None]:
# Hyperopt search space for the XGBoost classifier. Each entry's hyperopt label
# matches its dictionary key, so the result of fmin can be indexed by the same names.
XG_param_grid = dict(
    base_score=hp.uniform('base_score', 0.0, 1.0),
    # Integer-valued hyperparameters: sampled with a quantisation step of 10, then cast to int.
    n_estimators=scope.int(hp.quniform("n_estimators", 10, 10000, 10)),
    # NOTE(review): an upper bound of 10000 for max_depth looks very large for
    # boosted trees - confirm that this range is intended.
    max_depth=scope.int(hp.quniform("max_depth", 10, 10000, 10)),
    learning_rate=hp.uniform('learning_rate', 0.0, 1.0),
    gamma=hp.uniform('gamma', 0.0, 10.0),
    min_child_weight=hp.uniform('min_child_weight', 0.0, 100.0),
    # This is the hyperparameter that could help with label imbalance
    max_delta_step=hp.uniform("max_delta_step", 0.0, 100.0),
    subsample=hp.uniform('subsample', 0.001, 1.0),
)

In [None]:
# Bayesian Optimization
def objective(XG_param_grid):
    """Hyperopt objective: cross-validated log loss for one sampled configuration.

    Parameters
    ----------
    XG_param_grid : dict
        One point sampled from the search space; must contain the keys
        ``base_score``, ``n_estimators``, ``max_depth``, ``learning_rate``,
        ``gamma``, ``min_child_weight``, ``max_delta_step`` and ``subsample``.

    Returns
    -------
    dict
        ``loss`` is the mean log loss over the folds, ``loss_variance`` the
        sample variance (ddof=1) of the fold losses, and ``status`` is
        ``STATUS_OK``.
    """
    tuned_names = (
        'base_score',
        'n_estimators',
        'max_depth',
        'learning_rate',
        'gamma',
        'min_child_weight',
        'max_delta_step',
        'subsample',
    )
    sampled = {name: XG_param_grid[name] for name in tuned_names}

    # Fixed settings first, sampled hyperparameters unpacked afterwards.
    XG_model = XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        tree_method='hist',
        random_state=0,
        **sampled,
    )

    # Folds are drawn over encounter IDs with a default KFold splitter and
    # evaluated by up to 5 joblib workers.
    splitter = KFold()
    fold_tasks = (
        delayed(custom_cv_func)(
            XG_model,
            training_data,
            encounters,
            train_index,
            test_index,
            score='log_loss',
        )
        for train_index, test_index in splitter.split(encounters)
    )
    fold_losses = Parallel(n_jobs=5)(fold_tasks)

    return {
        'loss': np.mean(fold_losses),
        'loss_variance': np.var(fold_losses, ddof=1),
        'status': STATUS_OK,
    }

In [None]:
# Collects the result dictionary returned by `objective` for every evaluation.
xg_trials = Trials()

# Evaluation budget: 20 per ordinal hyperparameter + 15 per categorical choice.
# The search space holds 8 ordinal hyperparameters and no categorical ones -> 160.
# Stopping criterion: give up after 0.25 * max_evals = 40 evaluations without improvement.
# NOTE(review): no `rstate` is passed to fmin, so the search is presumably not
# seeded and results may differ between runs - confirm whether that is intended.
if __name__ == "__main__":
    best_XG = fmin(
        objective,
        XG_param_grid,
        algo=tpe.suggest,
        trials=xg_trials,
        max_evals=160,
        early_stop_fn=no_progress_loss(40),
    )

In [None]:
# Rebuild the classifier from the best point found by the search. The fixed
# settings mirror those used inside `objective`; the two integer-valued
# hyperparameters are cast explicitly with int() before being passed on.
XG_model_tuned = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    random_state=0,
    n_estimators=int(best_XG['n_estimators']),
    max_depth=int(best_XG['max_depth']),
    base_score=best_XG['base_score'],
    learning_rate=best_XG['learning_rate'],
    gamma=best_XG['gamma'],
    min_child_weight=best_XG['min_child_weight'],
    max_delta_step=best_XG['max_delta_step'],
    subsample=best_XG['subsample'],
)

In [None]:
# Cross-validate the tuned model once per reported metric. The same default
# KFold splitter is used for every metric, so all metrics are computed on
# identical encounter-level folds.
cv = KFold()
tuned_scores = {
    metric: Parallel(n_jobs=5)(
        delayed(custom_cv_func)(
            XG_model_tuned,
            training_data,
            encounters,
            train_index,
            test_index,
            score=metric,
        )
        for train_index, test_index in cv.split(encounters)
    )
    for metric in ('auc', 'accuracy', 'log_loss')
}

# Per-fold scores under their individual names.
auc = tuned_scores['auc']
accuracy = tuned_scores['accuracy']
# Legacy alias: this variable was historically called `brier` although it has
# always held accuracy scores. Kept so existing references keep working.
brier = accuracy
Logloss = tuned_scores['log_loss']

print(f"Tuned AUROC for XG: {np.mean(auc):.3f}")
print(f"Tuned accuracy for XG: {np.mean(accuracy):.3f}")
print(f"Tuned Log loss for XG: {np.mean(Logloss):.3f}")

In [None]:
# Record of the tuned configuration. Every tuned value is stored as a string;
# the two integer-valued hyperparameters are cast to int first so they are
# written without a decimal part.
xg_hyperparam = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    **{
        name: str(int(best_XG[name]) if name in ('n_estimators', 'max_depth') else best_XG[name])
        for name in (
            'base_score',
            'n_estimators',
            'max_depth',
            'learning_rate',
            'gamma',
            'min_child_weight',
            'max_delta_step',
            'subsample',
        )
    },
}

xg_hyperparam

In [None]:
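# Disabled by default so that re-running the notebook does not overwrite the
# saved hyperparameters; uncomment to write `xg_hyperparam` to disk as JSON.
# with open("hyperparameters/bilateral_infiltrates_model_hyperparams.json", 'w') as file_json:
#     json.dump(xg_hyperparam, file_json)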