This notebook attempts to find optimal hyperparameters for XGBoost classifiers applied to the pneumonia, CHF, aspiration and sepsis risk factor annotations.

In [None]:
# Generic imports
import matplotlib.pyplot as plt
# %load_ext cudf.pandas
import pandas as pd
import numpy as np
import json
from joblib import Parallel, delayed
from pathlib import Path

In [None]:
# Lift pandas' display limits (column width, column count, total width) for easier inspection
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

In [None]:
# Notebook-specific imports
from custom_functions import tokenizer_better

In [None]:
# Project root, two directory levels above the working directory
basedir = Path("../..")

# Analysis data of the cohort under study
cohort = 'hospital_a_2013'
analysis_location = basedir.joinpath('Analysis_Data')
path = analysis_location.joinpath(cohort)

# Figures
figure_path = basedir.joinpath("Figures")

### File reading and preprocessing

In [None]:
# Read the annotated attending notes and parse 'notes_timestamp' into timedeltas
notes_annot = pd.read_csv(path / "attending_notes_annotated.csv").assign(
    notes_timestamp=lambda notes: pd.to_timedelta(notes['notes_timestamp'])
)

In [None]:
# Subsetting tables to only have entries that matched, and relevant columns
def _subset_annotated_notes(notes, condition):
    """Return the rows of `notes` that matched `condition`, with the relevant columns only.

    Rows whose ``seg_<condition>`` value equals the string "Invalid" are
    dropped. The kept columns are 'encounter_id', 'notes_timestamp',
    'notes_text' and the three condition-specific columns ``<condition>``,
    ``<condition>_sw`` and ``seg_<condition>``.

    Single quotes, square brackets and commas are removed from
    ``seg_<condition>``.
    NOTE(review): presumably this flattens a stringified list of text
    segments into plain text -- confirm against the annotation step.

    Parameters
    ----------
    notes : pandas.DataFrame
        Annotated notes table containing the columns listed above.
    condition : str
        Condition prefix, e.g. 'pneumonia'.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so that later column assignments never act on a
        slice of `notes` (avoids pandas' SettingWithCopyWarning).
    """
    seg_col = f"seg_{condition}"
    keep_cols = [
        'encounter_id',
        'notes_timestamp',
        'notes_text',
        condition,
        f"{condition}_sw",
        seg_col
    ]
    subset = notes.loc[notes[seg_col] != "Invalid", keep_cols].copy()
    # One character class strips ', [, ] and , in a single pass
    subset[seg_col] = subset[seg_col].str.replace(r"['\[\],]", "", regex=True)
    return subset


pneumonia_notes = _subset_annotated_notes(notes_annot, 'pneumonia')
chf_notes = _subset_annotated_notes(notes_annot, 'chf')
aspiration_notes = _subset_annotated_notes(notes_annot, 'aspiration')
sepsis_notes = _subset_annotated_notes(notes_annot, 'sepsis')

In [None]:
# Missing SW adjudications are replaced with zero in every condition table
for notes_table, sw_column in (
    (pneumonia_notes, 'pneumonia_sw'),
    (chf_notes, 'chf_sw'),
    (aspiration_notes, 'aspiration_sw'),
    (sepsis_notes, 'sepsis_sw'),
):
    notes_table[sw_column] = notes_table[sw_column].fillna(0)

In [None]:
# Distinct encounter ids found in each condition table
pneumonia_enctrs, chf_enctrs, aspiration_enctrs, sepsis_enctrs = (
    notes_table['encounter_id'].unique()
    for notes_table in (pneumonia_notes, chf_notes, aspiration_notes, sepsis_notes)
)

### Importing machine learning libraries

In [None]:
# Models/algorithms/classifiers
from xgboost import XGBClassifier

# Performance metrics
from sklearn.metrics import roc_auc_score, log_loss, brier_score_loss

# Cross-validation
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from hyperopt.early_stop import no_progress_loss
from hyperopt.pyll.base import scope 

# Text vectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def custom_cv_func(hyperparams, df, encounters, train_col, label_col, train_index, test_index, score='log_loss'):
    """Train XGBoost on one encounter-level CV fold and score the held-out fold.

    Rows of `df` are assigned to the train or test side through their
    'encounter_id', so all rows of one encounter stay on the same side. A
    `CountVectorizer` (uni- and bi-grams, at most 200 features, tokenized with
    `tokenizer_better`) is fitted on the training text only and then applied
    to both sides.

    Parameters
    ----------
    hyperparams : mapping
        Must provide 'base_score', 'n_estimators', 'max_depth',
        'learning_rate', 'gamma', 'min_child_weight', 'max_delta_step' and
        'subsample'; they are forwarded to `XGBClassifier`.
    df : pandas.DataFrame
        Table with an 'encounter_id' column plus `train_col` and `label_col`.
    encounters : array-like
        Encounter ids; indexed with `train_index` and `test_index`.
    train_col : str
        Name of the text column that is vectorized.
    label_col : str
        Name of the label column.
    train_index, test_index : array-like of int
        Positions within `encounters` that form the train and test folds.
    score : {'log_loss', 'auc', 'brier'}, default 'log_loss'
        Metric evaluated on the held-out fold.

    Returns
    -------
    The selected metric computed from the held-out labels and the predicted
    probability of the positive class.

    Raises
    ------
    ValueError
        If `score` is not one of the supported names. This is checked before
        any vectorizing or training so a typo fails immediately.
    """
    if score not in ('auc', 'brier', 'log_loss'):
        raise ValueError("Invalid scoring scheme, enter either 'auc', 'log_loss', or 'brier'")

    train_encounters = encounters[train_index]
    test_encounters = encounters[test_index]

    train = df['encounter_id'].isin(train_encounters)
    test = df['encounter_id'].isin(test_encounters)

    X_train = df.loc[train, train_col]
    X_test = df.loc[test, train_col]
    Y_train = df.loc[train, label_col]
    Y_test = df.loc[test, label_col]

    # Vocabulary is learned from the training fold only, so no test text leaks in
    vect = CountVectorizer(
        tokenizer=tokenizer_better,
        ngram_range=(1,2),
        max_features=200
        )

    vect.fit(X_train)
    X_train_vect = vect.transform(X_train)
    X_test_vect = vect.transform(X_test)

    tuned_names = (
        'base_score',
        'n_estimators',
        'max_depth',
        'learning_rate',
        'gamma',
        'min_child_weight',
        'max_delta_step',
        'subsample'
        )
    model = XGBClassifier(
        **{name: hyperparams[name] for name in tuned_names},
        random_state=0,
        tree_method='hist'
        )

    model.fit(X_train_vect, Y_train)

    # Probability of the positive class
    test_preds = model.predict_proba(X_test_vect)[:,1]

    scorers = {
        'auc': roc_auc_score,
        'brier': brier_score_loss,
        'log_loss': log_loss
        }
    return scorers[score](Y_test, test_preds)

### Evaluating performance of models with default hyperparameters

In [None]:
# Per-fold train/test AUC and Brier scores of the default models, one list each per condition
auc_train_pna, auc_test_pna, brier_train_pna, brier_test_pna = [], [], [], []

auc_train_chf, auc_test_chf, brier_train_chf, brier_test_chf = [], [], [], []

auc_train_aspiration, auc_test_aspiration, brier_train_aspiration, brier_test_aspiration = [], [], [], []

auc_train_sepsis, auc_test_sepsis, brier_train_sepsis, brier_test_sepsis = [], [], [], []

In [None]:
# Text (X) and SW label (Y) series for each condition
X_pna, Y_pna = pneumonia_notes['seg_pneumonia'], pneumonia_notes['pneumonia_sw']

X_chf, Y_chf = chf_notes['seg_chf'], chf_notes['chf_sw']

X_aspiration, Y_aspiration = aspiration_notes['seg_aspiration'], aspiration_notes['aspiration_sw']

X_sepsis, Y_sepsis = sepsis_notes['seg_sepsis'], sepsis_notes['sepsis_sw']

In [None]:
# KFold() with default settings: 5-fold CV
cv = KFold()
# NOTE(review): these two lists are created here but not populated anywhere in this cell
list_shap_values = []
list_test_sets = []

# Folds are drawn over encounters rather than over individual notes
for train_index, test_index in cv.split(pneumonia_enctrs):
    train_encounters = pneumonia_enctrs[train_index]
    test_encounters = pneumonia_enctrs[test_index]

    # Boolean row masks: a note follows its encounter into train or test
    train = pneumonia_notes['encounter_id'].isin(train_encounters)
    test = pneumonia_notes['encounter_id'].isin(test_encounters)

    X_train = pneumonia_notes.loc[train, "seg_pneumonia"].values
    X_test = pneumonia_notes.loc[test, "seg_pneumonia"].values
    Y_train = pneumonia_notes.loc[train, "pneumonia_sw"].values
    Y_test = pneumonia_notes.loc[test, "pneumonia_sw"].values

    # Bag of uni-/bi-grams, vocabulary learned on the training fold only
    vect_pna = CountVectorizer(tokenizer=tokenizer_better, ngram_range=(1, 2), max_features=200)
    vect_pna.fit(X_train)
    X_train_vect = vect_pna.transform(X_train).toarray()
    X_test_vect = vect_pna.transform(X_test).toarray()
    # Column index -> term lookup
    features = dict((column, term) for term, column in vect_pna.vocabulary_.items())

    # Default-hyperparameter model
    pna_model = XGBClassifier(random_state=0, device='cpu', tree_method='hist')
    pna_model.fit(X_train_vect, Y_train)

    # Predicted probability of the positive class
    pna_train_preds = pna_model.predict_proba(X_train_vect)[:, 1]
    pna_test_preds = pna_model.predict_proba(X_test_vect)[:, 1]

    # Collect this fold's AUCs and Brier scores
    auc_train_pna.append(roc_auc_score(Y_train, pna_train_preds))
    auc_test_pna.append(roc_auc_score(Y_test, pna_test_preds))
    brier_train_pna.append(brier_score_loss(Y_train, pna_train_preds))
    brier_test_pna.append(brier_score_loss(Y_test, pna_test_preds))

In [None]:
# KFold() with default settings; folds are drawn over encounters, not notes
cv = KFold()

for train_index, test_index in cv.split(chf_enctrs):
    train_encounters = chf_enctrs[train_index]
    test_encounters = chf_enctrs[test_index]

    # Boolean row masks: a note follows its encounter into train or test
    train = chf_notes['encounter_id'].isin(train_encounters)
    test = chf_notes['encounter_id'].isin(test_encounters)

    X_train = chf_notes.loc[train, "seg_chf"].values
    X_test = chf_notes.loc[test, "seg_chf"].values
    Y_train = chf_notes.loc[train, "chf_sw"].values
    Y_test = chf_notes.loc[test, "chf_sw"].values

    # Bag of uni-/bi-grams, vocabulary learned on the training fold only
    vect_chf = CountVectorizer(tokenizer=tokenizer_better, ngram_range=(1, 2), max_features=200)
    vect_chf.fit(X_train)
    X_train_vect = vect_chf.transform(X_train).toarray()
    X_test_vect = vect_chf.transform(X_test).toarray()
    # Column index -> term lookup
    features = dict((column, term) for term, column in vect_chf.vocabulary_.items())

    # Default-hyperparameter model
    chf_model = XGBClassifier(random_state=0)
    chf_model.fit(X_train_vect, Y_train)

    # Predicted probability of the positive class
    chf_train_preds = chf_model.predict_proba(X_train_vect)[:, 1]
    chf_test_preds = chf_model.predict_proba(X_test_vect)[:, 1]

    # Collect this fold's AUCs and Brier scores
    auc_train_chf.append(roc_auc_score(Y_train, chf_train_preds))
    auc_test_chf.append(roc_auc_score(Y_test, chf_test_preds))
    brier_train_chf.append(brier_score_loss(Y_train, chf_train_preds))
    brier_test_chf.append(brier_score_loss(Y_test, chf_test_preds))

In [None]:
# KFold() with default settings; folds are drawn over encounters, not notes
cv = KFold()

for train_index, test_index in cv.split(aspiration_enctrs):
    train_encounters = aspiration_enctrs[train_index]
    test_encounters = aspiration_enctrs[test_index]

    # Boolean row masks: a note follows its encounter into train or test
    train = aspiration_notes['encounter_id'].isin(train_encounters)
    test = aspiration_notes['encounter_id'].isin(test_encounters)

    X_train = aspiration_notes.loc[train, "seg_aspiration"].values
    X_test = aspiration_notes.loc[test, "seg_aspiration"].values
    Y_train = aspiration_notes.loc[train, "aspiration_sw"].values
    Y_test = aspiration_notes.loc[test, "aspiration_sw"].values

    # Bag of uni-/bi-grams, vocabulary learned on the training fold only
    vect_aspiration = CountVectorizer(tokenizer=tokenizer_better, ngram_range=(1, 2), max_features=200)
    vect_aspiration.fit(X_train)
    X_train_vect = vect_aspiration.transform(X_train).toarray()
    X_test_vect = vect_aspiration.transform(X_test).toarray()
    # Column index -> term lookup
    features = dict((column, term) for term, column in vect_aspiration.vocabulary_.items())

    # Default-hyperparameter model
    aspiration_model = XGBClassifier(random_state=0)
    aspiration_model.fit(X_train_vect, Y_train)

    # Predicted probability of the positive class
    aspiration_train_preds = aspiration_model.predict_proba(X_train_vect)[:, 1]
    aspiration_test_preds = aspiration_model.predict_proba(X_test_vect)[:, 1]

    # Collect this fold's AUCs and Brier scores
    auc_train_aspiration.append(roc_auc_score(Y_train, aspiration_train_preds))
    auc_test_aspiration.append(roc_auc_score(Y_test, aspiration_test_preds))
    brier_train_aspiration.append(brier_score_loss(Y_train, aspiration_train_preds))
    brier_test_aspiration.append(brier_score_loss(Y_test, aspiration_test_preds))

In [None]:
# KFold() with default settings; folds are drawn over encounters, not notes
cv = KFold()

for train_index, test_index in cv.split(sepsis_enctrs):
    train_encounters = sepsis_enctrs[train_index]
    test_encounters = sepsis_enctrs[test_index]

    # Boolean row masks: a note follows its encounter into train or test
    train = sepsis_notes['encounter_id'].isin(train_encounters)
    test = sepsis_notes['encounter_id'].isin(test_encounters)

    X_train = sepsis_notes.loc[train, "seg_sepsis"].values
    X_test = sepsis_notes.loc[test, "seg_sepsis"].values
    Y_train = sepsis_notes.loc[train, "sepsis_sw"].values
    Y_test = sepsis_notes.loc[test, "sepsis_sw"].values

    # Bag of uni-/bi-grams, vocabulary learned on the training fold only
    vect_sepsis = CountVectorizer(tokenizer=tokenizer_better, ngram_range=(1, 2), max_features=200)
    vect_sepsis.fit(X_train)
    X_train_vect = vect_sepsis.transform(X_train).toarray()
    X_test_vect = vect_sepsis.transform(X_test).toarray()
    # Column index -> term lookup
    features = dict((column, term) for term, column in vect_sepsis.vocabulary_.items())

    # Default-hyperparameter model
    sepsis_model = XGBClassifier(random_state=0)
    sepsis_model.fit(X_train_vect, Y_train)

    # Predicted probability of the positive class
    sepsis_train_preds = sepsis_model.predict_proba(X_train_vect)[:, 1]
    sepsis_test_preds = sepsis_model.predict_proba(X_test_vect)[:, 1]

    # Collect this fold's AUCs and Brier scores
    auc_train_sepsis.append(roc_auc_score(Y_train, sepsis_train_preds))
    auc_test_sepsis.append(roc_auc_score(Y_test, sepsis_test_preds))
    brier_train_sepsis.append(brier_score_loss(Y_train, sepsis_train_preds))
    brier_test_sepsis.append(brier_score_loss(Y_test, sepsis_test_preds))

In [None]:
fig, ax = plt.subplots(2, 2, figsize=(12,12))

condition_labels = ['PNA','CHF','Aspiration', 'Sepsis']

# One entry per panel: axes, per-condition fold scores, title, y-limits, y-label (left column only)
panels = [
    (
        ax[0,0],
        [auc_train_pna, auc_train_chf, auc_train_aspiration, auc_train_sepsis],
        "Train AUC", [0.5, 1.05], "AUROC"
    ),
    (
        ax[0,1],
        [auc_test_pna, auc_test_chf, auc_test_aspiration, auc_test_sepsis],
        "Test AUC", [0.5, 1.05], None
    ),
    (
        ax[1,0],
        [brier_train_pna, brier_train_chf, brier_train_aspiration, brier_train_sepsis],
        "Train Brier", [0.00, 0.35], "Brier score"
    ),
    (
        ax[1,1],
        [brier_test_pna, brier_test_chf, brier_test_aspiration, brier_test_sepsis],
        "Test Brier", [0.00, 0.35], None
    ),
]

for panel_ax, fold_scores, panel_title, y_limits, y_label in panels:
    panel_ax.boxplot(fold_scores, labels=condition_labels)
    panel_ax.set_title(panel_title)
    if y_label is not None:
        panel_ax.set_ylabel(y_label, size=18)
    panel_ax.tick_params(axis='x', labelsize=18)
    panel_ax.tick_params(axis='y', labelsize=18)
    panel_ax.set(ylim=y_limits)
plt.show()

### Now, hyperparameter tuning.

In [None]:
# Hyperopt search space; the keys match the hyperparameter names read by custom_cv_func.
# The two quniform entries are sampled in steps of 10 and wrapped in scope.int.
# NOTE(review): max_depth is searched over 10-10000, the same range as n_estimators --
# this looks like a copy-paste; confirm it is intended before relying on the tuned value.
XG_param_grid = dict(
    base_score=hp.uniform('base_score', 0.0, 1.0),
    n_estimators=scope.int(hp.quniform("n_estimators", 10, 10000, 10)),
    max_depth=scope.int(hp.quniform("max_depth", 10, 10000, 10)),
    learning_rate=hp.uniform('learning_rate', 0.0, 1.0),
    gamma=hp.uniform('gamma', 0.0, 10.0),
    min_child_weight=hp.uniform('min_child_weight', 0.0, 100.0),
    max_delta_step=hp.uniform("max_delta_step", 0.0, 100.0),
    subsample=hp.uniform('subsample', 0.001, 1.0),
)

#### Pneumonia

In [None]:
# Bayesian Optimization
def objective(XG_param_grid):
    """Hyperopt objective for the pneumonia notes.

    Runs `custom_cv_func` with its default score once per fold of an
    encounter-level `KFold()` split, in parallel with 5 workers.

    Parameters
    ----------
    XG_param_grid : mapping
        One sampled hyperparameter assignment, forwarded unchanged to
        `custom_cv_func`. The name shadows the module-level search space.

    Returns
    -------
    dict
        'loss' is the mean of the fold scores, 'loss_variance' their sample
        variance (ddof=1), and 'status' is `STATUS_OK`.
    """
    fold_jobs = (
        delayed(custom_cv_func)(
            XG_param_grid,
            pneumonia_notes,
            pneumonia_enctrs,
            "seg_pneumonia",
            "pneumonia_sw",
            fold_train,
            fold_test
        )
        for fold_train, fold_test in KFold().split(pneumonia_enctrs)
    )
    fold_losses = Parallel(n_jobs=5)(fold_jobs)

    return {
        'loss': np.mean(fold_losses),
        'loss_variance': np.var(fold_losses, ddof=1),
        'status': STATUS_OK
    }

In [None]:
# max_evals = 20*(# of ordinal hyperparams) + 15*(# categorical choices) = 160
# stopping criteria, if needed: no improvement within window = 0.25*max_eval
if __name__ == "__main__":
    best_XG = fmin(
        fn=objective,
        space=XG_param_grid,
        algo=tpe.suggest,
        max_evals=160,
        trials=Trials(),
        early_stop_fn=no_progress_loss(40))

    # fmin returns the raw sampled values, so the two quniform draws are cast to int.
    # Kept under the same guard as fmin so the casts never run without a fresh best_XG.
    best_XG['n_estimators'] = int(best_XG['n_estimators'])
    best_XG['max_depth'] = int(best_XG['max_depth'])

In [None]:
print(f"Default AUROC for XG: {np.mean(auc_test_pna):.3f}")
print(f"Default Brier for XG: {np.mean(brier_test_pna):.3f}\n")

# Repeat the encounter-level CV with the tuned hyperparameters, once per metric
cv = KFold()
tuned_scores = {
    metric: Parallel(n_jobs=5)(
        delayed(custom_cv_func)(
            best_XG,
            pneumonia_notes,
            pneumonia_enctrs,
            "seg_pneumonia",
            "pneumonia_sw",
            train_index,
            test_index,
            score=metric
        )
        for train_index, test_index in cv.split(pneumonia_enctrs)
    )
    for metric in ('auc', 'brier')
}
auc = tuned_scores['auc']
brier = tuned_scores['brier']

print(f"Tuned AUROC for XG: {np.mean(auc):.3f}")
print(f"Tuned Brier for XG: {np.mean(brier):.3f}")

In [None]:
# String-valued snapshot of the tuned hyperparameters; the two count-like ones go through int() first
pna_hyperparam = {
    name: str(int(best_XG[name])) if name in ('n_estimators', 'max_depth') else str(best_XG[name])
    for name in (
        'base_score',
        'n_estimators',
        'max_depth',
        'learning_rate',
        'gamma',
        'min_child_weight',
        'max_delta_step',
        'subsample'
    )
}

pna_hyperparam

In [None]:
# with open("hyperparameters/pna_XG_hyperparams.json", 'w') as file_json:
#     json.dump(pna_hyperparam, file_json)

#### Congestive Heart Failure

In [None]:
def objective(XG_param_grid):
    """Hyperopt objective for the CHF notes.

    Runs `custom_cv_func` with its default score once per fold of an
    encounter-level `KFold()` split, in parallel with 5 workers.

    Parameters
    ----------
    XG_param_grid : mapping
        One sampled hyperparameter assignment, forwarded unchanged to
        `custom_cv_func`. The name shadows the module-level search space.

    Returns
    -------
    dict
        'loss' is the mean of the fold scores, 'loss_variance' their sample
        variance (ddof=1), and 'status' is `STATUS_OK`.
    """
    fold_jobs = (
        delayed(custom_cv_func)(
            XG_param_grid,
            chf_notes,
            chf_enctrs,
            "seg_chf",
            "chf_sw",
            fold_train,
            fold_test
        )
        for fold_train, fold_test in KFold().split(chf_enctrs)
    )
    fold_losses = Parallel(n_jobs=5)(fold_jobs)

    return {
        'loss': np.mean(fold_losses),
        'loss_variance': np.var(fold_losses, ddof=1),
        'status': STATUS_OK
    }

In [None]:
# max_evals = 20*(# of ordinal hyperparams) + 15*(# categorical choices) = 160
# stopping criteria, if needed: no improvement within window = 0.25*max_eval
if __name__ == "__main__":
    best_XG = fmin(
        fn=objective,
        space=XG_param_grid,
        algo=tpe.suggest,
        max_evals=160,
        trials=Trials(),
        early_stop_fn=no_progress_loss(40)
        )

    # fmin returns the raw sampled values, so the two quniform draws are cast to int.
    # Kept under the same guard as fmin so the casts never run without a fresh best_XG.
    best_XG['n_estimators'] = int(best_XG['n_estimators'])
    best_XG['max_depth'] = int(best_XG['max_depth'])

In [None]:
print(f"Default AUROC for XG: {np.mean(auc_test_chf):.3f}")
print(f"Default Brier for XG: {np.mean(brier_test_chf):.3f}\n")

# Repeat the encounter-level CV with the tuned hyperparameters, once per metric
cv = KFold()
tuned_scores = {
    metric: Parallel(n_jobs=5)(
        delayed(custom_cv_func)(
            best_XG,
            chf_notes,
            chf_enctrs,
            "seg_chf",
            "chf_sw",
            train_index,
            test_index,
            score=metric
        )
        for train_index, test_index in cv.split(chf_enctrs)
    )
    for metric in ('auc', 'brier')
}
auc = tuned_scores['auc']
brier = tuned_scores['brier']

print(f"Tuned AUROC for XG: {np.mean(auc):.3f}")
print(f"Tuned Brier for XG: {np.mean(brier):.3f}")

In [None]:
# String-valued snapshot of the tuned hyperparameters; the two count-like ones go through int() first
chf_hyperparam = {
    name: str(int(best_XG[name])) if name in ('n_estimators', 'max_depth') else str(best_XG[name])
    for name in (
        'base_score',
        'n_estimators',
        'max_depth',
        'learning_rate',
        'gamma',
        'min_child_weight',
        'max_delta_step',
        'subsample'
    )
}

chf_hyperparam

In [None]:
# with open("hyperparameters/chf_XG_hyperparams.json", 'w') as file_json:
#     json.dump(chf_hyperparam, file_json)

#### Aspiration

In [None]:
def objective(XG_param_grid):
    """Hyperopt objective for the aspiration notes.

    Runs `custom_cv_func` with its default score once per fold of an
    encounter-level `KFold()` split, in parallel with 5 workers.

    Parameters
    ----------
    XG_param_grid : mapping
        One sampled hyperparameter assignment, forwarded unchanged to
        `custom_cv_func`. The name shadows the module-level search space.

    Returns
    -------
    dict
        'loss' is the mean of the fold scores, 'loss_variance' their sample
        variance (ddof=1), and 'status' is `STATUS_OK`.
    """
    fold_jobs = (
        delayed(custom_cv_func)(
            XG_param_grid,
            aspiration_notes,
            aspiration_enctrs,
            "seg_aspiration",
            "aspiration_sw",
            fold_train,
            fold_test
        )
        for fold_train, fold_test in KFold().split(aspiration_enctrs)
    )
    fold_losses = Parallel(n_jobs=5)(fold_jobs)

    return {
        'loss': np.mean(fold_losses),
        'loss_variance': np.var(fold_losses, ddof=1),
        'status': STATUS_OK
    }

In [None]:
# max_evals = 20*(# of ordinal hyperparams) + 15*(# categorical choices) = 160
# stopping criteria, if needed: no improvement within window = 0.25*max_eval
if __name__ == "__main__":
    best_XG = fmin(
        fn=objective,
        space=XG_param_grid,
        algo=tpe.suggest,
        max_evals=160,
        trials=Trials(),
        early_stop_fn=no_progress_loss(40)
        )

    # fmin returns the raw sampled values, so the two quniform draws are cast to int.
    # Kept under the same guard as fmin so the casts never run without a fresh best_XG.
    best_XG['n_estimators'] = int(best_XG['n_estimators'])
    best_XG['max_depth'] = int(best_XG['max_depth'])

In [None]:
print(f"Default AUROC for XG: {np.mean(auc_test_aspiration):.3f}")
print(f"Default Brier for XG: {np.mean(brier_test_aspiration):.3f}\n")

# Repeat the encounter-level CV with the tuned hyperparameters, once per metric
cv = KFold()
tuned_scores = {
    metric: Parallel(n_jobs=5)(
        delayed(custom_cv_func)(
            best_XG,
            aspiration_notes,
            aspiration_enctrs,
            "seg_aspiration",
            "aspiration_sw",
            train_index,
            test_index,
            score=metric
        )
        for train_index, test_index in cv.split(aspiration_enctrs)
    )
    for metric in ('auc', 'brier')
}
auc = tuned_scores['auc']
brier = tuned_scores['brier']

print(f"Tuned AUROC for XG: {np.mean(auc):.3f}")
print(f"Tuned Brier for XG: {np.mean(brier):.3f}")

In [None]:
# String-valued snapshot of the tuned hyperparameters; the two count-like ones go through int() first
aspiration_hyperparam = {
    name: str(int(best_XG[name])) if name in ('n_estimators', 'max_depth') else str(best_XG[name])
    for name in (
        'base_score',
        'n_estimators',
        'max_depth',
        'learning_rate',
        'gamma',
        'min_child_weight',
        'max_delta_step',
        'subsample'
    )
}

aspiration_hyperparam

In [None]:
# with open("hyperparameters/aspiration_XG_hyperparams.json", 'w') as file_json:
#     json.dump(aspiration_hyperparam, file_json)

#### Sepsis

In [None]:
def objective(XG_param_grid):
    """Hyperopt objective for the sepsis notes.

    Runs `custom_cv_func` with its default score once per fold of an
    encounter-level `KFold()` split, in parallel with 5 workers.

    Parameters
    ----------
    XG_param_grid : mapping
        One sampled hyperparameter assignment, forwarded unchanged to
        `custom_cv_func`. The name shadows the module-level search space.

    Returns
    -------
    dict
        'loss' is the mean of the fold scores, 'loss_variance' their sample
        variance (ddof=1), and 'status' is `STATUS_OK`.
    """
    fold_jobs = (
        delayed(custom_cv_func)(
            XG_param_grid,
            sepsis_notes,
            sepsis_enctrs,
            "seg_sepsis",
            "sepsis_sw",
            fold_train,
            fold_test
        )
        for fold_train, fold_test in KFold().split(sepsis_enctrs)
    )
    fold_losses = Parallel(n_jobs=5)(fold_jobs)

    return {
        'loss': np.mean(fold_losses),
        'loss_variance': np.var(fold_losses, ddof=1),
        'status': STATUS_OK
    }

In [None]:
# max_evals = 20*(# of ordinal hyperparams) + 15*(# categorical choices) = 160
# stopping criteria, if needed: no improvement within window = 0.25*max_eval
if __name__ == "__main__":
    best_XG = fmin(
        fn=objective,
        space=XG_param_grid,
        algo=tpe.suggest,
        max_evals=160,
        trials=Trials(),
        early_stop_fn=no_progress_loss(40))

    # fmin returns the raw sampled values, so the two quniform draws are cast to int.
    # Kept under the same guard as fmin so the casts never run without a fresh best_XG.
    best_XG['n_estimators'] = int(best_XG['n_estimators'])
    best_XG['max_depth'] = int(best_XG['max_depth'])

In [None]:
print(f"Default AUROC for XG: {np.mean(auc_test_sepsis):.3f}")
print(f"Default Brier for XG: {np.mean(brier_test_sepsis):.3f}\n")

# Repeat the encounter-level CV with the tuned hyperparameters, once per metric
cv = KFold()
tuned_scores = {
    metric: Parallel(n_jobs=5)(
        delayed(custom_cv_func)(
            best_XG,
            sepsis_notes,
            sepsis_enctrs,
            "seg_sepsis",
            "sepsis_sw",
            train_index,
            test_index,
            score=metric
        )
        for train_index, test_index in cv.split(sepsis_enctrs)
    )
    for metric in ('auc', 'brier')
}
auc = tuned_scores['auc']
brier = tuned_scores['brier']

print(f"Tuned AUROC for XG: {np.mean(auc):.3f}")
print(f"Tuned Brier for XG: {np.mean(brier):.3f}")

In [None]:
# String-valued snapshot of the tuned hyperparameters; the two count-like ones go through int() first
sepsis_hyperparam = {
    name: str(int(best_XG[name])) if name in ('n_estimators', 'max_depth') else str(best_XG[name])
    for name in (
        'base_score',
        'n_estimators',
        'max_depth',
        'learning_rate',
        'gamma',
        'min_child_weight',
        'max_delta_step',
        'subsample'
    )
}

sepsis_hyperparam

In [None]:
# with open("hyperparameters/sepsis_XG_hyperparams.json", 'w') as file_json:
#     json.dump(sepsis_hyperparam, file_json)