# Appendix. Hyperparameter Tuning

Import necessary libraries and custom function to build sklearn pipes for stroke dataset:

In [20]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import xgboost
from xgboost import XGBClassifier
import lightgbm
from lightgbm import LGBMClassifier
import joblib
from helper_functions.custom_pipes import build_full_pipe

Load training data:

In [21]:
X_train = pd.read_pickle("X_train.pkl")
y_train = pd.read_pickle("y_train.pkl")

Create pipelines for all models being tuned.

In [22]:
logreg = LogisticRegression(class_weight="balanced", random_state=0)
forest = RandomForestClassifier(class_weight="balanced", random_state=0)
xgb = XGBClassifier(objective="binary:logistic", random_state=0)
lgbm = LGBMClassifier(class_weight="balanced", random_state=0, verbose=-1)
pipe_logreg = build_full_pipe(logreg)
pipe_forest = build_full_pipe(forest)
pipe_xgb = build_full_pipe(xgb)
pipe_lgbm = build_full_pipe(lgbm)

Tune Logistic Regression model and save it to external file:

In [23]:
params_logreg = {
    "preprocessor__num__impute__strategy": ["mean", "median"], 
    "preprocessor__num__scale": [StandardScaler(), MinMaxScaler()],
    "model__solver": ["liblinear"],
    "model__penalty": ["l1", "l2"],
    "model__C": [1, 0.1, 0.01, 0.001],
}
grid = GridSearchCV(
    pipe_logreg, params_logreg, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best Logistic Regression parameters:\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))
best_logreg = grid.best_estimator_
joblib.dump(best_logreg, "./tuned_models/best_logreg.sav")

Best Logistic Regression parameters:
 {'model__C': 0.1, 'model__penalty': 'l2', 'model__solver': 'liblinear', 'preprocessor__num__impute__strategy': 'median', 'preprocessor__num__scale': MinMaxScaler()}
Average Precision:  0.191


['./tuned_models/best_logreg.sav']

Tune Random Forest classifier in two steps and save it to external file:

In [24]:
params_forest = {
    "preprocessor__num__impute__strategy": ["mean", "median"],
    "preprocessor__num__scale": [None],
    "model__criterion": ["gini", "entropy"],
    "model__max_features": ["sqrt", "log2"],
    "model__max_depth": [5],
    "model__max_leaf_nodes": [10],
}
grid = GridSearchCV(
    pipe_forest, params_forest, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best Random Forest parameters:\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))

Best Random Forest parameters:
 {'model__criterion': 'gini', 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__max_leaf_nodes': 10, 'preprocessor__num__impute__strategy': 'median', 'preprocessor__num__scale': None}
Average Precision:  0.182


In [25]:
params_forest = {
    "preprocessor__num__impute__strategy": ["median"],
    "preprocessor__num__scale": [None],
    "model__criterion": ["gini"],
    "model__max_features": ["sqrt"],
    "model__n_estimators": [30, 40, 50, 60],
    "model__max_depth": [3, 4, 5, 6],
    "model__max_leaf_nodes": [5, 10, 15, 20, 25],
    "model__min_samples_leaf": [1, 2, 3, 4],
}
grid = GridSearchCV(
    pipe_forest, params_forest, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best Random Forest parameters:\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))
best_forest = grid.best_estimator_
joblib.dump(best_forest, "./tuned_models/best_forest.sav")

Best Random Forest parameters:
 {'model__criterion': 'gini', 'model__max_depth': 5, 'model__max_features': 'sqrt', 'model__max_leaf_nodes': 10, 'model__min_samples_leaf': 3, 'model__n_estimators': 50, 'preprocessor__num__impute__strategy': 'median', 'preprocessor__num__scale': None}
Average Precision:  0.192


['./tuned_models/best_forest.sav']

For boosting classifiers positive class weight need to be calculated. Also, transformed dataset is needed for internal cross-validation.

In [26]:
preprocessor_pipe = Pipeline(pipe_logreg.steps[:-1])
X_train_transformed = preprocessor_pipe.fit_transform(X_train)
counter = Counter(y_train)

scale_pos_weight = counter[0] / counter[1]
scale_pos_weight

19.542713567839197

Stepwise tune XGBoost classifier. For the defined learning rate and some initial parameters, number of estimators is found.

In [27]:
params = {
    "objective": "binary:logistic",
    "random_state": 0,
    "learning_rate": 0.1,
    "scale_pos_weight": scale_pos_weight,
    "max_depth": 5,
    "min_child_weight": 1,
    "gamma": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
}

xgtrain = xgboost.DMatrix(X_train_transformed.values, label=y_train.values)
cvresult = xgboost.cv(
    params,
    xgtrain,
    num_boost_round=500,
    nfold=5,
    stratified=True,
    metrics="aucpr",
    early_stopping_rounds=50,
)
print("Number of estimators for 0.1 learning rate:", cvresult.shape[0])

Number of estimators for 0.1 learning rate: 31


First, maximum tree depth and minimum leaf weight are found.

In [28]:
params_xgb = {
    "preprocessor__num__impute__strategy": ["constant"],
    "preprocessor__num__scale": [None],
    "model__scale_pos_weight": [19.577],
    "model__learning_rate": [0.1],
    "model__n_estimators": [31],
    "model__max_depth": [2, 3, 4, 5, 6],
    "model__min_child_weight": [1, 2, 3, 4, 5],
    "model__gamma": [0.1],
    "model__subsample": [0.8],
    "model__colsample_bytree": [0.8],
}
grid = GridSearchCV(
    pipe_xgb, params_xgb, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best XGBoost parameters at Step 1:\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))

Best XGBoost parameters at Step 1:
 {'model__colsample_bytree': 0.8, 'model__gamma': 0.1, 'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__min_child_weight': 3, 'model__n_estimators': 31, 'model__scale_pos_weight': 19.577, 'model__subsample': 0.8, 'preprocessor__num__impute__strategy': 'constant', 'preprocessor__num__scale': None}
Average Precision:  0.239


Second, other tree-based sampling parameters are found.

In [29]:
params_xgb = {
    "preprocessor__num__impute__strategy": ["constant"],
    "preprocessor__num__scale": [None],
    "model__scale_pos_weight": [19.577],
    "model__learning_rate": [0.1],
    "model__n_estimators": [31],
    "model__max_depth": [2],
    "model__min_child_weight": [3],
    "model__gamma": [0, 0.1, 0.5, 1, 2, 3],
    "model__subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "model__colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}
grid = GridSearchCV(
    pipe_xgb, params_xgb, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best XGBoost parameters at Step 2:\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))

Best XGBoost parameters at Step 2:
 {'model__colsample_bytree': 0.6, 'model__gamma': 1, 'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__min_child_weight': 3, 'model__n_estimators': 31, 'model__scale_pos_weight': 19.577, 'model__subsample': 0.8, 'preprocessor__num__impute__strategy': 'constant', 'preprocessor__num__scale': None}
Average Precision:  0.24


Finally, regularization parameters are found and XGBoost classifier is saved to external file.

In [30]:
params_xgb = {
    "preprocessor__num__impute__strategy": ["constant"],
    "preprocessor__num__scale": [None],
    "model__scale_pos_weight": [19.577],
    "model__learning_rate": [0.1],
    "model__n_estimators": [31],
    "model__max_depth": [2],
    "model__min_child_weight": [3],
    "model__gamma": [1],
    "model__subsample": [0.8],
    "model__colsample_bytree": [0.6],
    "model__reg_lambda": [0.01, 0.1, 0.5, 1, 2, 3],
    "model__reg_alpha": [0.01, 0.1, 0.5, 1, 2, 3, 4],
}
grid = GridSearchCV(
    pipe_xgb, params_xgb, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best XGBoost parameters at Step 3 (final):\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))
best_xgb = grid.best_estimator_
joblib.dump(best_xgb, "./tuned_models/best_xgb.sav")

Best XGBoost parameters at Step 3 (final):
 {'model__colsample_bytree': 0.6, 'model__gamma': 1, 'model__learning_rate': 0.1, 'model__max_depth': 2, 'model__min_child_weight': 3, 'model__n_estimators': 31, 'model__reg_alpha': 3, 'model__reg_lambda': 1, 'model__scale_pos_weight': 19.577, 'model__subsample': 0.8, 'preprocessor__num__impute__strategy': 'constant', 'preprocessor__num__scale': None}
Average Precision:  0.248


['./tuned_models/best_xgb.sav']

The same stepwise tuning approach is used for LightGBM classifier. For the defined learning rate, number of estimators is found.

In [31]:
params = {
    "objective": "binary",
    "random_state": 0,
    "scale_pos_weight": scale_pos_weight,
    "learning_rate": 0.1,
    "max_depth": 5,
    "num_leaves": 16,
    "min_child_weight": 1,
    "subsample_freq": 1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "verbose": -1,
}
lgbmtrain = lightgbm.Dataset(X_train_transformed.values, label=y_train.values)
cvresult = lightgbm.cv(
    params,
    lgbmtrain,
    num_boost_round=500,
    nfold=5,
    metrics="average_precision",
    callbacks=[lightgbm.early_stopping(stopping_rounds=50)],
)
print(
    "Number of estimators for 0.1 learning rate:",
    np.argmax(cvresult["valid average_precision-mean"]) + 1,
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[16]	cv_agg's valid average_precision: 0.201244 + 0.0134795
Number of estimators for 0.1 learning rate: 16


First, tree-level parameters are tuned.

In [32]:
params_lgbm = {
    "preprocessor__num__impute__strategy": ["constant"],
    "preprocessor__num__scale": [None],
    "model__scale_pos_weight": [19.577],
    "model__learning_rate": [0.1],
    "model__n_estimators": [16],
    "model__max_depth": [3, 4, 5, 6, 7],
    "model__num_leaves": [5, 10, 16, 20, 25],
    "model__min_child_weight": [1, 2, 3, 4, 5, 6, 7],
    "model__subsample_freq": [1],
    "model__subsample": [0.8],
    "model__colsample_bytree": [0.8],
}
grid = GridSearchCV(
    pipe_lgbm, params_lgbm, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best LightGBM parameters at Step 1:\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))

Best LightGBM parameters at Step 1:
 {'model__colsample_bytree': 0.8, 'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__min_child_weight': 6, 'model__n_estimators': 16, 'model__num_leaves': 10, 'model__scale_pos_weight': 19.577, 'model__subsample': 0.8, 'model__subsample_freq': 1, 'preprocessor__num__impute__strategy': 'constant', 'preprocessor__num__scale': None}
Average Precision:  0.235


Second, sampling parameters are tuned.

In [33]:
params_lgbm = {
    "preprocessor__num__impute__strategy": ["constant"],
    "preprocessor__num__scale": [None],
    "model__scale_pos_weight": [19.577],
    "model__learning_rate": [0.1],
    "model__n_estimators": [16],
    "model__max_depth": [4],
    "model__num_leaves": [10],
    "model__min_child_weight": [6],
    "model__subsample_freq": [1, 2, 5],
    "model__subsample": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    "model__colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}
grid = GridSearchCV(
    pipe_lgbm, params_lgbm, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best LightGBM parameters at Step 2:\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))

Best LightGBM parameters at Step 2:
 {'model__colsample_bytree': 0.7, 'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__min_child_weight': 6, 'model__n_estimators': 16, 'model__num_leaves': 10, 'model__scale_pos_weight': 19.577, 'model__subsample': 1.0, 'model__subsample_freq': 1, 'preprocessor__num__impute__strategy': 'constant', 'preprocessor__num__scale': None}
Average Precision:  0.237


Finally, regularization parameters are found and classifier is saved to external file.

In [34]:
params_lgbm = {
    "preprocessor__num__impute__strategy": ["constant"],
    "preprocessor__num__scale": [None],
    "model__scale_pos_weight": [19.577],
    "model__learning_rate": [0.1],
    "model__n_estimators": [16],
    "model__max_depth": [4],
    "model__num_leaves": [10],
    "model__min_child_weight": [6],
    "model__subsample_freq": [1],
    "model__subsample": [1.0],
    "model__colsample_bytree": [0.7],
    "model__reg_lambda": [0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1],
    "model__reg_alpha": [0, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1],
}
grid = GridSearchCV(
    pipe_lgbm, params_lgbm, cv=5, scoring="average_precision", n_jobs=-1
).fit(X_train, y_train)
print("Best LightGBM parameters at Step 3 (final):\n", grid.best_params_)
print("Average Precision: ", round(grid.best_score_, 3))
best_lgbm = grid.best_estimator_
joblib.dump(best_lgbm, "./tuned_models/best_lgbm.sav")

Best LightGBM parameters at Step 3 (final):
 {'model__colsample_bytree': 0.7, 'model__learning_rate': 0.1, 'model__max_depth': 4, 'model__min_child_weight': 6, 'model__n_estimators': 16, 'model__num_leaves': 10, 'model__reg_alpha': 0.05, 'model__reg_lambda': 1, 'model__scale_pos_weight': 19.577, 'model__subsample': 1.0, 'model__subsample_freq': 1, 'preprocessor__num__impute__strategy': 'constant', 'preprocessor__num__scale': None}
Average Precision:  0.245


['./tuned_models/best_lgbm.sav']