# Imports

In [53]:
import numpy as np
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from pandas.plotting import scatter_matrix
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import confusion_matrix, f1_score, average_precision_score, classification_report, fbeta_score, accuracy_score
from sklearn.feature_selection import RFECV

from statsmodels.stats.outliers_influence import variance_inflation_factor

import optuna
import statsmodels.api as sm
from boruta import BorutaPy

import custom_map

In [54]:
import importlib

importlib.reload(custom_map)

<module 'custom_map' from '/Users/dominikmika/PycharmProjects/Ridge-hillclimbing/custom_map.py'>

# Setup

In [55]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
bmi_median = data['bmi'].median()
data['bmi'] = data['bmi'].fillna(bmi_median)

target = "stroke"

categorical_raw = data.select_dtypes(include=['object']).columns.tolist()
numerical_features = data.select_dtypes(include=['float64', 'int64']).columns.drop('id')

data = pd.get_dummies(
    data,
    columns=categorical_raw,
    drop_first=True,
    dtype=float
)

data = data.drop('id', axis=1)

binary_features = [
    'hypertension',
    'heart_disease',
    'stroke'
]

numerical_features = numerical_features.drop(binary_features)

numerical_binary_features = numerical_features.union(binary_features)

categorical_features = [
    col for col in data.columns
    if any(col.startswith(c + "_") for c in categorical_raw)
]

categorical_features = categorical_features + binary_features
categorical_features = list(dict.fromkeys(categorical_features))  # deduplikacja

print("Categorical:", categorical_features)
print("Numerical:", numerical_features.tolist())
print("Numerical + binary:", numerical_binary_features.tolist())


Categorical: ['gender_Male', 'gender_Other', 'ever_married_Yes', 'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children', 'Residence_type_Urban', 'smoking_status_formerly smoked', 'smoking_status_never smoked', 'smoking_status_smokes', 'hypertension', 'heart_disease', 'stroke']
Numerical: ['age', 'avg_glucose_level', 'bmi']
Numerical + binary: ['age', 'avg_glucose_level', 'bmi', 'heart_disease', 'hypertension', 'stroke']


In [56]:
X = data.drop(columns=[target])
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, shuffle=True)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [57]:
## hard coding because of stochasticity

all_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "gender_Male", "gender_Other", "ever_married_Yes",
                "work_type_Never_worked", "work_type_Private", "work_type_Self-employed",
                "work_type_children", "Residence_type_Urban",
                "smoking_status_formerly smoked", "smoking_status_never smoked",
                "smoking_status_smokes"]

boruta_features = ["age", "avg_glucose_level", "bmi"]

corr_features = ["age", "heart_disease", "avg_glucose_level", "hypertension",
                 "ever_married_Yes", "smoking_status_formerly smoked",
                 "work_type_Self-employed"]

mi_features = ["age", "hypertension", "gender_Other",
               "work_type_Private", "smoking_status_formerly smoked"]

rfe_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "work_type_Never_worked", "work_type_children",
                "Residence_type_Urban", "smoking_status_never smoked",
                "smoking_status_smokes"]

FEATURE_SETS = {
    "all": all_features,
    "boruta": boruta_features,
    "correlation": corr_features,
    "mi": mi_features,
    "rfe": rfe_features
}

In [58]:
storage_url = "sqlite:///optuna_studies.db"
cv = StratifiedKFold(5, shuffle=True, random_state=42)

MODELS = [
    "logreg", "knn", "svm", "gnb", "dt",
    "rf", "ada", "gb", "extra",
    "lgbm", "xgb", "cat"
]

SMOTE_MODELS = {"logreg", "knn", "svm", "gnb"}

# Results

In [59]:
def find_best_f1_threshold(model, X, y, thresholds=np.linspace(0.01, 0.5, 50)):
    proba = model.predict_proba(X)[:, 1]

    best_f1 = 0.0
    best_thr = 0.5

    for t in thresholds:
        y_pred = (proba >= t).astype(int)
        f1 = f1_score(y, y_pred)

        if f1 > best_f1:
            best_f1 = f1
            best_thr = t

    return best_thr, best_f1

In [60]:
for feature_name, feature_list in FEATURE_SETS.items():

    X_train_sel = X_train[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/prob_f1/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]

        best_thr, best_f1 = find_best_f1_threshold(
            model, X_train_sel, y_train
        )

        artifact = {
            "model": model,
            "threshold": best_thr,
            "f1_train": best_f1
        }

        with open(f"models/{study_name}.pkl", "wb") as f:
            pickle.dump(artifact, f)

        print(
            f"SAVED {study_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_train={best_f1:.4f}"
        )

SAVED logreg_all_prob | thr=0.50 | F1_train=0.2313
SAVED knn_all_prob | thr=0.48 | F1_train=0.8677
SAVED svm_all_prob | thr=0.14 | F1_train=0.4829
SAVED gnb_all_prob | thr=0.37 | F1_train=0.1063
SAVED dt_all_prob | thr=0.44 | F1_train=0.2526
SAVED rf_all_prob | thr=0.50 | F1_train=0.3609
SAVED ada_all_prob | thr=0.42 | F1_train=0.2910
SAVED gb_all_prob | thr=0.39 | F1_train=0.8757
SAVED extra_all_prob | thr=0.50 | F1_train=0.2575
SAVED lgbm_all_prob | thr=0.50 | F1_train=0.4887
SAVED xgb_all_prob | thr=0.31 | F1_train=0.7128
SAVED cat_all_prob | thr=0.50 | F1_train=0.6031
SAVED logreg_boruta_prob | thr=0.50 | F1_train=0.2262
SAVED knn_boruta_prob | thr=0.01 | F1_train=0.4090
SAVED svm_boruta_prob | thr=0.14 | F1_train=0.2886
SAVED gnb_boruta_prob | thr=0.12 | F1_train=0.2515
SAVED dt_boruta_prob | thr=0.46 | F1_train=0.2306
SAVED rf_boruta_prob | thr=0.50 | F1_train=0.3340
SAVED ada_boruta_prob | thr=0.32 | F1_train=0.2975
SAVED gb_boruta_prob | thr=0.32 | F1_train=0.8474
SAVED extra_b

In [61]:
results = pd.DataFrame(
    index=MODELS,
    columns=FEATURE_SETS.keys(),
    dtype=float
)

for feature_name, feature_list in FEATURE_SETS.items():

    X_test_sel = X_test[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]
        best_thr = artifact["threshold"]

        proba_test = model.predict_proba(X_test_sel)[:, 1]
        y_pred_test = (proba_test >= best_thr).astype(int)

        f1 = f1_score(y_test, y_pred_test)
        results.loc[model_name, feature_name] = f1

        print(
            f"{model_name}_{feature_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_test={f1:.4f}"
        )

logreg_all | thr=0.50 | F1_test=0.2143
knn_all | thr=0.48 | F1_test=0.8043
svm_all | thr=0.14 | F1_test=0.4361
gnb_all | thr=0.37 | F1_test=0.1064
dt_all | thr=0.44 | F1_test=0.2474
rf_all | thr=0.50 | F1_test=0.3171
ada_all | thr=0.42 | F1_test=0.2637
gb_all | thr=0.39 | F1_test=0.7865
extra_all | thr=0.50 | F1_test=0.2346
lgbm_all | thr=0.50 | F1_test=0.4242
xgb_all | thr=0.31 | F1_test=0.7500
cat_all | thr=0.50 | F1_test=0.5298
logreg_boruta | thr=0.50 | F1_test=0.2164
knn_boruta | thr=0.01 | F1_test=0.3942
svm_boruta | thr=0.14 | F1_test=0.2679
gnb_boruta | thr=0.12 | F1_test=0.2550
dt_boruta | thr=0.46 | F1_test=0.2217
rf_boruta | thr=0.50 | F1_test=0.2974
ada_boruta | thr=0.32 | F1_test=0.2842
gb_boruta | thr=0.32 | F1_test=0.7551
extra_boruta | thr=0.50 | F1_test=0.2493
lgbm_boruta | thr=0.50 | F1_test=0.3009
xgb_boruta | thr=0.34 | F1_test=0.5600
cat_boruta | thr=0.50 | F1_test=0.4179
logreg_correlation | thr=0.50 | F1_test=0.1972
knn_correlation | thr=0.01 | F1_test=0.3846
svm

In [62]:
results = results.sort_values(
    by=results.columns.tolist(),
    ascending=False
)

results

Unnamed: 0,all,boruta,correlation,mi,rfe
knn,0.804348,0.394231,0.384615,0.235808,0.853933
gb,0.786517,0.755102,0.786517,0.191304,0.735632
xgb,0.75,0.56,0.826087,0.245902,0.817204
cat,0.529801,0.41791,0.42,0.255072,0.469274
svm,0.43609,0.267943,0.284024,0.242775,0.285714
lgbm,0.424242,0.30094,0.361446,0.259887,0.455959
rf,0.317073,0.297398,0.326923,0.27541,0.263666
ada,0.263736,0.284211,0.240437,0.273224,0.280899
dt,0.247368,0.221662,0.296875,0.259366,0.21875
extra,0.234568,0.249258,0.259136,0.246334,0.229226


In [63]:
def load_all_model_predictions(
    MODELS, FEATURE_SETS,
    X_train, y_train,
    X_test
):
    train_preds = []
    test_preds = []
    model_names = []

    for feature_name, feature_list in FEATURE_SETS.items():
        X_train_sel = X_train[feature_list]
        X_test_sel = X_test[feature_list]

        for model_name in MODELS:
            study_name = f"{model_name}_{feature_name}_prob"

            with open(f"models/{study_name}.pkl", "rb") as f:
                artifact = pickle.load(f)

            model = artifact["model"]

            p_train = model.predict_proba(X_train_sel)[:, 1]
            p_test = model.predict_proba(X_test_sel)[:, 1]

            train_preds.append(p_train)
            test_preds.append(p_test)
            model_names.append(study_name)

    P_train = np.column_stack(train_preds)
    P_test = np.column_stack(test_preds)

    return P_train, P_test, model_names


In [64]:
def extract_feature_set(model_name, FEATURE_SETS):
    for fs in FEATURE_SETS:
        if model_name.endswith(f"_{fs}"):
            return fs
    raise ValueError(f"Nie można dopasować feature setu do modelu: {model_name}")

def evaluate_single_model(
    artifact,
    model_name,
    X_train, y_train,
    X_test, y_test,
    FEATURE_SETS
):
    model = artifact["model"]
    threshold = artifact.get("threshold", 0.5)

    feature_set = extract_feature_set(model_name, FEATURE_SETS)
    features = FEATURE_SETS[feature_set]

    Xtr = X_train[features]
    Xte = X_test[features]

    train_proba = model.predict_proba(Xtr)[:, 1]
    test_proba = model.predict_proba(Xte)[:, 1]

    train_pred = (train_proba >= threshold).astype(int)
    test_pred = (test_proba >= threshold).astype(int)

    return (
        {
            "accuracy": accuracy_score(y_train, train_pred),
            "f1": f1_score(y_train, train_pred),
            "avg_precision": average_precision_score(y_train, train_proba),
        },
        {
            "accuracy": accuracy_score(y_test, test_pred),
            "f1": f1_score(y_test, test_pred),
            "avg_precision": average_precision_score(y_test, test_proba),
        },
        threshold
    )

def evaluate_saved_models(
    model_paths,
    X_train, y_train,
    X_test, y_test,
    MODELS,
    FEATURE_SETS
):
    rows = []

    for name, path in model_paths.items():

        with open(path, "rb") as f:
            artifact = pickle.load(f)

        # =========================
        # ENSEMBLE HILL CLIMBING
        # =========================
        if isinstance(artifact, dict) and artifact.get("type") == "hill_climbing_ensemble":

            weights = artifact["weights"]
            threshold = artifact["threshold"]

            P_train, P_test, model_names = load_all_model_predictions(
                MODELS, FEATURE_SETS,
                X_train, y_train,
                X_test
            )

            train_proba = P_train @ weights
            test_proba = P_test @ weights

            train_pred = (train_proba >= threshold).astype(int)
            test_pred = (test_proba >= threshold).astype(int)

            train_scores = {
                "accuracy": accuracy_score(y_train, train_pred),
                "f1": f1_score(y_train, train_pred),
                "avg_precision": average_precision_score(y_train, train_proba),
            }

            test_scores = {
                "accuracy": accuracy_score(y_test, test_pred),
                "f1": f1_score(y_test, test_pred),
                "avg_precision": average_precision_score(y_test, test_proba),
            }

        # =========================
        # POJEDYNCZY MODEL
        # =========================
        else:
            train_scores, test_scores, threshold = evaluate_single_model(
                artifact,
                name,
                X_train, y_train,
                X_test, y_test,
                FEATURE_SETS
            )

        rows.append({
            "model": name,

            "train_f1": train_scores["f1"],
            "test_f1": test_scores["f1"],

            "train_accuracy": train_scores["accuracy"],
            "test_accuracy": test_scores["accuracy"],

            "train_avg_precision": train_scores["avg_precision"],
            "test_avg_precision": test_scores["avg_precision"],

            "threshold": threshold
        })

    return (
        pd.DataFrame(rows)
        .set_index("model")
        .sort_values("test_avg_precision", ascending=False)
    )



In [65]:
MODEL_PATHS = {
    "knn_all": "models/prob_f1/knn_all_prob.pkl",
    "knn_rfe": "models/prob_f1/knn_rfe_prob.pkl",

    "gb_all": "models/prob_f1/gb_all_prob.pkl",
    "gb_boruta": "models/prob_f1/gb_boruta_prob.pkl",
    "gb_correlation": "models/prob_f1/gb_correlation_prob.pkl",
    "gb_rfe": "models/prob_f1/gb_rfe_prob.pkl",

    "xgb_all": "models/prob_f1/xgb_all_prob.pkl",
    "xgb_correlation": "models/prob_f1/xgb_correlation_prob.pkl",
    "xgb_rfe": "models/prob_f1/xgb_rfe_prob.pkl",

    "cat_all": "models/prob_f1/cat_all_prob.pkl",
    "ensemble_hill": "models/hc/hc_ensemble.pkl",
    "ensemble_hill_2": "models/hc/hc_ensemble_v2.pkl",
    "ensemble_hill_3": "models/hc/hc_ensemble_v3.pkl",
    "ensemble_hill_4": "models/hc/hc_ensemble_v4.pkl",
    "ensemble_hill_5": "models/hc/hc_ensemble_v5.pkl"
}

results = evaluate_saved_models(
    MODEL_PATHS,
    X_train, y_train,
    X_test, y_test,
    MODELS=MODELS,
    FEATURE_SETS=FEATURE_SETS
)

results = results.sort_values("test_f1", ascending=False)
results

Unnamed: 0_level_0,train_f1,test_f1,train_accuracy,test_accuracy,train_avg_precision,test_avg_precision,threshold
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ensemble_hill_5,0.891304,0.831461,0.990215,0.985323,0.903436,0.850767,0.49
ensemble_hill_3,0.88172,0.831461,0.989237,0.985323,0.900267,0.845455,0.5
ensemble_hill_4,0.891892,0.822222,0.990215,0.984344,0.901268,0.848073,0.5
ensemble_hill,0.894309,0.822222,0.99046,0.984344,0.90196,0.845741,0.5
xgb_correlation,0.807882,0.817204,0.98092,0.983366,0.81719,0.801281,0.33
gb_all,0.863874,0.808511,0.98728,0.982387,0.882328,0.806247,0.27
xgb_rfe,0.812352,0.808081,0.980675,0.981409,0.863142,0.822725,0.12
ensemble_hill_2,0.851282,0.8,0.985812,0.981409,0.893658,0.828619,0.5
gb_correlation,0.84399,0.774194,0.985078,0.979452,0.85814,0.772712,0.28
xgb_all,0.710327,0.762887,0.971869,0.977495,0.730478,0.687398,0.29


Conclusion:
- Weight is the most important hiperparameter (in scikitlearn None is default for some reason)
- Good ensemble is always better than no ensemble
- GBs are very good

Discussion:
- Oversampling poprzez imbalanced learn
- Avarage precision score in place of f1 for recall < precision importance for unbalanced classes
- Mutual information and feature selection should be more researched
- as well as other ensemble methods
- and statistics (ofc)