# Imports

In [127]:
import numpy as np
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from pandas.plotting import scatter_matrix
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import confusion_matrix, f1_score, average_precision_score, classification_report, fbeta_score, accuracy_score
from sklearn.feature_selection import RFECV

from statsmodels.stats.outliers_influence import variance_inflation_factor

import optuna
import statsmodels.api as sm
from boruta import BorutaPy

import custom_map

In [128]:
import importlib

importlib.reload(custom_map)

<module 'custom_map' from '/Users/dominikmika/PycharmProjects/Ridge-hillclimbing/custom_map.py'>

# Setup

In [129]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
bmi_median = data['bmi'].median()
data['bmi'] = data['bmi'].fillna(bmi_median)
data.info()

categorical_features = data.select_dtypes(['object']).columns.tolist()
numerical_features = data.select_dtypes(['float64', 'int64']).columns.drop('id')

data = pd.get_dummies(data, columns=categorical_features, drop_first=True, dtype=float)
data = data.drop('id', axis=1)

target = "stroke"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [130]:
X = data.drop(columns=[target])
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, shuffle=True)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [131]:
## hard coding because of stochasticity

all_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "gender_Male", "gender_Other", "ever_married_Yes",
                "work_type_Never_worked", "work_type_Private", "work_type_Self-employed",
                "work_type_children", "Residence_type_Urban",
                "smoking_status_formerly smoked", "smoking_status_never smoked",
                "smoking_status_smokes"]

boruta_features = ["age", "avg_glucose_level", "bmi"]

corr_features = ["age", "heart_disease", "avg_glucose_level", "hypertension",
                 "ever_married_Yes", "smoking_status_formerly smoked",
                 "work_type_Self-employed"]

mi_features = ["age", "hypertension", "gender_Other",
               "work_type_Private", "smoking_status_formerly smoked"]

rfe_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "work_type_Never_worked", "work_type_children",
                "Residence_type_Urban", "smoking_status_never smoked",
                "smoking_status_smokes"]

FEATURE_SETS = {
    "all": all_features,
    "boruta": boruta_features,
    "correlation": corr_features,
    "mi": mi_features,
    "rfe": rfe_features
}

In [132]:
storage_url = "sqlite:///optuna_studies.db"
cv = StratifiedKFold(5, shuffle=True, random_state=42)

MODELS = [
    "logreg", "knn", "svm", "gnb", "dt",
    "rf", "ada", "gb", "extra",
    "lgbm", "xgb", "cat"
]

SMOTE_MODELS = {"logreg", "knn", "svm", "gnb"}

# Results

In [133]:
def find_best_f1_threshold(model, X, y, thresholds=np.linspace(0.01, 0.5, 50)):
    proba = model.predict_proba(X)[:, 1]

    best_f1 = 0.0
    best_thr = 0.5

    for t in thresholds:
        y_pred = (proba >= t).astype(int)
        f1 = f1_score(y, y_pred)

        if f1 > best_f1:
            best_f1 = f1
            best_thr = t

    return best_thr, best_f1

In [134]:
for feature_name, feature_list in FEATURE_SETS.items():

    X_train_sel = X_train[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/prob_f1/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]

        best_thr, best_f1 = find_best_f1_threshold(
            model, X_train_sel, y_train
        )

        artifact = {
            "model": model,
            "threshold": best_thr,
            "f1_train": best_f1
        }

        with open(f"models/{study_name}.pkl", "wb") as f:
            pickle.dump(artifact, f)

        print(
            f"SAVED {study_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_train={best_f1:.4f}"
        )

SAVED logreg_all_prob | thr=0.50 | F1_train=0.2284
SAVED knn_all_prob | thr=0.50 | F1_train=0.8387
SAVED svm_all_prob | thr=0.14 | F1_train=0.4621
SAVED gnb_all_prob | thr=0.04 | F1_train=0.1069
SAVED dt_all_prob | thr=0.44 | F1_train=0.2476
SAVED rf_all_prob | thr=0.50 | F1_train=0.3497
SAVED ada_all_prob | thr=0.42 | F1_train=0.2782
SAVED gb_all_prob | thr=0.31 | F1_train=0.8410
SAVED extra_all_prob | thr=0.50 | F1_train=0.2521
SAVED lgbm_all_prob | thr=0.50 | F1_train=0.4619
SAVED xgb_all_prob | thr=0.28 | F1_train=0.7110
SAVED cat_all_prob | thr=0.50 | F1_train=0.5763
SAVED logreg_boruta_prob | thr=0.50 | F1_train=0.2242
SAVED knn_boruta_prob | thr=0.01 | F1_train=0.4018
SAVED svm_boruta_prob | thr=0.14 | F1_train=0.2759
SAVED gnb_boruta_prob | thr=0.12 | F1_train=0.2485
SAVED dt_boruta_prob | thr=0.46 | F1_train=0.2224
SAVED rf_boruta_prob | thr=0.50 | F1_train=0.3255
SAVED ada_boruta_prob | thr=0.32 | F1_train=0.2802
SAVED gb_boruta_prob | thr=0.34 | F1_train=0.8213
SAVED extra_b

In [135]:
results = pd.DataFrame(
    index=MODELS,
    columns=FEATURE_SETS.keys(),
    dtype=float
)

for feature_name, feature_list in FEATURE_SETS.items():

    X_test_sel = X_test[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]
        best_thr = artifact["threshold"]

        proba_test = model.predict_proba(X_test_sel)[:, 1]
        y_pred_test = (proba_test >= best_thr).astype(int)

        f1 = f1_score(y_test, y_pred_test)
        results.loc[model_name, feature_name] = f1

        print(
            f"{model_name}_{feature_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_test={f1:.4f}"
        )

logreg_all | thr=0.50 | F1_test=0.2254
knn_all | thr=0.50 | F1_test=0.9278
svm_all | thr=0.14 | F1_test=0.5191
gnb_all | thr=0.04 | F1_test=0.1039
dt_all | thr=0.44 | F1_test=0.2674
rf_all | thr=0.50 | F1_test=0.3604
ada_all | thr=0.42 | F1_test=0.3171
gb_all | thr=0.31 | F1_test=0.9388
extra_all | thr=0.50 | F1_test=0.2564
lgbm_all | thr=0.50 | F1_test=0.5291
xgb_all | thr=0.28 | F1_test=0.7500
cat_all | thr=0.50 | F1_test=0.6351
logreg_boruta | thr=0.50 | F1_test=0.2241
knn_boruta | thr=0.01 | F1_test=0.4229
svm_boruta | thr=0.14 | F1_test=0.3209
gnb_boruta | thr=0.12 | F1_test=0.2684
dt_boruta | thr=0.46 | F1_test=0.2561
rf_boruta | thr=0.50 | F1_test=0.3292
ada_boruta | thr=0.32 | F1_test=0.3571
gb_boruta | thr=0.34 | F1_test=0.8660
extra_boruta | thr=0.50 | F1_test=0.2653
lgbm_boruta | thr=0.50 | F1_test=0.3289
xgb_boruta | thr=0.34 | F1_test=0.5800
cat_boruta | thr=0.50 | F1_test=0.4974
logreg_correlation | thr=0.50 | F1_test=0.1948
knn_correlation | thr=0.01 | F1_test=0.4498
svm

In [136]:
results = results.sort_values(
    by=results.columns.tolist(),
    ascending=False
)

results

Unnamed: 0,all,boruta,correlation,mi,rfe
gb,0.938776,0.865979,0.910891,0.325203,0.905263
knn,0.927835,0.422907,0.449761,0.296296,0.929293
xgb,0.75,0.58,0.877551,0.354286,0.909091
cat,0.635135,0.497409,0.502618,0.267516,0.548571
lgbm,0.529101,0.328947,0.4329,0.254658,0.561798
svm,0.519084,0.320856,0.402597,0.273292,0.395062
rf,0.36036,0.329218,0.406091,0.260563,0.275618
ada,0.317073,0.357143,0.315789,0.339394,0.33121
dt,0.267409,0.256131,0.363636,0.242236,0.268571
extra,0.25641,0.265306,0.288809,0.23676,0.240741


In [137]:
def load_all_model_predictions(
    MODELS, FEATURE_SETS,
    X_train, y_train,
    X_test
):
    train_preds = []
    test_preds = []
    model_names = []

    for feature_name, feature_list in FEATURE_SETS.items():
        X_train_sel = X_train[feature_list]
        X_test_sel = X_test[feature_list]

        for model_name in MODELS:
            study_name = f"{model_name}_{feature_name}_prob"

            with open(f"models/{study_name}.pkl", "rb") as f:
                artifact = pickle.load(f)

            model = artifact["model"]

            p_train = model.predict_proba(X_train_sel)[:, 1]
            p_test = model.predict_proba(X_test_sel)[:, 1]

            train_preds.append(p_train)
            test_preds.append(p_test)
            model_names.append(study_name)

    P_train = np.column_stack(train_preds)
    P_test = np.column_stack(test_preds)

    return P_train, P_test, model_names


In [138]:
def predict_with_threshold(model, X, threshold=0.5):
    proba = model.predict_proba(X)[:, 1]
    y_pred = (proba >= threshold).astype(int)
    return y_pred, proba

def evaluate_model(model, X, y, threshold=0.5):
    y_pred, proba = predict_with_threshold(model, X, threshold)

    return {
        "accuracy": accuracy_score(y, y_pred),
        "f1": f1_score(y, y_pred),
        "avg_precision": average_precision_score(y, proba)
    }

def evaluate_saved_models(
    model_paths,
    X_train, y_train,
    X_test, y_test
):
    rows = []

    for name, path in model_paths.items():

        with open(path, "rb") as f:
            artifact = pickle.load(f)

        if isinstance(artifact, dict) and artifact.get("type") == "hill_climbing_ensemble":

            weights = artifact["weights"]
            threshold = artifact["threshold"]

            P_train, P_test, _ = load_all_model_predictions(
                MODELS, FEATURE_SETS,
                X_train, y_train,
                X_test
            )

            train_proba = P_train @ weights
            train_pred = (train_proba >= threshold).astype(int)

            test_proba = P_test @ weights
            test_pred = (test_proba >= threshold).astype(int)

            train_scores = {
                "accuracy": accuracy_score(y_train, train_pred),
                "f1": f1_score(y_train, train_pred),
                "avg_precision": average_precision_score(y_train, train_proba)
            }

            test_scores = {
                "accuracy": accuracy_score(y_test, test_pred),
                "f1": f1_score(y_test, test_pred),
                "avg_precision": average_precision_score(y_test, test_proba)
            }

        else:
            if isinstance(artifact, dict):
                model = artifact["model"]
                threshold = artifact.get("threshold", 0.5)
            else:
                model = artifact
                threshold = 0.5

            train_scores = evaluate_model(
                model, X_train, y_train, threshold
            )

            test_scores = evaluate_model(
                model, X_test, y_test, threshold
            )

        rows.append({
            "model": name,

            "train_f1": train_scores["f1"],
            "test_f1": test_scores["f1"],

            "train_accuracy": train_scores["accuracy"],
            "test_accuracy": test_scores["accuracy"],

            "train_avg_precision": train_scores["avg_precision"],
            "test_avg_precision": test_scores["avg_precision"],

            "threshold": threshold
        })

    df = pd.DataFrame(rows).set_index("model")

    return df.sort_values("test_avg_precision", ascending=False)

In [139]:
MODEL_PATHS = {
    "rf_all": "models/prob_f1/rf_all_prob.pkl",
    "xgb_all": "models/prob_f1/xgb_all_prob.pkl",
    "cat_all": "models/prob_f1/cat_all_prob.pkl",
    "ensemble_hill": "models/hc/hc_ensemble.pkl"
}

results = evaluate_saved_models(
    MODEL_PATHS,
    X_train, y_train,
    X_test, y_test
)

results

Unnamed: 0_level_0,train_f1,test_f1,train_accuracy,test_accuracy,train_avg_precision,test_avg_precision,threshold
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ensemble_hill,0.867403,0.927835,0.988258,0.993151,0.871676,0.961668,0.5
cat_all,0.576271,0.635135,0.938845,0.947162,0.810996,0.882984,0.5
xgb_all,0.710183,0.756757,0.972847,0.973581,0.697247,0.795387,0.29
rf_all,0.349738,0.36036,0.848092,0.861057,0.455327,0.564512,0.5


Wnioski:
- Wagi bardzo waÅºny hiperparametr
- Dobry ensembling zawsze polepsza wyniki

Discussion:
- Oversampling poprzez imbalanced learn
- Avarage precision score zamiast f1 dla recall > precision importance dla nie zbalansowanych klas
