# Imports

In [156]:
import numpy as np
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from pandas.plotting import scatter_matrix
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import confusion_matrix, f1_score, average_precision_score, classification_report, fbeta_score, accuracy_score
from sklearn.feature_selection import RFECV

from statsmodels.stats.outliers_influence import variance_inflation_factor

import optuna
import statsmodels.api as sm
from boruta import BorutaPy

import custom_map

In [157]:
import importlib

importlib.reload(custom_map)

<module 'custom_map' from '/Users/dominikmika/PycharmProjects/Ridge-hillclimbing/custom_map.py'>

# Setup

In [158]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
bmi_median = data['bmi'].median()
data['bmi'] = data['bmi'].fillna(bmi_median)
data.info()

categorical_features = data.select_dtypes(['object']).columns.tolist()
numerical_features = data.select_dtypes(['float64', 'int64']).columns.drop('id')

data = pd.get_dummies(data, columns=categorical_features, drop_first=True, dtype=float)
data = data.drop('id', axis=1)

target = "stroke"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [159]:
X = data.drop(columns=[target])
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, shuffle=True)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [160]:
## hard coding because of stochasticity

all_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "gender_Male", "gender_Other", "ever_married_Yes",
                "work_type_Never_worked", "work_type_Private", "work_type_Self-employed",
                "work_type_children", "Residence_type_Urban",
                "smoking_status_formerly smoked", "smoking_status_never smoked",
                "smoking_status_smokes"]

boruta_features = ["age", "avg_glucose_level", "bmi"]

corr_features = ["age", "heart_disease", "avg_glucose_level", "hypertension",
                 "ever_married_Yes", "smoking_status_formerly smoked",
                 "work_type_Self-employed"]

mi_features = ["age", "hypertension", "gender_Other",
               "work_type_Private", "smoking_status_formerly smoked"]

rfe_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "work_type_Never_worked", "work_type_children",
                "Residence_type_Urban", "smoking_status_never smoked",
                "smoking_status_smokes"]

FEATURE_SETS = {
    "all": all_features,
    "boruta": boruta_features,
    "correlation": corr_features,
    "mi": mi_features,
    "rfe": rfe_features
}

In [161]:
storage_url = "sqlite:///optuna_studies.db"
cv = StratifiedKFold(5, shuffle=True, random_state=42)

MODELS = [
    "logreg", "knn", "svm", "gnb", "dt",
    "rf", "ada", "gb", "extra",
    "lgbm", "xgb", "cat"
]

SMOTE_MODELS = {"logreg", "knn", "svm", "gnb"}

# Results

In [162]:
def find_best_f1_threshold(model, X, y, thresholds=np.linspace(0.01, 0.5, 50)):
    proba = model.predict_proba(X)[:, 1]

    best_f1 = 0.0
    best_thr = 0.5

    for t in thresholds:
        y_pred = (proba >= t).astype(int)
        f1 = f1_score(y, y_pred)

        if f1 > best_f1:
            best_f1 = f1
            best_thr = t

    return best_thr, best_f1

In [163]:
for feature_name, feature_list in FEATURE_SETS.items():

    X_train_sel = X_train[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/prob_f1/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]

        best_thr, best_f1 = find_best_f1_threshold(
            model, X_train_sel, y_train
        )

        artifact = {
            "model": model,
            "threshold": best_thr,
            "f1_train": best_f1
        }

        with open(f"models/{study_name}.pkl", "wb") as f:
            pickle.dump(artifact, f)

        print(
            f"SAVED {study_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_train={best_f1:.4f}"
        )

SAVED logreg_all_prob | thr=0.50 | F1_train=0.2241
SAVED knn_all_prob | thr=0.48 | F1_train=0.8496
SAVED svm_all_prob | thr=0.14 | F1_train=0.4515
SAVED gnb_all_prob | thr=0.37 | F1_train=0.1063
SAVED dt_all_prob | thr=0.44 | F1_train=0.2527
SAVED rf_all_prob | thr=0.50 | F1_train=0.3460
SAVED ada_all_prob | thr=0.42 | F1_train=0.2699
SAVED gb_all_prob | thr=0.31 | F1_train=0.8579
SAVED extra_all_prob | thr=0.50 | F1_train=0.2506
SAVED lgbm_all_prob | thr=0.50 | F1_train=0.4737
SAVED xgb_all_prob | thr=0.29 | F1_train=0.7143
SAVED cat_all_prob | thr=0.50 | F1_train=0.6007
SAVED logreg_boruta_prob | thr=0.50 | F1_train=0.2200
SAVED knn_boruta_prob | thr=0.01 | F1_train=0.4069
SAVED svm_boruta_prob | thr=0.13 | F1_train=0.2750
SAVED gnb_boruta_prob | thr=0.12 | F1_train=0.2492
SAVED dt_boruta_prob | thr=0.46 | F1_train=0.2281
SAVED rf_boruta_prob | thr=0.50 | F1_train=0.3239
SAVED ada_boruta_prob | thr=0.32 | F1_train=0.2809
SAVED gb_boruta_prob | thr=0.32 | F1_train=0.8272
SAVED extra_b

In [164]:
results = pd.DataFrame(
    index=MODELS,
    columns=FEATURE_SETS.keys(),
    dtype=float
)

for feature_name, feature_list in FEATURE_SETS.items():

    X_test_sel = X_test[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]
        best_thr = artifact["threshold"]

        proba_test = model.predict_proba(X_test_sel)[:, 1]
        y_pred_test = (proba_test >= best_thr).astype(int)

        f1 = f1_score(y_test, y_pred_test)
        results.loc[model_name, feature_name] = f1

        print(
            f"{model_name}_{feature_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_test={f1:.4f}"
        )

logreg_all | thr=0.50 | F1_test=0.2429
knn_all | thr=0.48 | F1_test=0.8791
svm_all | thr=0.14 | F1_test=0.5691
gnb_all | thr=0.37 | F1_test=0.1065
dt_all | thr=0.44 | F1_test=0.2466
rf_all | thr=0.50 | F1_test=0.3755
ada_all | thr=0.42 | F1_test=0.3441
gb_all | thr=0.31 | F1_test=0.8750
extra_all | thr=0.50 | F1_test=0.2622
lgbm_all | thr=0.50 | F1_test=0.4817
xgb_all | thr=0.29 | F1_test=0.7451
cat_all | thr=0.50 | F1_test=0.5395
logreg_boruta | thr=0.50 | F1_test=0.2411
knn_boruta | thr=0.01 | F1_test=0.4037
svm_boruta | thr=0.13 | F1_test=0.3173
gnb_boruta | thr=0.12 | F1_test=0.2645
dt_boruta | thr=0.46 | F1_test=0.2314
rf_boruta | thr=0.50 | F1_test=0.3361
ada_boruta | thr=0.32 | F1_test=0.3478
gb_boruta | thr=0.32 | F1_test=0.8333
extra_boruta | thr=0.50 | F1_test=0.2783
lgbm_boruta | thr=0.50 | F1_test=0.2994
xgb_boruta | thr=0.32 | F1_test=0.5556
cat_boruta | thr=0.50 | F1_test=0.4216
logreg_correlation | thr=0.50 | F1_test=0.2238
knn_correlation | thr=0.01 | F1_test=0.4054
svm

In [165]:
results = results.sort_values(
    by=results.columns.tolist(),
    ascending=False
)

results

Unnamed: 0,all,boruta,correlation,mi,rfe
knn,0.879121,0.40367,0.405405,0.305556,0.901099
gb,0.875,0.833333,0.863158,0.371585,0.795699
xgb,0.745098,0.555556,0.828283,0.344828,0.851064
svm,0.569106,0.317308,0.367816,0.310502,0.433121
cat,0.539474,0.421569,0.438776,0.273256,0.466667
lgbm,0.481675,0.299363,0.377049,0.25,0.513966
rf,0.375546,0.336066,0.421622,0.263666,0.288591
ada,0.344086,0.347826,0.338462,0.336957,0.350282
extra,0.262195,0.278317,0.297578,0.252199,0.252199
dt,0.246649,0.231362,0.281369,0.267442,0.233422


In [166]:
def load_all_model_predictions(
    MODELS, FEATURE_SETS,
    X_train, y_train,
    X_test
):
    train_preds = []
    test_preds = []
    model_names = []

    for feature_name, feature_list in FEATURE_SETS.items():
        X_train_sel = X_train[feature_list]
        X_test_sel = X_test[feature_list]

        for model_name in MODELS:
            study_name = f"{model_name}_{feature_name}_prob"

            with open(f"models/{study_name}.pkl", "rb") as f:
                artifact = pickle.load(f)

            model = artifact["model"]

            p_train = model.predict_proba(X_train_sel)[:, 1]
            p_test = model.predict_proba(X_test_sel)[:, 1]

            train_preds.append(p_train)
            test_preds.append(p_test)
            model_names.append(study_name)

    P_train = np.column_stack(train_preds)
    P_test = np.column_stack(test_preds)

    return P_train, P_test, model_names


In [167]:
def extract_feature_set(model_name, FEATURE_SETS):
    for fs in FEATURE_SETS:
        if model_name.endswith(f"_{fs}"):
            return fs
    raise ValueError(f"Nie można dopasować feature setu do modelu: {model_name}")

def evaluate_single_model(
    artifact,
    model_name,
    X_train, y_train,
    X_test, y_test,
    FEATURE_SETS
):
    model = artifact["model"]
    threshold = artifact.get("threshold", 0.5)

    feature_set = extract_feature_set(model_name, FEATURE_SETS)
    features = FEATURE_SETS[feature_set]

    Xtr = X_train[features]
    Xte = X_test[features]

    train_proba = model.predict_proba(Xtr)[:, 1]
    test_proba = model.predict_proba(Xte)[:, 1]

    train_pred = (train_proba >= threshold).astype(int)
    test_pred = (test_proba >= threshold).astype(int)

    return (
        {
            "accuracy": accuracy_score(y_train, train_pred),
            "f1": f1_score(y_train, train_pred),
            "avg_precision": average_precision_score(y_train, train_proba),
        },
        {
            "accuracy": accuracy_score(y_test, test_pred),
            "f1": f1_score(y_test, test_pred),
            "avg_precision": average_precision_score(y_test, test_proba),
        },
        threshold
    )

def evaluate_saved_models(
    model_paths,
    X_train, y_train,
    X_test, y_test,
    MODELS,
    FEATURE_SETS
):
    rows = []

    for name, path in model_paths.items():

        with open(path, "rb") as f:
            artifact = pickle.load(f)

        # =========================
        # ENSEMBLE HILL CLIMBING
        # =========================
        if isinstance(artifact, dict) and artifact.get("type") == "hill_climbing_ensemble":

            weights = artifact["weights"]
            threshold = artifact["threshold"]

            P_train, P_test, model_names = load_all_model_predictions(
                MODELS, FEATURE_SETS,
                X_train, y_train,
                X_test
            )

            train_proba = P_train @ weights
            test_proba = P_test @ weights

            train_pred = (train_proba >= threshold).astype(int)
            test_pred = (test_proba >= threshold).astype(int)

            train_scores = {
                "accuracy": accuracy_score(y_train, train_pred),
                "f1": f1_score(y_train, train_pred),
                "avg_precision": average_precision_score(y_train, train_proba),
            }

            test_scores = {
                "accuracy": accuracy_score(y_test, test_pred),
                "f1": f1_score(y_test, test_pred),
                "avg_precision": average_precision_score(y_test, test_proba),
            }

        # =========================
        # POJEDYNCZY MODEL
        # =========================
        else:
            train_scores, test_scores, threshold = evaluate_single_model(
                artifact,
                name,
                X_train, y_train,
                X_test, y_test,
                FEATURE_SETS
            )

        rows.append({
            "model": name,

            "train_f1": train_scores["f1"],
            "test_f1": test_scores["f1"],

            "train_accuracy": train_scores["accuracy"],
            "test_accuracy": test_scores["accuracy"],

            "train_avg_precision": train_scores["avg_precision"],
            "test_avg_precision": test_scores["avg_precision"],

            "threshold": threshold
        })

    return (
        pd.DataFrame(rows)
        .set_index("model")
        .sort_values("test_avg_precision", ascending=False)
    )



In [168]:
MODEL_PATHS = {
    "knn_all": "models/prob_f1/knn_all_prob.pkl",
    "knn_rfe": "models/prob_f1/knn_rfe_prob.pkl",

    "gb_all": "models/prob_f1/gb_all_prob.pkl",
    "gb_boruta": "models/prob_f1/gb_boruta_prob.pkl",
    "gb_correlation": "models/prob_f1/gb_correlation_prob.pkl",
    "gb_rfe": "models/prob_f1/gb_rfe_prob.pkl",

    "xgb_all": "models/prob_f1/xgb_all_prob.pkl",
    "xgb_correlation": "models/prob_f1/xgb_correlation_prob.pkl",
    "xgb_rfe": "models/prob_f1/xgb_rfe_prob.pkl",

    "cat_all": "models/prob_f1/cat_all_prob.pkl",
    "ensemble_hill": "models/hc/hc_ensemble.pkl",
    "ensemble_hill_2": "models/hc/hc_ensemble_v2.pkl"

}

results = evaluate_saved_models(
    MODEL_PATHS,
    X_train, y_train,
    X_test, y_test,
    MODELS=MODELS,
    FEATURE_SETS=FEATURE_SETS
)

results = results.sort_values("test_f1", ascending=False)
results

Unnamed: 0_level_0,train_f1,test_f1,train_accuracy,test_accuracy,train_avg_precision,test_avg_precision,threshold
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ensemble_hill,0.880435,0.879121,0.989237,0.989237,0.891292,0.887076,0.5
gb_all,0.851852,0.857143,0.986301,0.986301,0.859954,0.893677,0.27
ensemble_hill_2,0.837209,0.857143,0.984589,0.986301,0.881738,0.879983,0.5
gb_boruta,0.822917,0.833333,0.983366,0.984344,0.845302,0.873293,0.31
gb_correlation,0.831169,0.828283,0.9841,0.983366,0.837915,0.85313,0.28
xgb_correlation,0.808081,0.815534,0.981409,0.981409,0.805175,0.845301,0.33
gb_rfe,0.809278,0.808511,0.981898,0.982387,0.83177,0.866354,0.29
xgb_rfe,0.814458,0.8,0.981164,0.979452,0.871618,0.805838,0.12
knn_all,0.716102,0.777778,0.967221,0.976517,0.842722,0.839462,0.01
xgb_all,0.714286,0.745098,0.972603,0.97456,0.711145,0.751604,0.29


Wnioski:
- Wagi bardzo waźny hiperparametr
- Dobry ensembling zawsze polepsza wyniki

Discussion:
- Oversampling poprzez imbalanced learn
- Avarage precision score zamiast f1 dla recall > precision importance dla nie zbalansowanych klas
