# Imports

In [75]:
import numpy as np
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from pandas.plotting import scatter_matrix
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import confusion_matrix, f1_score, average_precision_score, classification_report, fbeta_score, accuracy_score
from sklearn.feature_selection import RFECV

from statsmodels.stats.outliers_influence import variance_inflation_factor

import optuna
import statsmodels.api as sm
from boruta import BorutaPy

import custom_map

In [76]:
import importlib

importlib.reload(custom_map)

<module 'custom_map' from '/Users/dominikmika/PycharmProjects/Ridge-hillclimbing/custom_map.py'>

# Setup

In [77]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
bmi_median = data['bmi'].median()
data['bmi'] = data['bmi'].fillna(bmi_median)
data.info()

categorical_features = data.select_dtypes(['object']).columns.tolist()
numerical_features = data.select_dtypes(['float64', 'int64']).columns.drop('id')

data = pd.get_dummies(data, columns=categorical_features, drop_first=True, dtype=float)
data = data.drop('id', axis=1)

target = "stroke"

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [78]:
X = data.drop(columns=[target])
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, shuffle=True)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [79]:
## hard coding because of stochasticity

all_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "gender_Male", "gender_Other", "ever_married_Yes",
                "work_type_Never_worked", "work_type_Private", "work_type_Self-employed",
                "work_type_children", "Residence_type_Urban",
                "smoking_status_formerly smoked", "smoking_status_never smoked",
                "smoking_status_smokes"]

boruta_features = ["age", "avg_glucose_level", "bmi"]

corr_features = ["age", "heart_disease", "avg_glucose_level", "hypertension",
                 "ever_married_Yes", "smoking_status_formerly smoked",
                 "work_type_Self-employed"]

mi_features = ["age", "hypertension", "gender_Other",
               "work_type_Private", "smoking_status_formerly smoked"]

rfe_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "work_type_Never_worked", "work_type_children",
                "Residence_type_Urban", "smoking_status_never smoked",
                "smoking_status_smokes"]

FEATURE_SETS = {
    "all": all_features,
    "boruta": boruta_features,
    "correlation": corr_features,
    "mi": mi_features,
    "rfe": rfe_features
}

In [80]:
storage_url = "sqlite:///optuna_studies.db"
cv = StratifiedKFold(5, shuffle=True, random_state=42)

MODELS = [
    "logreg", "knn", "svm", "gnb", "dt",
    "rf", "ada", "gb", "extra",
    "lgbm", "xgb", "cat"
]

SMOTE_MODELS = {"logreg", "knn", "svm", "gnb"}

# Results

In [81]:
def find_best_f1_threshold(model, X, y, thresholds=np.linspace(0.01, 0.5, 50)):
    proba = model.predict_proba(X)[:, 1]

    best_f1 = 0.0
    best_thr = 0.5

    for t in thresholds:
        y_pred = (proba >= t).astype(int)
        f1 = f1_score(y, y_pred)

        if f1 > best_f1:
            best_f1 = f1
            best_thr = t

    return best_thr, best_f1

In [82]:
for feature_name, feature_list in FEATURE_SETS.items():

    X_train_sel = X_train[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/prob_f1/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]

        best_thr, best_f1 = find_best_f1_threshold(
            model, X_train_sel, y_train
        )

        artifact = {
            "model": model,
            "threshold": best_thr,
            "f1_train": best_f1
        }

        with open(f"models/{study_name}.pkl", "wb") as f:
            pickle.dump(artifact, f)

        print(
            f"SAVED {study_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_train={best_f1:.4f}"
        )

SAVED logreg_all_prob | thr=0.50 | F1_train=0.2311
SAVED knn_all_prob | thr=0.50 | F1_train=0.8472
SAVED svm_all_prob | thr=0.14 | F1_train=0.4686
SAVED gnb_all_prob | thr=0.04 | F1_train=0.1064
SAVED dt_all_prob | thr=0.44 | F1_train=0.2534
SAVED rf_all_prob | thr=0.50 | F1_train=0.3492
SAVED ada_all_prob | thr=0.42 | F1_train=0.2833
SAVED gb_all_prob | thr=0.36 | F1_train=0.8556
SAVED extra_all_prob | thr=0.50 | F1_train=0.2531
SAVED lgbm_all_prob | thr=0.50 | F1_train=0.4735
SAVED xgb_all_prob | thr=0.29 | F1_train=0.7273
SAVED cat_all_prob | thr=0.50 | F1_train=0.5826
SAVED logreg_boruta_prob | thr=0.50 | F1_train=0.2253
SAVED knn_boruta_prob | thr=0.01 | F1_train=0.4060
SAVED svm_boruta_prob | thr=0.14 | F1_train=0.2857
SAVED gnb_boruta_prob | thr=0.12 | F1_train=0.2593
SAVED dt_boruta_prob | thr=0.46 | F1_train=0.2283
SAVED rf_boruta_prob | thr=0.49 | F1_train=0.3241
SAVED ada_boruta_prob | thr=0.32 | F1_train=0.2956
SAVED gb_boruta_prob | thr=0.34 | F1_train=0.8254
SAVED extra_b

In [83]:
results = pd.DataFrame(
    index=MODELS,
    columns=FEATURE_SETS.keys(),
    dtype=float
)

for feature_name, feature_list in FEATURE_SETS.items():

    X_test_sel = X_test[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]
        best_thr = artifact["threshold"]

        proba_test = model.predict_proba(X_test_sel)[:, 1]
        y_pred_test = (proba_test >= best_thr).astype(int)

        f1 = f1_score(y_test, y_pred_test)
        results.loc[model_name, feature_name] = f1

        print(
            f"{model_name}_{feature_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_test={f1:.4f}"
        )

logreg_all | thr=0.50 | F1_test=0.2149
knn_all | thr=0.50 | F1_test=0.8958
svm_all | thr=0.14 | F1_test=0.4925
gnb_all | thr=0.04 | F1_test=0.1058
dt_all | thr=0.44 | F1_test=0.2440
rf_all | thr=0.50 | F1_test=0.3621
ada_all | thr=0.42 | F1_test=0.2941
gb_all | thr=0.36 | F1_test=0.8750
extra_all | thr=0.50 | F1_test=0.2523
lgbm_all | thr=0.50 | F1_test=0.4821
xgb_all | thr=0.29 | F1_test=0.6939
cat_all | thr=0.50 | F1_test=0.6093
logreg_boruta | thr=0.50 | F1_test=0.2198
knn_boruta | thr=0.01 | F1_test=0.4072
svm_boruta | thr=0.14 | F1_test=0.2786
gnb_boruta | thr=0.12 | F1_test=0.2222
dt_boruta | thr=0.46 | F1_test=0.2308
rf_boruta | thr=0.49 | F1_test=0.3221
ada_boruta | thr=0.32 | F1_test=0.2907
gb_boruta | thr=0.34 | F1_test=0.8511
extra_boruta | thr=0.50 | F1_test=0.2771
lgbm_boruta | thr=0.50 | F1_test=0.3141
xgb_boruta | thr=0.34 | F1_test=0.5400
cat_boruta | thr=0.50 | F1_test=0.4700
logreg_correlation | thr=0.50 | F1_test=0.1991
knn_correlation | thr=0.01 | F1_test=0.4229
svm

In [84]:
results = results.sort_values(
    by=results.columns.tolist(),
    ascending=False
)

results

Unnamed: 0,all,boruta,correlation,mi,rfe
knn,0.895833,0.40724,0.422907,0.28125,0.884211
gb,0.875,0.851064,0.84,0.391892,0.808511
xgb,0.693878,0.54,0.851485,0.366667,0.886598
cat,0.609272,0.47,0.445498,0.263804,0.54023
svm,0.492537,0.278607,0.35503,0.271357,0.390244
lgbm,0.482051,0.314103,0.401674,0.246499,0.502618
rf,0.362069,0.322097,0.412371,0.274247,0.289474
ada,0.294118,0.290698,0.279793,0.321429,0.25731
extra,0.252308,0.277108,0.289753,0.248485,0.230088
dt,0.244032,0.230769,0.348178,0.236994,0.238095


In [85]:
def load_all_model_predictions(
    MODELS, FEATURE_SETS,
    X_train, y_train,
    X_test
):
    train_preds = []
    test_preds = []
    model_names = []

    for feature_name, feature_list in FEATURE_SETS.items():
        X_train_sel = X_train[feature_list]
        X_test_sel = X_test[feature_list]

        for model_name in MODELS:
            study_name = f"{model_name}_{feature_name}_prob"

            with open(f"models/{study_name}.pkl", "rb") as f:
                artifact = pickle.load(f)

            model = artifact["model"]

            p_train = model.predict_proba(X_train_sel)[:, 1]
            p_test = model.predict_proba(X_test_sel)[:, 1]

            train_preds.append(p_train)
            test_preds.append(p_test)
            model_names.append(study_name)

    P_train = np.column_stack(train_preds)
    P_test = np.column_stack(test_preds)

    return P_train, P_test, model_names


In [86]:
def predict_with_threshold(model, X, threshold=0.5):
    proba = model.predict_proba(X)[:, 1]
    y_pred = (proba >= threshold).astype(int)
    return y_pred, proba

def evaluate_model(model, X, y, threshold=0.5):
    y_pred, proba = predict_with_threshold(model, X, threshold)

    return {
        "accuracy": accuracy_score(y, y_pred),
        "f1": f1_score(y, y_pred),
        "avg_precision": average_precision_score(y, proba)
    }

def evaluate_saved_models(
    model_paths,
    X_train, y_train,
    X_test, y_test
):
    rows = []

    for name, path in model_paths.items():

        with open(path, "rb") as f:
            artifact = pickle.load(f)

        if isinstance(artifact, dict) and artifact.get("type") == "hill_climbing_ensemble":

            weights = artifact["weights"]
            threshold = artifact["threshold"]

            P_train, P_test, _ = load_all_model_predictions(
                MODELS, FEATURE_SETS,
                X_train, y_train,
                X_test
            )

            train_proba = P_train @ weights
            train_pred = (train_proba >= threshold).astype(int)

            test_proba = P_test @ weights
            test_pred = (test_proba >= threshold).astype(int)

            train_scores = {
                "accuracy": accuracy_score(y_train, train_pred),
                "f1": f1_score(y_train, train_pred),
                "avg_precision": average_precision_score(y_train, train_proba)
            }

            test_scores = {
                "accuracy": accuracy_score(y_test, test_pred),
                "f1": f1_score(y_test, test_pred),
                "avg_precision": average_precision_score(y_test, test_proba)
            }

        else:
            if isinstance(artifact, dict):
                model = artifact["model"]
                threshold = artifact.get("threshold", 0.5)
            else:
                model = artifact
                threshold = 0.5

            train_scores = evaluate_model(
                model, X_train, y_train, threshold
            )

            test_scores = evaluate_model(
                model, X_test, y_test, threshold
            )

        rows.append({
            "model": name,

            "train_f1": train_scores["f1"],
            "test_f1": test_scores["f1"],

            "train_accuracy": train_scores["accuracy"],
            "test_accuracy": test_scores["accuracy"],

            "train_avg_precision": train_scores["avg_precision"],
            "test_avg_precision": test_scores["avg_precision"],

            "threshold": threshold
        })

    df = pd.DataFrame(rows).set_index("model")

    return df.sort_values("test_avg_precision", ascending=False)

In [87]:
MODEL_PATHS = {
    "knn_all": "models/prob_f1/knn_all_prob.pkl",
    "knn_rfe": "models/prob_f1/knn_rfe_prob.pkl",

    "gb_all": "models/prob_f1/gb_all_prob.pkl",
    "gb_boruta": "models/prob_f1/gb_boruta_prob.pkl",
    "gb_correlation": "models/prob_f1/gb_correlation_prob.pkl",
    "gb_rfe": "models/prob_f1/gb_rfe_prob.pkl",

    "xgb_all": "models/prob_f1/xgb_all_prob.pkl",
    "xgb_correlation": "models/prob_f1/xgb_correlation_prob.pkl",
    "xgb_rfe": "models/prob_f1/xgb_rfe_prob.pkl",

    "cat_all": "models/prob_f1/cat_all_prob.pkl",
    "ensemble_hill": "models/hc/hc_ensemble.pkl",
    "ensemble_hill_2": "models/hc/hc_ensemble_v2.pkl"
}

results = evaluate_saved_models(
    MODEL_PATHS,
    X_train, y_train,
    X_test, y_test
)

results

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- ever_married_Yes
- gender_Male
- gender_Other
- smoking_status_formerly smoked
- work_type_Private
- ...


Wnioski:
- Wagi bardzo waÅºny hiperparametr
- Dobry ensembling zawsze polepsza wyniki

Discussion:
- Oversampling poprzez imbalanced learn
- Avarage precision score zamiast f1 dla recall > precision importance dla nie zbalansowanych klas
