## Imports

In [559]:
import numpy as np
import os
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
from pandas.plotting import scatter_matrix
from scipy.stats import spearmanr
from scipy.cluster import hierarchy
from collections import defaultdict

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import confusion_matrix, f1_score, average_precision_score, classification_report, fbeta_score, accuracy_score
from sklearn.feature_selection import RFECV

from statsmodels.stats.outliers_influence import variance_inflation_factor

import optuna
import statsmodels.api as sm
from boruta import BorutaPy

import custom_map

In [None]:
import importlib

importlib.reload(custom_map)

## Data and feature engineering

In [None]:
data = pd.read_csv("healthcare-dataset-stroke-data.csv")
data.info()

In [None]:
target = "stroke"

categorical_features = data.select_dtypes(['object']).columns.tolist()
numerical_features = data.select_dtypes(['float64', 'int64']).columns.drop('id')
categorical_features.append('stroke')

print(categorical_features, '\n', numerical_features)

data.head()

In [None]:
data[numerical_features].hist(bins=30, figsize=(15, 10), color="teal", edgecolor='black')
plt.suptitle("Histogramy zmiennych numerycznych", fontsize=16)
plt.show()

In [None]:
bmi_median = data['bmi'].median()
bmi_mean = data['bmi'].mean()
print(bmi_median, bmi_mean)

data['bmi'] = data['bmi'].fillna(bmi_median)

In [None]:
nrows, ncols = 2, 3
fig, axes = plt.subplots(nrows, ncols, figsize=(16, 9))
axes = axes.flat

for ax, col in zip(axes, categorical_features):
    sizes = data[col].value_counts()
    labels = sizes.index.astype(str)

    ax.pie(sizes, labels=labels, autopct="%1.1f%%", startangle=90)
    ax.set_title(col)
    ax.axis("equal")

plt.tight_layout()
plt.show()

In [None]:
categorical_features.remove('stroke')
data = pd.get_dummies(data, columns=categorical_features, drop_first=True, dtype=float)
data = data.drop('id', axis=1)

In [None]:
correlation_data = data[numerical_features].corr()

n_features = data[numerical_features].shape[1]
n = data[numerical_features].size/data[numerical_features].shape[1]
custom_map.cmap_pearson(n_features, n, 0.1)
print(custom_map.cmap_pearson(n_features, n, 0.1))

plt.figure(figsize=(16, 16))
sns.heatmap(correlation_data, annot=True, fmt=".2f", cmap="bajon", vmin=-1, vmax=1) #tab20b
plt.tight_layout()
plt.show()

In [None]:
correlation_data = data.corr()

n_features = data[numerical_features].shape[1]
n = data[numerical_features].size/data[numerical_features].shape[1]
custom_map.cmap_pearson(n_features, n, 0.1)
print(custom_map.cmap_pearson(n_features, n, 0.1))

plt.figure(figsize=(16, 16))
sns.heatmap(correlation_data, annot=True, fmt=".2f", cmap="bajon", vmin=-1, vmax=1) #tab20b
plt.tight_layout()
plt.show()

In [None]:
X = data.drop(columns=[target])
y = data[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify=y, shuffle=True)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Feature selection

In [None]:
rf = RandomForestClassifier(
    n_estimators=500,
    n_jobs=-1,
    class_weight="balanced",
)

boruta = BorutaPy(
    estimator=rf,
    n_estimators='auto',   # lub liczba
    max_iter=100,
    alpha=0.05
)

boruta.fit(X_train, y_train)

selected_mask = boruta.support_
selected_features = np.where(selected_mask)[0]
print("Wybrane cechy (indeksy):", selected_features)
print("Ranking:", boruta.ranking_)

In [None]:
selected_indices = np.where(boruta.support_)[0]
boruta_features = X_train.columns[selected_indices].tolist()

boruta_features

In [None]:
corr_features = data.corr()[target].sort_values(ascending=False)
corr_features = corr_features[corr_features > custom_map.cmap_pearson(n_features, n, 0.1)['r_crit']].index.tolist()

corr_features.remove('stroke')
corr_features

In [None]:
data[corr_features].head()

In [None]:
X_chosen = data[corr_features].copy()
X_scaled = scaler.fit_transform(X_chosen)

kmeans = KMeans(
    n_clusters=3,
    random_state=42,
    n_init=20
)

clusters = kmeans.fit_predict(X_scaled)
X_chosen["cluster"] = clusters

X_chosen["cluster"].describe()

In [None]:
y_train.info()

In [None]:
sample_idx = np.random.choice(len(X_train), size=int(X_train.size/20), replace=False)

mi = mutual_info_regression(X_train.iloc[sample_idx], y_train.iloc[sample_idx])
mi_df = pd.DataFrame({"Feature": X_train.columns, "Mutual Information": mi})

plt.figure(figsize=(20, 20))
plt.barh(X_train.columns, mi)
plt.grid(False)
plt.xticks(rotation=90)
plt.title('Mutual Information')
plt.show()
mi_df



In [None]:
names = X_train.columns
# 1. Compute Spearman correlation and distance matrix
# Assuming X is your dataframe of explanatory variables
corr = spearmanr(X).correlation
# Ensure the matrix is symmetric (sometimes float errors occur)
corr = (corr + corr.T) / 2
np.fill_diagonal(corr, 1)

# Convert correlation to a distance matrix
dist_matrix = 1 - np.abs(corr)
dist_linkage = hierarchy.ward(hierarchy.distance.squareform(dist_matrix))

# 2. Visualize the Dendrogram
fig, ax = plt.subplots(figsize=(10, 6))
dendro = hierarchy.dendrogram(
    dist_linkage, labels=names, ax=ax, leaf_rotation=90
)
ax.set_title("Hierarchical Clustering Dendrogram (Feature Redundancy)")
plt.tight_layout()
plt.grid(False)
plt.show()

# 3. Select Features
# Threshold '1' is common for 1 - abs(corr), but you can adjust based on the plot
cluster_ids = hierarchy.fcluster(dist_linkage, t=1, criterion='distance')
cluster_id_to_feature_ids = defaultdict(list)

for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)

# Keep only the first feature from each cluster
selected_idx = [v[0] for v in cluster_id_to_feature_ids.values()]
mi_features = names[selected_idx]
X_reduced = X_train.iloc[:, selected_idx]


print(f"Original features: {X_train.shape[1]}")
print(f"Reduced features: {len(mi_features)}")
print(f"Selected: {mi_features}")

In [None]:
def calculate_vif(df: pd.DataFrame):
    vif_data = pd.DataFrame()
    vif_data["feature"] = df.columns

    vif_data["VIF"] = [variance_inflation_factor(df.values, i) for i in range(len(df.columns))]
    print(vif_data)

In [None]:
calculate_vif(X_train.drop(columns=["age", "bmi"]))

In [None]:
vif_features = X_train.drop(columns=["age", "bmi"]).columns.tolist()
vif_features

In [None]:
model = LogisticRegression(
    solver="lbfgs",
    max_iter=10000,
    class_weight="balanced",
    random_state=42
)

rfecv = RFECV(
    estimator=model,
    step=1,
    cv=StratifiedKFold(5),
    scoring="roc_auc",
    min_features_to_select=3
)

rfecv.fit(X_train_scaled, y_train)

rfe_features = X_train.columns[rfecv.support_].tolist()
ranking = pd.DataFrame({
    "feature": X_train.columns,
    "rank": rfecv.ranking_
}).sort_values("rank")

print("Liczba wybranych cech:", len(rfe_features))
print("Wybrane cechy:")
print(rfe_features)

In [None]:
plt.figure(figsize=(8, 5))
plt.plot(
    range(rfecv.min_features_to_select, len(rfecv.cv_results_["mean_test_score"]) + rfecv.min_features_to_select),
    rfecv.cv_results_["mean_test_score"]
)
plt.xlabel("Number of features")
plt.ylabel("CV ROC AUC")
plt.title("RFECV performance")
plt.grid(True)
plt.show()


## Model training

In [None]:
## hard coding because of stochasticity

all_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "gender_Male", "gender_Other", "ever_married_Yes",
                "work_type_Never_worked", "work_type_Private", "work_type_Self-employed",
                "work_type_children", "Residence_type_Urban",
                "smoking_status_formerly smoked", "smoking_status_never smoked",
                "smoking_status_smokes"]

boruta_features = ["age", "avg_glucose_level", "bmi"]

corr_features = ["age", "heart_disease", "avg_glucose_level", "hypertension",
                 "ever_married_Yes", "smoking_status_formerly smoked",
                 "work_type_Self-employed"]

mi_features = ["age", "hypertension", "gender_Other",
               "work_type_Private", "smoking_status_formerly smoked"]

rfe_features = ["age", "hypertension", "heart_disease", "avg_glucose_level",
                "bmi", "work_type_Never_worked", "work_type_children",
                "Residence_type_Urban", "smoking_status_never smoked",
                "smoking_status_smokes"]

FEATURE_SETS = {
    "all": all_features,
    "boruta": boruta_features,
    "correlation": corr_features,
    "mi": mi_features,
    "rfe": rfe_features
}

In [None]:
X_train_all = X_train[all_features] # all features
X_train_boruta = X_train[boruta_features].copy() # boruta selection
X_train_corr = X_train[corr_features].copy() # cmap correlation
X_train_mi = X_train[mi_features].copy() # correlation, mi & clustering
X_train_rfe = X_train[rfe_features].copy() # rfecv

data = {
    "Method": [
        "All features",
        "Boruta",
        "Corelation",
        "Mutual Information",
        "RFE"
    ],
    "Feature quantity": [
        len(all_features),
        len(boruta_features),
        len(corr_features),
        len(mi_features),
        len(rfe_features)
    ],
    "Feature name": [
        ", ".join(all_features),
        ", ".join(boruta_features),
        ", ".join(corr_features),
        ", ".join(mi_features),
        ", ".join(rfe_features)
    ]
}

pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", None)
pd.set_option("display.max_columns", None)
df = pd.DataFrame(data)
df

In [542]:
storage_url = "sqlite:///optuna_studies.db"
cv = StratifiedKFold(5, shuffle=True, random_state=42)

MODELS = [
    "logreg", "knn", "svm", "gnb", "dt",
    "rf", "ada", "gb", "extra",
    "lgbm", "xgb", "cat"
]

SMOTE_MODELS = {"logreg", "knn", "svm", "gnb"}

In [None]:
def sanitize_params(model, params):
    valid = model.get_params().keys()
    return {k: v for k, v in params.items() if k in valid}


In [None]:
def get_model_base(name, pos_weight=None):

    use_smote = name in SMOTE_MODELS

    def smote():
        return BorderlineSMOTE(
            sampling_strategy=0.3,
            random_state=42
        )

    if name == "logreg":
        steps = [("scaler", StandardScaler())]
        if use_smote:
            steps.append(("smote", smote()))
        steps.append(("clf", LogisticRegression(
            solver="saga",
            C=1.0,
            class_weight=None if use_smote else "balanced",
            max_iter=5000,
            random_state=42
        )))
        return ImbPipeline(steps)

    if name == "knn":
        steps = [("scaler", StandardScaler())]
        if use_smote:
            steps.append(("smote", smote()))
        steps.append(("clf", KNeighborsClassifier(
            n_neighbors=15,
            weights="distance"
        )))
        return ImbPipeline(steps)

    if name == "svm":
        steps = [("scaler", StandardScaler())]
        if use_smote:
            steps.append(("smote", smote()))
        steps.append(("clf", SVC(
            C=1.0,
            kernel="rbf",
            probability=True,
            class_weight=None if use_smote else "balanced",
            random_state=42
        )))
        return ImbPipeline(steps)

    if name == "gnb":
        steps = [("scaler", StandardScaler())]
        if use_smote:
            steps.append(("smote", smote()))
        steps.append(("clf", GaussianNB()))
        return ImbPipeline(steps)

    if name == "dt":
        return DecisionTreeClassifier(
            max_depth=6,
            min_samples_leaf=20,
            class_weight="balanced",
            random_state=42
        )

    if name == "rf":
        return RandomForestClassifier(
            n_estimators=500,
            min_samples_leaf=5,
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        )

    if name == "extra":
        return ExtraTreesClassifier(
            n_estimators=500,
            min_samples_leaf=5,
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        )

    if name == "ada":
        return AdaBoostClassifier(
            n_estimators=300,
            learning_rate=0.05,
            random_state=42
        )

    if name == "gb":
        return GradientBoostingClassifier(
            n_estimators=300,
            learning_rate=0.05,
            max_depth=3,
            random_state=42
        )

    if name == "lgbm":
        return LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            class_weight="balanced",
            random_state=42
        )

    if name == "xgb":
        return XGBClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            scale_pos_weight=pos_weight,
            eval_metric="logloss",
            tree_method="hist",
            random_state=42
        )

    if name == "cat":
        return CatBoostClassifier(
            iterations=500,
            depth=6,
            learning_rate=0.05,
            loss_function="Logloss",
            auto_class_weights="Balanced",
            verbose=False,
            random_seed=42
        )

    raise ValueError(name)

In [None]:
def get_model_search(trial, name):

    use_smote = name in SMOTE_MODELS

    def smote():
        return BorderlineSMOTE(
            sampling_strategy=trial.suggest_float("smote_ratio", 0.2, 0.5),
            random_state=42
        )

    if name == "logreg":
        steps = [("scaler", StandardScaler())]
        if use_smote:
            steps.append(("smote", smote()))
        steps.append(("clf", LogisticRegression(
            C=trial.suggest_float("clf__C", 1e-3, 10, log=True),
            l1_ratio=trial.suggest_float("clf__l1_ratio", 0.0, 1.0),
            solver="saga",
            class_weight=None,
            max_iter=5000,
            random_state=42
        )))
        return ImbPipeline(steps)

    if name == "knn":
        steps = [("scaler", StandardScaler())]
        if use_smote:
            steps.append(("smote", smote()))
        steps.append(("clf", KNeighborsClassifier(
            n_neighbors=trial.suggest_int("clf__n_neighbors", 3, 25),
            weights=trial.suggest_categorical("clf__weights", ["uniform", "distance"])
        )))
        return ImbPipeline(steps)

    if name == "svm":
        steps = [("scaler", StandardScaler())]
        if use_smote:
            steps.append(("smote", smote()))
        steps.append(("clf", SVC(
            C=trial.suggest_float("clf__C", 1e-2, 10, log=True),
            kernel="rbf",
            probability=True,
            class_weight=None,
            random_state=42
        )))
        return ImbPipeline(steps)

    if name == "gnb":
        steps = [("scaler", StandardScaler())]
        if use_smote:
            steps.append(("smote", smote()))
        steps.append(("clf", GaussianNB(
            var_smoothing=trial.suggest_float("clf__var_smoothing", 1e-12, 1e-8, log=True)
        )))
        return ImbPipeline(steps)

    if name == "dt":
        return DecisionTreeClassifier(
            max_depth=trial.suggest_int("max_depth", 2, 15),
            min_samples_split=trial.suggest_int("min_samples_split", 2, 20),
            class_weight="balanced",
            random_state=42
        )

    if name == "rf":
        return RandomForestClassifier(
            n_estimators=trial.suggest_int("n_estimators", 200, 600),
            max_depth=trial.suggest_int("max_depth", 3, 15),
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        )

    if name == "extra":
        return ExtraTreesClassifier(
            n_estimators=trial.suggest_int("n_estimators", 200, 600),
            max_depth=trial.suggest_int("max_depth", 3, 15),
            class_weight="balanced",
            n_jobs=-1,
            random_state=42
        )

    if name == "ada":
        return AdaBoostClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 400),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0),
            random_state=42
        )

    if name == "gb":
        return GradientBoostingClassifier(
            n_estimators=trial.suggest_int("n_estimators", 100, 400),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            max_depth=trial.suggest_int("max_depth", 2, 5),
            random_state=42
        )

    if name == "lgbm":
        return LGBMClassifier(
            n_estimators=trial.suggest_int("n_estimators", 200, 600),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            num_leaves=trial.suggest_int("num_leaves", 16, 64),
            class_weight="balanced",
            random_state=42
        )

    if name == "xgb":
        return XGBClassifier(
            n_estimators=trial.suggest_int("n_estimators", 200, 600),
            max_depth=trial.suggest_int("max_depth", 3, 10),
            learning_rate=trial.suggest_float("learning_rate", 0.01, 0.2),
            subsample=trial.suggest_float("subsample", 0.6, 1.0),
            colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
            eval_metric="logloss",
            tree_method="hist",
            random_state=42
        )

    raise ValueError(name)

In [None]:
def objective_catboost(trial, X, y):

    params = {
        "iterations": trial.suggest_int("iterations", 200, 600),
        "depth": trial.suggest_int("depth", 4, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "loss_function": "Logloss",
        "auto_class_weights": "Balanced",
        "verbose": False,
        "random_seed": 42
    }

    f1s = []

    for tr, va in cv.split(X, y):
        model = CatBoostClassifier(**params)
        model.fit(X.iloc[tr], y.iloc[tr])
        preds = model.predict(X.iloc[va])
        f1s.append(f1_score(y.iloc[va], preds))

    return np.mean(f1s)

In [518]:
def get_model_final(name, params):
    model = get_model_base(name)
    clean_params = sanitize_params(model, params)
    model.set_params(**clean_params)
    return model

In [519]:
def predict_f1(model, X, y, threshold=0.5):
    proba = model.predict_proba(X)[:, 1]
    y_pred = (proba >= threshold).astype(int)
    return f1_score(y, y_pred)

In [532]:
def find_best_f1_threshold(model, X, y, thresholds=np.linspace(0.01, 0.5, 50)):
    proba = model.predict_proba(X)[:, 1]

    best_f1 = 0.0
    best_thr = 0.5

    for t in thresholds:
        y_pred = (proba >= t).astype(int)
        f1 = f1_score(y, y_pred)

        if f1 > best_f1:
            best_f1 = f1
            best_thr = t

    return best_thr, best_f1


In [536]:
for feature_name, feature_list in FEATURE_SETS.items():

    X_train_sel = X_train[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]

        best_thr, best_f1 = find_best_f1_threshold(
            model, X_train_sel, y_train
        )

        artifact = {
            "model": model,
            "threshold": best_thr,
            "f1_train": best_f1
        }

        with open(f"models/{study_name}.pkl", "wb") as f:
            pickle.dump(artifact, f)

        print(
            f"SAVED {study_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_train={best_f1:.4f}"
        )


SAVED logreg_all_prob | thr=0.50 | F1_train=0.2265
SAVED knn_all_prob | thr=0.01 | F1_train=1.0000
SAVED svm_all_prob | thr=0.14 | F1_train=0.5455
SAVED gnb_all_prob | thr=0.37 | F1_train=0.1060
SAVED dt_all_prob | thr=0.44 | F1_train=0.2641
SAVED rf_all_prob | thr=0.50 | F1_train=0.3718
SAVED ada_all_prob | thr=0.42 | F1_train=0.2869
SAVED gb_all_prob | thr=0.27 | F1_train=0.9823
SAVED extra_all_prob | thr=0.50 | F1_train=0.2632
SAVED lgbm_all_prob | thr=0.50 | F1_train=0.5237
SAVED xgb_all_prob | thr=0.29 | F1_train=0.8615
SAVED cat_all_prob | thr=0.50 | F1_train=0.6746
SAVED logreg_boruta_prob | thr=0.50 | F1_train=0.2230
SAVED knn_boruta_prob | thr=0.01 | F1_train=0.4569
SAVED svm_boruta_prob | thr=0.13 | F1_train=0.2782
SAVED gnb_boruta_prob | thr=0.12 | F1_train=0.2437
SAVED dt_boruta_prob | thr=0.46 | F1_train=0.2326
SAVED rf_boruta_prob | thr=0.50 | F1_train=0.3449
SAVED ada_boruta_prob | thr=0.32 | F1_train=0.2901
SAVED gb_boruta_prob | thr=0.31 | F1_train=0.9671
SAVED extra_b

In [537]:
results = pd.DataFrame(
    index=MODELS,
    columns=FEATURE_SETS.keys(),
    dtype=float
)

for feature_name, feature_list in FEATURE_SETS.items():

    X_test_sel = X_test[feature_list]

    for model_name in MODELS:

        study_name = f"{model_name}_{feature_name}_prob"

        with open(f"models/{study_name}.pkl", "rb") as f:
            artifact = pickle.load(f)

        model = artifact["model"]
        best_thr = artifact["threshold"]

        proba_test = model.predict_proba(X_test_sel)[:, 1]
        y_pred_test = (proba_test >= best_thr).astype(int)

        f1 = f1_score(y_test, y_pred_test)
        results.loc[model_name, feature_name] = f1

        print(
            f"{model_name}_{feature_name} | "
            f"thr={best_thr:.2f} | "
            f"F1_test={f1:.4f}"
        )


logreg_all | thr=0.50 | F1_test=0.2331
knn_all | thr=0.01 | F1_test=0.1319
svm_all | thr=0.14 | F1_test=0.1101
gnb_all | thr=0.37 | F1_test=0.1075
dt_all | thr=0.44 | F1_test=0.2027
rf_all | thr=0.50 | F1_test=0.2739
ada_all | thr=0.42 | F1_test=0.2791
gb_all | thr=0.27 | F1_test=0.2222
extra_all | thr=0.50 | F1_test=0.2145
lgbm_all | thr=0.50 | F1_test=0.2827
xgb_all | thr=0.29 | F1_test=0.1923
cat_all | thr=0.50 | F1_test=0.2432
logreg_boruta | thr=0.50 | F1_test=0.2286
knn_boruta | thr=0.01 | F1_test=0.2072
svm_boruta | thr=0.13 | F1_test=0.3028
gnb_boruta | thr=0.12 | F1_test=0.2869
dt_boruta | thr=0.46 | F1_test=0.2137
rf_boruta | thr=0.50 | F1_test=0.2538
ada_boruta | thr=0.32 | F1_test=0.3140
gb_boruta | thr=0.31 | F1_test=0.1647
extra_boruta | thr=0.50 | F1_test=0.2433
lgbm_boruta | thr=0.50 | F1_test=0.2547
xgb_boruta | thr=0.34 | F1_test=0.1524
cat_boruta | thr=0.50 | F1_test=0.2857
logreg_correlation | thr=0.50 | F1_test=0.2115
knn_correlation | thr=0.01 | F1_test=0.2143
svm

In [538]:
results = results.sort_values(
    by=results.columns.tolist(),
    ascending=False
)

results

Unnamed: 0,all,boruta,correlation,mi,rfe
lgbm,0.282723,0.254658,0.248996,0.238227,0.296296
ada,0.27907,0.313953,0.263158,0.261438,0.290909
rf,0.273859,0.253846,0.203046,0.213592,0.253086
cat,0.243243,0.285714,0.233503,0.228739,0.247191
logreg,0.233062,0.228571,0.211494,0.223958,0.232432
gb,0.222222,0.164706,0.114943,0.130435,0.21978
extra,0.214493,0.243323,0.22973,0.222857,0.239316
dt,0.202667,0.21374,0.246809,0.232295,0.208
xgb,0.192308,0.152381,0.080808,0.109375,0.196721
knn,0.131868,0.207207,0.214286,0.189573,0.201923


# Hill-Climbing Ridge Ensemble

In [563]:
def load_all_model_predictions(
    MODELS, FEATURE_SETS,
    X_train, y_train,
    X_test
):
    train_preds = []
    test_preds = []
    model_names = []

    for feature_name, feature_list in FEATURE_SETS.items():
        X_train_sel = X_train[feature_list]
        X_test_sel = X_test[feature_list]

        for model_name in MODELS:
            study_name = f"{model_name}_{feature_name}_prob"

            with open(f"models/{study_name}.pkl", "rb") as f:
                artifact = pickle.load(f)

            model = artifact["model"]

            p_train = model.predict_proba(X_train_sel)[:, 1]
            p_test = model.predict_proba(X_test_sel)[:, 1]

            train_preds.append(p_train)
            test_preds.append(p_test)
            model_names.append(study_name)

    P_train = np.column_stack(train_preds)
    P_test = np.column_stack(test_preds)

    return P_train, P_test, model_names


In [564]:
def ensemble_f1_score(P, y, weights, threshold, alpha=0.01):
    ensemble_proba = P @ weights
    y_pred = (ensemble_proba >= threshold).astype(int)

    f1 = f1_score(y, y_pred)
    ridge_penalty = alpha * np.sum(weights ** 2)

    return f1 - ridge_penalty


In [565]:
def hill_climbing_ensemble(
    P_train, y_train,
    step=0.02,
    max_iter=200,
    alpha=0.01
):
    n_models = P_train.shape[1]
    weights = np.ones(n_models) / n_models

    best_threshold, _ = find_best_f1_threshold_dummy(P_train, y_train, weights)
    best_score = ensemble_f1_score(P_train, y_train, weights, best_threshold, alpha)

    for _ in range(max_iter):
        improved = False

        for i in range(n_models):
            for delta in [+step, -step]:
                new_weights = weights.copy()
                new_weights[i] += delta

                if new_weights[i] < 0:
                    continue

                new_weights /= new_weights.sum()

                thr, _ = find_best_f1_threshold_dummy(
                    P_train, y_train, new_weights
                )

                score = ensemble_f1_score(
                    P_train, y_train, new_weights, thr, alpha
                )

                if score > best_score:
                    weights = new_weights
                    best_score = score
                    best_threshold = thr
                    improved = True

        if not improved:
            break

    return weights, best_threshold, best_score


In [566]:
def find_best_f1_threshold_dummy(P, y, weights, thresholds=np.linspace(0.01, 0.5, 50)):
    ensemble_proba = P @ weights

    best_f1 = 0.0
    best_thr = 0.5

    for t in thresholds:
        y_pred = (ensemble_proba >= t).astype(int)
        f1 = f1_score(y, y_pred)

        if f1 > best_f1:
            best_f1 = f1
            best_thr = t

    return best_thr, best_f1


In [567]:
P_train, P_test, model_names = load_all_model_predictions(
    MODELS, FEATURE_SETS,
    X_train, y_train,
    X_test
)

weights, best_thr, train_score = hill_climbing_ensemble(
    P_train, y_train,
    step=0.02,
    alpha=0.01
)

y_test_pred = (P_test @ weights >= best_thr).astype(int)
f1_test = f1_score(y_test, y_test_pred)

print("ENSEMBLE RESULTS")
print("----------------")
print(f"F1 train: {train_score:.4f}")
print(f"F1 test : {f1_test:.4f}")
print(f"Threshold: {best_thr:.3f}")

for name, w in sorted(zip(model_names, weights), key=lambda x: -x[1]):
    if w > 0.01:
        print(f"{name:40s}  weight={w:.3f}")

os.makedirs("models/hc", exist_ok=True)

hc_path = "models/hc/hc_ensemble.pkl"

hc_artifact = {
    "type": "hill_climbing_ensemble",
    "weights": weights,
    "threshold": best_thr,
    "model_names": model_names,
    "f1_train": train_score
}

with open(hc_path, "wb") as f:
    pickle.dump(hc_artifact, f)

ENSEMBLE RESULTS
----------------
F1 train: 0.9972
F1 test : 0.1081
Threshold: 0.460
svm_all_prob                              weight=0.058
knn_all_prob                              weight=0.057
knn_rfe_prob                              weight=0.043
xgb_correlation_prob                      weight=0.042
gb_correlation_prob                       weight=0.041
gb_boruta_prob                            weight=0.040
gb_all_prob                               weight=0.039
knn_correlation_prob                      weight=0.029
ada_rfe_prob                              weight=0.029
svm_rfe_prob                              weight=0.028
dt_mi_prob                                weight=0.028
cat_rfe_prob                              weight=0.026
svm_boruta_prob                           weight=0.026
xgb_rfe_prob                              weight=0.025
lgbm_rfe_prob                             weight=0.025
gb_rfe_prob                               weight=0.025
xgb_all_prob                       

# Summary

In [574]:
def predict_with_threshold(model, X, threshold=0.5):
    proba = model.predict_proba(X)[:, 1]
    y_pred = (proba >= threshold).astype(int)
    return y_pred, proba

def evaluate_model(model, X, y, threshold=0.5):
    y_pred, proba = predict_with_threshold(model, X, threshold)

    return {
        "accuracy": accuracy_score(y, y_pred),
        "f1": f1_score(y, y_pred),
        "avg_precision": average_precision_score(y, proba)
    }

def evaluate_saved_models(
    model_paths,
    X_train, y_train,
    X_test, y_test
):
    rows = []

    for name, path in model_paths.items():

        with open(path, "rb") as f:
            artifact = pickle.load(f)

        if isinstance(artifact, dict) and artifact.get("type") == "hill_climbing_ensemble":

            weights = artifact["weights"]
            threshold = artifact["threshold"]

            P_train, P_test, _ = load_all_model_predictions(
                MODELS, FEATURE_SETS,
                X_train, y_train,
                X_test
            )

            train_proba = P_train @ weights
            train_pred = (train_proba >= threshold).astype(int)

            test_proba = P_test @ weights
            test_pred = (test_proba >= threshold).astype(int)

            train_scores = {
                "accuracy": accuracy_score(y_train, train_pred),
                "f1": f1_score(y_train, train_pred),
                "avg_precision": average_precision_score(y_train, train_proba)
            }

            test_scores = {
                "accuracy": accuracy_score(y_test, test_pred),
                "f1": f1_score(y_test, test_pred),
                "avg_precision": average_precision_score(y_test, test_proba)
            }

        else:
            if isinstance(artifact, dict):
                model = artifact["model"]
                threshold = artifact.get("threshold", 0.5)
            else:
                model = artifact
                threshold = 0.5

            train_scores = evaluate_model(
                model, X_train, y_train, threshold
            )

            test_scores = evaluate_model(
                model, X_test, y_test, threshold
            )

        rows.append({
            "model": name,

            "train_f1": train_scores["f1"],
            "test_f1": test_scores["f1"],

            "train_accuracy": train_scores["accuracy"],
            "test_accuracy": test_scores["accuracy"],

            "train_avg_precision": train_scores["avg_precision"],
            "test_avg_precision": test_scores["avg_precision"],

            "threshold": threshold
        })

    df = pd.DataFrame(rows).set_index("model")

    return df.sort_values("test_avg_precision", ascending=False)

In [575]:
MODEL_PATHS = {
    "rf_all": "models/prob_f1/rf_all_prob.pkl",
    "xgb_all": "models/prob_f1/xgb_all_prob.pkl",
    "cat_all": "models/prob_f1/cat_all_prob.pkl",
    "ensemble_hill": "models/hc/hc_ensemble.pkl"
}

results = evaluate_saved_models(
    MODEL_PATHS,
    X_train, y_train,
    X_test, y_test
)

results

FileNotFoundError: [Errno 2] No such file or directory: 'models/logreg_all_prob.pkl'

Wnioski:
- Oversampling poprzez imbalanced learn
- Wagi bardzo waÅºny hiperparametr
- Avarage precision score zamiast f1 dla recall > precision importance ale nie zbalansowane klasy
- Dobry ensembling zawsze polepsza wyniki

todo:
- ustawic zeby foldery z zapisanymi modelami byly kompatybilne do dynamicznego uzytku
- liczyc fb accuracy
- wytrenowac modele f1 oversampling i fb oversampling