In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Download the "heart" dataset (version 1) from OpenML as a pandas feature
# frame X and a target series y.
X, y = fetch_openml(
    name="heart",
    version=1,
    as_frame=True,
    return_X_y=True,
)

# Cast the target to integers.
# NOTE(review): assumes the labels are numeric or numeric-like -- confirm
# against the OpenML dataset description.
y = y.astype(int)

# Hold out 20% of the rows for testing. The split is stratified on y and
# seeded so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Partition the feature columns by dtype: numeric ones vs. everything else.
num_cols = X_train.select_dtypes(include="number").columns
cat_cols = X_train.select_dtypes(exclude="number").columns

# Numeric columns: fill missing values with the column median.
numeric_pipe = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Non-numeric columns: fill missing values with the most frequent value,
# then one-hot encode into a dense array. Categories not seen during fit
# are ignored at transform time instead of raising.
categorical_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "onehot",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
        ),
    ]
)

# Route each column group through its own sub-pipeline.
preprocess = ColumnTransformer(
    [
        ("num", numeric_pipe, num_cols),
        ("cat", categorical_pipe, cat_cols),
    ]
)

# Baseline model: logistic regression on the preprocessed features, allowed
# up to 2000 solver iterations.
baseline = Pipeline(steps=[
    ("prep", preprocess),
    ("model", LogisticRegression(max_iter=2000)),
])

# Gradient-boosting model: histogram-based boosting with shallow trees
# (depth 3), a 0.05 learning rate, and at most 400 boosting iterations with
# early stopping enabled.
#
# This pipeline gets its own unfitted copy of the preprocessor. Pipeline.fit
# fits its steps in place, so reusing the very same `preprocess` object here
# would make fitting one pipeline overwrite the fitted state used by the
# other. `baseline` keeps the original `preprocess` instance.
gb = Pipeline(steps=[
    ("prep", clone(preprocess)),
    ("model", HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=3,
        max_iter=400,
        early_stopping=True,
        random_state=42,
    )),
])

# Train both pipelines on the same training split.
baseline.fit(X_train, y_train)
gb.fit(X_train, y_train)

# Score each fitted pipeline on the held-out test split and print accuracy,
# weighted F1, and the per-class classification report.
for name, model in (
    ("Baseline(LogReg)", baseline),
    ("GradientBoosting(HistGB)", gb),
):
    pred = model.predict(X_test)
    acc = accuracy_score(y_test, pred)
    f1w = f1_score(y_test, pred, average="weighted")

    print("\n===", name, "===")
    print("Accuracy:", round(acc, 4))
    print("F1 weighted:", round(f1w, 4))
    print(classification_report(y_test, pred, digits=4))


