In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from src.data import clean_data, normalize_column_names
from src.features import feature_engineering
from src.ml.pipelines import build_preprocess_pipeline
from src.preprocessing import balance_dataset

# 1. Load data

In [None]:
# Read the raw CSV and hand it straight to the project's column-name normalizer.
df = pd.read_csv("../data/raw/diabetic_data.csv").pipe(normalize_column_names)

In [None]:
# Display the column labels as returned by normalize_column_names.
df.columns

# 2. Data cleaning & Feature engineering

### Clean data

In [None]:
# Run the project's cleaning routine over the frame and rebind `df` to its result.
df = df.pipe(clean_data)

### Feature engineering

In [None]:
# Run the project's feature-engineering routine and rebind `df` to its result.
df = df.pipe(feature_engineering)

In [None]:
# Pull out the target column, then keep every remaining column as a predictor.
y = df["readmitted_30_days"]
X = df.drop(columns="readmitted_30_days")

### Balance dataset

In [None]:
# Report the row count on either side of the resampling step so its effect is visible.
# NOTE(review): resampling runs before the train/validation split further down, so the
# validation split is drawn from the already-resampled data — confirm this is intended.
print(f"Number of rows before sampling: {X.shape[0]:,}")
X, y = balance_dataset(X, y, strategy="undersample")
print(f"Number of rows after sampling: {X.shape[0]:,}")

### Split

In [None]:
# Hold out 20% of the rows for validation; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 3. Define Candidate Models

In [None]:
# Ratio of negative to positive training labels, passed to XGBoost as its
# positive-class weight. Assumes labels are encoded as 0 / 1 — TODO confirm.
counter = Counter(y_train)
neg, pos = counter[0], counter[1]
scale_pos_weight = neg / pos

# Candidate estimators keyed by display name. The tree ensembles are seeded with the
# same value (42) used by the train/validation split and the CV splitter, so repeated
# runs of the notebook compare the models on equal, reproducible footing.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1_000, class_weight="balanced"),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        scale_pos_weight=scale_pos_weight,
        random_state=42,
    ),
}

# 4. Cross-validation Setup

In [None]:
# Five shuffled, stratified folds with a fixed seed so every model sees the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Each metric is reported under the same name as the scikit-learn scorer string it uses.
scoring = {metric: metric for metric in ("roc_auc", "f1", "recall", "precision")}

# 5. Evaluate Each Model

In [None]:
# Cross-validate every candidate on the training split and record the mean of each
# metric across folds; one summary row is collected per model.
results = []

for name, model in models.items():
    print(f"----------\nTesting model: {name}")

    # A fresh preprocessing step is built for every candidate and chained with it.
    preprocessor = build_preprocess_pipeline(X_train)
    pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

    scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=scoring,
        return_train_score=False,
    )

    # The per-fold scores are read back from `scores` under "test_<metric>" keys.
    summary = {"model": name}
    for metric in scoring:
        summary[metric] = scores[f"test_{metric}"].mean()
    results.append(summary)

# Best F1 first.
df_results = pd.DataFrame(results).sort_values(by="f1", ascending=False)
df_results

# 6. Visualize Comparison

In [None]:
# Reshape so that metrics become the x-axis groups and each model becomes a bar series.
metric_columns = ["roc_auc", "f1", "recall", "precision"]
df_plot = df_results.set_index("model")[metric_columns].T

# Vertical bar chart grouped by metric, drawn through the Axes object pandas returns.
ax = df_plot.plot(kind="bar", figsize=(10, 6), rot=0)
ax.set_title("Model Comparison by Metric")
ax.set_ylabel("Score")
ax.set_ylim(0, 1)
ax.grid(axis="y", linestyle="--", alpha=0.7)
# Park the legend outside the plotting area so it never covers the bars.
ax.legend(title="Model", bbox_to_anchor=(1.05, 1), loc="upper left")
ax.figure.tight_layout()
plt.show()