In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from src.preprocessing import balance_dataset


from sklearn.model_selection import StratifiedKFold, cross_validate


from src.data import clean_data
from src.data import normalize_column_names
from src.features import feature_engineering

# 1. Load data

In [None]:
# Read the raw CSV and hand the frame straight to the project helper
# `normalize_column_names` in one chained expression.
df = pd.read_csv("../data/raw/diabetic_data.csv").pipe(normalize_column_names)

In [None]:
# Display the column index of the loaded frame as this cell's output.
df.columns

# 2. Data cleaning & Feature engineering

### Clean data

In [None]:
# Run the frame through the project's `clean_data` step and rebind `df` to the result.
df = df.pipe(clean_data)

### Feature engineering

In [None]:
# Run the cleaned frame through the project's `feature_engineering` step and
# rebind `df` to the result.
df = df.pipe(feature_engineering)

In [None]:
# Target vector: the `readmitted_30_days` column.
y = df["readmitted_30_days"]
# Feature matrix: every remaining column.
X = df.drop(columns=["readmitted_30_days"])

### Undersampling

In [None]:
# Report the row count before and after the project's `balance_dataset` helper
# rebinds X and y.
# NOTE(review): this runs before the train/validation split below, so the
# validation set and every CV fold are drawn from the balanced data rather than
# the original class distribution — confirm this is intended.
print(f"Number of rows before sampling: {X.shape[0]:,}")
X, y = balance_dataset(X, y)
print(f"Number of rows after sampling: {X.shape[0]:,}")

### Split

In [None]:
# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions aligned across both partitions; the fixed seed makes the split
# repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 3. Define Candidate Models

In [None]:
# Count the 0/1 labels in the training target; XGBoost's `scale_pos_weight` is
# set to the negative-to-positive ratio.
counter = Counter(y_train)
neg, pos = counter[0], counter[1]
if pos == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError
    # (e.g. when the target is not encoded as 0/1).
    raise ValueError(
        "y_train has no samples labelled 1; cannot compute scale_pos_weight"
    )
scale_pos_weight = neg / pos

# Candidate estimators keyed by display name. The tree ensembles get a fixed
# seed so repeated runs of the comparison produce the same scores.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1_000, class_weight="balanced"),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(
        eval_metric="logloss",
        scale_pos_weight=scale_pos_weight,
        random_state=42,
    ),
}

# 4. Build the pipeline

In [None]:
def build_pipeline(model, X):
    """Wrap ``model`` in a preprocessing + estimator pipeline shaped by ``X``'s dtypes.

    Numeric columns are median-imputed and standardised. Object, category and
    bool columns are imputed with the most frequent value and one-hot encoded.
    Columns of any other dtype are not routed to either branch.

    Parameters
    ----------
    model : estimator
        Estimator placed as the final ``"model"`` step.
    X : pandas.DataFrame
        Frame whose column dtypes decide the routing. Only the dtypes are
        inspected here; nothing is fitted.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps ``"preprocessor"`` and ``"model"``.
    """
    # "number" matches every numeric width (int8, uint8, float16, ...), not only
    # the 32/64-bit ones, so narrower numeric columns are no longer left out of
    # both branches. Timedelta is excluded explicitly because pandas treats it
    # as a numeric subtype.
    num_cols = X.select_dtypes(
        include=["number"], exclude=["timedelta"]
    ).columns.tolist()
    cat_cols = X.select_dtypes(include=["object", "category", "bool"]).columns.tolist()

    # Numeric branch: fill gaps with the median, then scale.
    numeric_pipeline = Pipeline(
        [("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
    )

    # Categorical branch: fill gaps with the mode, then one-hot encode.
    categorical_pipeline = Pipeline(
        [
            ("imputer", SimpleImputer(strategy="most_frequent")),
            (
                "encoder",
                OneHotEncoder(
                    handle_unknown="ignore", drop="first", sparse_output=False
                ),
            ),
        ]
    )

    preprocessor = ColumnTransformer(
        [("num", numeric_pipeline, num_cols), ("cat", categorical_pipeline, cat_cols)]
    )

    return Pipeline([("preprocessor", preprocessor), ("model", model)])

# 5. Cross-validation Setup

In [None]:
# Metrics reported for every candidate; each key is also the scorer name passed
# to scikit-learn.
scoring = {metric: metric for metric in ("roc_auc", "f1", "recall", "precision")}

# Five shuffled, stratified folds with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 6. Evaluate Each Model

In [None]:
# One summary record per candidate: the mean of each metric across the CV folds.
results = []

for name, model in models.items():
    print(f"----------\nTesting model: {name}")
    pipeline = build_pipeline(model, X_train)
    scores = cross_validate(
        pipeline,
        X_train,
        y_train,
        cv=cv,
        scoring=scoring,
        return_train_score=False,
    )

    # cross_validate stores each metric's per-fold scores under "test_<metric>".
    summary = {"model": name}
    summary.update(
        {
            metric: np.mean(scores[f"test_{metric}"])
            for metric in ("roc_auc", "f1", "recall", "precision")
        }
    )
    results.append(summary)

# Rank the candidates by mean F1, best first, and show the table.
df_results = pd.DataFrame(results).sort_values("f1", ascending=False)
df_results

# 7. Visualize Comparison

In [None]:
# Transpose so that each metric becomes one group of bars, with one bar per model.
df_plot = (
    df_results.set_index("model")
    .loc[:, ["roc_auc", "f1", "recall", "precision"]]
    .T
)

# Grouped vertical bar chart, configured through the Axes object pandas returns.
ax = df_plot.plot.bar(figsize=(10, 6))
ax.set_title("Model Comparison by Metric")
ax.set_ylabel("Score")
ax.tick_params(axis="x", labelrotation=0)
ax.set_ylim(0, 1)
ax.grid(axis="y", linestyle="--", alpha=0.7)
ax.legend(title="Model", bbox_to_anchor=(1.05, 1), loc="upper left")
ax.figure.tight_layout()
plt.show()