# In [None]:
# ==============================================================
# Gender-split XGBoost + BMI + OHE + EarlyStopping
# Optuna CV + CLASS WEIGHTS for Overweight I & II
# GPU-enabled (gpu_hist) with safe CPU fallback
# Prints trial/fold progress and GPU usage
# Uses: train_combined.csv  → submission.csv
# ==============================================================

import os
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score

import xgboost as xgb
import optuna
from contextlib import suppress

# ---------------- Config ----------------
# Input files, read with pandas in the load section below.
TRAIN_PATH, TEST_PATH = "train_combined.csv", "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"

# Seed for the final CV split and the final boosters.
RANDOM_STATE = 42
# Forwarded to XGBoost as "nthread".
N_JOBS = -1
# Number of stratified folds used in tuning and in the final fit.
N_FOLDS = 5
# Passed to xgb.train as early_stopping_rounds.
EARLY_STOP = 200

# Tuning budget: Optuna trials per gender group.
TRIALS_MALE = TRIALS_FEMALE = 30

# Multi-seed CV: one shuffled stratified split per seed during tuning.
CV_SEEDS = [42, 2027, 1337]

# >>> Class weights for Overweight classes <<<
# W_OVR1 applies to Overweight Level I, W_OVR2 to Overweight Level II.
W_OVR1 = W_OVR2 = 1.6

# >>> Progress / logging <<<
# Print the eval metric every N boosting rounds (set 0 to silence).
PRINT_EVERY = 200
# Echo the sampled hyper-parameters at the start of each trial.
SHOW_PARAMS_PER_TRIAL = True

# >>> Prefer GPU if available <<<
# Set False to force CPU.
USE_GPU = True

# Run-wide record: whether GPU was requested and whether any fit used it.
GPU_STATUS = dict(requested=USE_GPU, used_any=False)

def want_gpu_params():
    """Return a fresh dict of XGBoost overrides that request GPU training.

    The ``device`` entry is honored by XGBoost 2.x and ignored by older
    releases.
    """
    gpu_overrides = dict(tree_method="gpu_hist", predictor="gpu_predictor")
    gpu_overrides["device"] = "cuda"
    return gpu_overrides

def apply_device(params, want_gpu=True):
    """Return a copy of *params* with the device-related keys overridden.

    Args:
        params: Base XGBoost parameters; left unmodified.
        want_gpu: When true, merge in ``want_gpu_params()``; otherwise force
            the CPU ``hist`` tree method with ``predictor="auto"``.

    Returns:
        A new dict containing *params* plus the device overrides.
    """
    merged = dict(params)
    overrides = (
        want_gpu_params()
        if want_gpu
        else {"tree_method": "hist", "predictor": "auto"}
    )
    merged.update(overrides)
    return merged

def make_dmatrix(X, y=None, weight=None, use_gpu=False):
    """Build an XGBoost data container for *X*.

    With ``use_gpu`` set, ``xgb.DeviceQuantileDMatrix`` is attempted first;
    any exception from that attempt is deliberately swallowed and the
    function falls back to a plain ``xgb.DMatrix``.

    Args:
        X: Feature matrix.
        y: Optional labels.
        weight: Optional per-row weights.
        use_gpu: Whether to attempt the device-side container first.
    """
    data_kwargs = {"label": y, "weight": weight}
    if not use_gpu:
        return xgb.DMatrix(X, **data_kwargs)
    # Best-effort: covers both a missing class and a failing constructor.
    with suppress(Exception):
        return xgb.DeviceQuantileDMatrix(X, **data_kwargs)
    return xgb.DMatrix(X, **data_kwargs)

def has_gpu(params):
    """Return True when *params* selects the ``gpu_hist`` tree method."""
    tree_method = params.get("tree_method")
    return tree_method == "gpu_hist"

# ---------------- Helpers ----------------
def add_bmi(df):
    """Add a ``BMI`` column to *df* in place and return *df*.

    Nothing happens unless both ``Weight`` and ``Height`` are present.
    Heights are treated as centimetres (and divided by 100) when their
    median exceeds 3.0. The result has infinities replaced by NaN and is
    clipped to the range [10, 80].
    """
    if not ("Weight" in df.columns and "Height" in df.columns):
        return df

    height = df["Height"].astype(float)
    looks_like_cm = height.median() > 3.0
    # Plain ndarray, so the division below keeps the Weight series' index.
    height_m = np.where(looks_like_cm, height / 100.0, height)

    weight = df["Weight"].astype(float)
    with np.errstate(divide="ignore", invalid="ignore"):
        raw_bmi = weight / (height_m ** 2 + 1e-12)

    cleaned = pd.Series(raw_bmi).replace([np.inf, -np.inf], np.nan)
    df["BMI"] = cleaned.clip(10, 80)
    return df

def make_preprocessor(num_cols, cat_cols):
    """Build an unfitted ColumnTransformer for the given column lists.

    Numeric columns get median imputation followed by scaling without
    centering. Categorical columns get most-frequent imputation followed by
    one-hot encoding that ignores unknown categories. All other columns are
    dropped.

    Args:
        num_cols: Names of the numeric columns.
        cat_cols: Names of the categorical columns.
    """
    # Try the ``sparse_output`` keyword first; fall back to ``sparse`` when
    # the installed OneHotEncoder rejects it with TypeError.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=True)

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", encoder),
    ]
    branches = [
        ("num", Pipeline(numeric_steps), num_cols),
        ("cat", Pipeline(categorical_steps), cat_cols),
    ]
    return ColumnTransformer(
        transformers=branches,
        remainder="drop",
        sparse_threshold=1.0,
    )

def detect_gender(df):
    """Return the first column of *df* named ``gender`` or ``sex``.

    The comparison is case-insensitive.

    Raises:
        ValueError: If no column matches.
    """
    accepted = ("gender", "sex")
    found = next((col for col in df.columns if col.lower() in accepted), None)
    if found is None:
        raise ValueError("Could not detect gender column (expected Gender or Sex).")
    return found

def infer_cols(df):
    """Split the columns of *df* by dtype.

    Returns:
        ``(num_cols, cat_cols)``: names of the numeric columns, then names
        of the object, category and bool columns.
    """
    numeric_frame = df.select_dtypes(include=[np.number])
    categorical_frame = df.select_dtypes(include=["object", "category", "bool"])
    return numeric_frame.columns.tolist(), categorical_frame.columns.tolist()

def norm_label(s: str) -> str:
    """Lower-case ``str(s)`` and drop every non-alphanumeric character."""
    lowered = str(s).lower()
    return "".join(filter(str.isalnum, lowered))

# Accepted spellings for the two overweight classes. build_class_weights()
# compares these against norm_label() output, which contains only lower-case
# alphanumerics, so the underscore spellings can never match there.
OVERWEIGHT1_ALIASES = {
    "overweighti",
    "overweightlevel1",
    "overweightleveli",
    "overweight_level_i",
}
OVERWEIGHT2_ALIASES = {
    "overweightii",
    "overweightlevel2",
    "overweightlevelii",
    "overweight_level_ii",
}

def build_class_weights(y_series: pd.Series, w1=W_OVR1, w2=W_OVR2):
    """Return one float32 sample weight per label in *y_series*.

    Labels are normalized with ``norm_label``. Those found in
    ``OVERWEIGHT1_ALIASES`` get weight *w1*, those found in
    ``OVERWEIGHT2_ALIASES`` get weight *w2*, and all others keep weight 1.

    Args:
        y_series: Original (string) class labels.
        w1: Weight for Overweight Level I rows.
        w2: Weight for Overweight Level II rows.
    """
    normalized = y_series.astype(str).map(norm_label)
    is_level_1 = normalized.isin(OVERWEIGHT1_ALIASES).values
    is_level_2 = normalized.isin(OVERWEIGHT2_ALIASES).values

    weights = np.ones(len(y_series), dtype=np.float32)
    weights[is_level_1] = float(w1)
    # Assigned second, so w2 would win for a label present in both sets.
    weights[is_level_2] = float(w2)
    return weights

def suggest_params(trial):
    """Sample one hyper-parameter set from *trial*.

    Returns:
        A dict with the booster parameters plus ``num_boost_round``. Keys
        are sampled in the order they appear in the returned dict.
    """
    sample_float = trial.suggest_float
    sample_int = trial.suggest_int

    space = {}
    space["eta"] = sample_float("eta", 0.02, 0.06, log=True)
    space["max_depth"] = sample_int("max_depth", 5, 8)
    space["min_child_weight"] = sample_float("min_child_weight", 1.0, 4.5)
    space["subsample"] = sample_float("subsample", 0.75, 0.95)
    space["colsample_bytree"] = sample_float("colsample_bytree", 0.70, 0.95)
    space["reg_lambda"] = sample_float("reg_lambda", 0.8, 2.5, log=True)
    space["reg_alpha"] = sample_float("reg_alpha", 0.0, 0.6)
    space["gamma"] = sample_float("gamma", 0.0, 0.4)
    space["max_delta_step"] = sample_float("max_delta_step", 0.0, 3.0)
    space["num_boost_round"] = sample_int("num_boost_round", 2000, 12000, step=1000)
    return space

def _verbose_eval():
    """Translate ``PRINT_EVERY`` into the ``verbose_eval`` argument.

    Returns ``False`` (silent) when ``PRINT_EVERY`` is zero or negative,
    otherwise ``PRINT_EVERY`` itself, meaning "print every N rounds".
    """
    if PRINT_EVERY <= 0:
        return False
    return PRINT_EVERY

# ---------------- Tuning & Training (uses weights) ----------------
def tune_group(Xg, yg_enc, y_labels_grp, label, num_class, trials):
    """Tune XGBoost hyper-parameters for one group with Optuna.

    Each trial is scored by validation accuracy averaged over
    ``len(CV_SEEDS)`` shuffled stratified ``N_FOLDS``-fold splits (one per
    seed). Training rows carry the weights from ``build_class_weights``;
    validation rows are unweighted. The preprocessor is re-fit on the
    training part of every fold.

    The study's sampler and the boosters are seeded with ``RANDOM_STATE``
    so that repeated runs explore the same trials, matching the seed that
    ``train_cv_predict`` uses for the final fit.

    Args:
        Xg: Feature DataFrame for the group.
        yg_enc: Integer-encoded targets, positionally aligned with ``Xg``.
        y_labels_grp: Original labels aligned with ``Xg``; used only to
            derive the sample weights.
        label: Tag prefixed to every progress line.
        num_class: Number of classes given to XGBoost.
        trials: Number of Optuna trials to run.

    Returns:
        ``study.best_params``: the values suggested in the best trial,
        including ``num_boost_round``.

    Raises:
        xgboost.core.XGBoostError: If training fails while already running
            on CPU (there is nothing left to fall back to).
    """
    num_cols, cat_cols = infer_cols(Xg)
    weights_grp = build_class_weights(y_labels_grp)

    # Sticky for the whole study: after the first GPU failure every later
    # fit goes straight to CPU instead of failing and warning again.
    gpu_enabled = bool(USE_GPU)

    def objective(trial):
        nonlocal gpu_enabled

        hp = suggest_params(trial)
        if SHOW_PARAMS_PER_TRIAL:
            print(f"\n[{label}] Trial {trial.number+1}/{trials} | "
                  f"eta={hp['eta']:.4f}, depth={hp['max_depth']}, "
                  f"mcw={hp['min_child_weight']:.2f}, subsample={hp['subsample']:.2f}, "
                  f"colsample={hp['colsample_bytree']:.2f}, rounds={hp['num_boost_round']}")

        scores_across_seeds = []
        base_params = {
            "objective": "multi:softprob",
            "num_class": num_class,
            "eval_metric": "mlogloss",
            "eta": hp["eta"],
            "max_depth": int(hp["max_depth"]),
            "min_child_weight": float(hp["min_child_weight"]),
            "subsample": float(hp["subsample"]),
            "colsample_bytree": float(hp["colsample_bytree"]),
            "reg_lambda": float(hp["reg_lambda"]),
            "reg_alpha": float(hp["reg_alpha"]),
            "gamma": float(hp["gamma"]),
            "max_delta_step": float(hp["max_delta_step"]),
            "nthread": N_JOBS,
            # Same booster seed as the final fit in train_cv_predict.
            "seed": RANDOM_STATE,
            "verbosity": 0,
        }

        for cv_seed in CV_SEEDS:
            skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=cv_seed)
            fold_scores = []

            for fold_id, (tr_idx, va_idx) in enumerate(skf.split(Xg, yg_enc), start=1):
                print(f"[{label}] Trial {trial.number+1}/{trials} • Seed {cv_seed} • Fold {fold_id}/{N_FOLDS}")
                X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
                y_tr, y_va = yg_enc[tr_idx], yg_enc[va_idx]
                w_tr = weights_grp[tr_idx]

                # Fit the preprocessing on the training fold only.
                pre = make_preprocessor(num_cols, cat_cols)
                Xtr = pre.fit_transform(X_tr)
                Xva = pre.transform(X_va)

                params_try = apply_device(base_params, want_gpu=gpu_enabled)
                use_gpu = has_gpu(params_try)

                dtrain = make_dmatrix(Xtr, y_tr, w_tr, use_gpu)
                dvalid = make_dmatrix(Xva, y_va, None, use_gpu)

                try:
                    bst = xgb.train(
                        params=params_try,
                        dtrain=dtrain,
                        num_boost_round=hp["num_boost_round"],
                        evals=[(dtrain, "train"), (dvalid, "valid")],
                        early_stopping_rounds=EARLY_STOP,
                        verbose_eval=_verbose_eval()
                    )
                except xgb.core.XGBoostError as e:
                    if not use_gpu:
                        # The failing attempt was already CPU: retrying the
                        # identical configuration cannot help.
                        raise
                    print(f"[{label}] ⚠️ GPU unavailable → CPU fallback. Reason: {e}")
                    gpu_enabled = False
                    params_cpu = apply_device(base_params, want_gpu=False)
                    dtrain = make_dmatrix(Xtr, y_tr, w_tr, use_gpu=False)
                    dvalid = make_dmatrix(Xva, y_va, None, use_gpu=False)
                    bst = xgb.train(
                        params=params_cpu,
                        dtrain=dtrain,
                        num_boost_round=hp["num_boost_round"],
                        evals=[(dtrain, "train"), (dvalid, "valid")],
                        early_stopping_rounds=EARLY_STOP,
                        verbose_eval=_verbose_eval()
                    )
                else:
                    if use_gpu and not GPU_STATUS["used_any"]:
                        GPU_STATUS["used_any"] = True
                        print(f"[{label}] ✅ GPU is being used (tree_method=gpu_hist, predictor=gpu_predictor).")

                # Score with the trees up to the early-stopping optimum.
                pred_va = bst.predict(dvalid, iteration_range=(0, bst.best_iteration + 1))
                y_hat = np.argmax(pred_va, axis=1)
                acc = accuracy_score(y_va, y_hat)
                fold_scores.append(acc)
                print(f"[{label}]   Fold {fold_id} Acc={acc:.5f} | best_iter={bst.best_iteration+1}")

            mean_seed_score = float(np.mean(fold_scores))
            scores_across_seeds.append(mean_seed_score)
            # Running mean over the seeds finished so far.
            trial.report(np.mean(scores_across_seeds), step=len(scores_across_seeds))

        mean_score = float(np.mean(scores_across_seeds))
        print(f"[{label}] Trial {trial.number+1} mean Acc over seeds: {mean_score:.5f}")
        return mean_score

    # Seeded TPE sampler: without it the trial sequence differs on every run.
    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
    )
    print(f"\n[{label}] Robust tuning (weighted): {trials} trials × {len(CV_SEEDS)} seeds × {N_FOLDS}-fold")
    print(f"[INFO] XGBoost {xgb.__version__} | GPU requested: {USE_GPU}")
    study.optimize(objective, n_trials=trials, show_progress_bar=True)

    print(f"[{label}] Best CV Acc: {study.best_value:.5f}")
    print(f"[{label}] Best params:\n{study.best_params}")
    return study.best_params

def train_cv_predict(Xg, yg_enc, y_labels_grp, Xtestg, params, num_class, label):
    """Fit one booster per CV fold and return fold-averaged test probabilities.

    Uses a single stratified ``N_FOLDS`` split seeded with ``RANDOM_STATE``.
    Training rows are weighted via ``build_class_weights``. Out-of-fold
    predictions are collected only to print accuracy, macro-F1 and the
    median best iteration.

    Args:
        Xg: Training features for the group.
        yg_enc: Integer-encoded targets, positionally aligned with ``Xg``.
        y_labels_grp: Original labels aligned with ``Xg``; used only to
            derive the sample weights.
        Xtestg: Test features for the group; may have zero rows.
        params: Tuned hyper-parameters with the keys produced by
            ``suggest_params``.
        num_class: Number of classes given to XGBoost.
        label: Tag prefixed to every progress line.

    Returns:
        float32 array of shape ``(len(Xtestg), num_class)`` holding the mean
        of the per-fold class probabilities (all zeros when ``Xtestg`` is
        empty, i.e. shape ``(0, num_class)``).

    Raises:
        xgboost.core.XGBoostError: If training fails while already running
            on CPU (there is nothing left to fall back to).
    """
    num_cols, cat_cols = infer_cols(Xg)
    weights_grp = build_class_weights(y_labels_grp)

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof = np.zeros((len(Xg), num_class), dtype=np.float32)
    test_pred = np.zeros((len(Xtestg), num_class), dtype=np.float32)
    best_iters = []

    # A group may have no test rows at all; skip the test-side work then,
    # since scikit-learn's input validation rejects zero-sample arrays.
    has_test = len(Xtestg) > 0
    # After the first GPU failure, later folds go straight to CPU.
    gpu_enabled = bool(USE_GPU)
    n_rounds = int(params["num_boost_round"])

    base_params = {
        "objective": "multi:softprob",
        "num_class": num_class,
        "eval_metric": "mlogloss",
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
        "verbosity": 0,
        "eta": float(params["eta"]),
        "max_depth": int(params["max_depth"]),
        "min_child_weight": float(params["min_child_weight"]),
        "subsample": float(params["subsample"]),
        "colsample_bytree": float(params["colsample_bytree"]),
        "reg_lambda": float(params["reg_lambda"]),
        "reg_alpha": float(params["reg_alpha"]),
        "gamma": float(params["gamma"]),
        "max_delta_step": float(params["max_delta_step"]),
    }

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, yg_enc), start=1):
        print(f"[{label}] Final CV • Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = yg_enc[tr_idx], yg_enc[va_idx]
        w_tr = weights_grp[tr_idx]

        # Fit the preprocessing on the training fold only.
        pre = make_preprocessor(num_cols, cat_cols)
        Xtr = pre.fit_transform(X_tr)
        Xva = pre.transform(X_va)
        Xte = pre.transform(Xtestg) if has_test else None

        params_try = apply_device(base_params, want_gpu=gpu_enabled)
        use_gpu = has_gpu(params_try)

        dtrain = make_dmatrix(Xtr, y_tr, w_tr, use_gpu)
        dvalid = make_dmatrix(Xva, y_va, None, use_gpu)
        dtest = make_dmatrix(Xte, None, None, use_gpu) if has_test else None

        try:
            bst = xgb.train(
                params=params_try,
                dtrain=dtrain,
                num_boost_round=n_rounds,
                evals=[(dtrain, "train"), (dvalid, "valid")],
                early_stopping_rounds=EARLY_STOP,
                verbose_eval=_verbose_eval()
            )
        except xgb.core.XGBoostError as e:
            if not use_gpu:
                # The failing attempt was already CPU: retrying the
                # identical configuration cannot help.
                raise
            print(f"[{label}] ⚠️ GPU unavailable → CPU fallback. Reason: {e}")
            gpu_enabled = False
            params_cpu = apply_device(base_params, want_gpu=False)
            dtrain = make_dmatrix(Xtr, y_tr, w_tr, use_gpu=False)
            dvalid = make_dmatrix(Xva, y_va, None, use_gpu=False)
            if has_test:
                dtest = make_dmatrix(Xte, None, None, use_gpu=False)
            bst = xgb.train(
                params=params_cpu,
                dtrain=dtrain,
                num_boost_round=n_rounds,
                evals=[(dtrain, "train"), (dvalid, "valid")],
                early_stopping_rounds=EARLY_STOP,
                verbose_eval=_verbose_eval()
            )
        else:
            if use_gpu and not GPU_STATUS["used_any"]:
                GPU_STATUS["used_any"] = True
                print(f"[{label}] ✅ GPU is being used (tree_method=gpu_hist, predictor=gpu_predictor).")

        # Predict with the trees up to the early-stopping optimum.
        best_round = int(bst.best_iteration + 1)
        best_iters.append(best_round)
        oof[va_idx] = bst.predict(dvalid, iteration_range=(0, best_round))
        if has_test:
            # Each fold contributes 1/N_FOLDS of the final average.
            test_pred += bst.predict(dtest, iteration_range=(0, best_round)) / N_FOLDS

        print(f"[{label}]   Fold {fold} best_iter={best_iters[-1]}")

    y_oof = np.argmax(oof, axis=1)
    print(f"[{label}] OOF Acc: {accuracy_score(yg_enc, y_oof):.5f} | "
          f"OOF F1: {f1_score(yg_enc, y_oof, average='macro'):.5f} | "
          f"median best_iter: {int(np.median(best_iters))}")
    return test_pred

# ==============================================================
# Load & Prepare
# ==============================================================

print(f"[INFO] XGBoost version: {xgb.__version__} | USE_GPU={USE_GPU}")
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# Drop columns that are deliberately excluded from the feature set
# (absent columns are ignored).
train = train.drop(columns=["MTRANS", "SMOKE"], errors="ignore")
test  = test.drop(columns=["MTRANS", "SMOKE"], errors="ignore")

# Add BMI
train = add_bmi(train)
test  = add_bmi(test)

# Detect ID/Target: first two columns of the sample submission, in that order.
ID_COL, TARGET_COL = sample_sub.columns[0], sample_sub.columns[1]

y = train[TARGET_COL].copy()
X = train.drop(columns=[TARGET_COL, ID_COL], errors="ignore")
test_ids = test[ID_COL]
test_X = test.drop(columns=[ID_COL], errors="ignore")

# Label encode target
le = LabelEncoder()
y_enc = le.fit_transform(y)
classes = le.classes_
num_class = len(classes)

# Gender split: a row is male/female when its lower-cased gender value
# starts with "m"/"f". Missing values become the string "nan" and match
# neither group.
gender_col = detect_gender(X)
train_gender = X[gender_col].astype(str).str.lower()
test_gender  = test_X[gender_col].astype(str).str.lower()
male_mask   = train_gender.str.startswith("m")
female_mask = train_gender.str.startswith("f")
test_male_mask   = test_gender.str.startswith("m")
test_female_mask = test_gender.str.startswith("f")

print(f"[INFO] Train: males={male_mask.sum()}, females={female_mask.sum()}")
print(f"[INFO] Test:  males={test_male_mask.sum()}, females={test_female_mask.sum()}")

# Rows matching neither mask are left out of both per-gender models; report
# them instead of dropping them silently.
n_train_unmatched = int((~(male_mask | female_mask)).sum())
if n_train_unmatched:
    print(f"[WARN] {n_train_unmatched} training rows have an unrecognized "
          f"{gender_col!r} value and are excluded from both gender models.")

# Split: X, y (encoded), AND the original string labels for weight computation
X_male     = X[male_mask].reset_index(drop=True)
y_male_enc = y_enc[male_mask]
y_male_lbl = y[male_mask].reset_index(drop=True)

X_female     = X[female_mask].reset_index(drop=True)
y_female_enc = y_enc[female_mask]
y_female_lbl = y[female_mask].reset_index(drop=True)

test_male   = test_X[test_male_mask].reset_index(drop=True)
test_female = test_X[test_female_mask].reset_index(drop=True)

# ==============================================================
# Tune (weighted) per gender
# ==============================================================

# One independent Optuna study per gender group; each returns its best
# hyper-parameter dict.
best_male = tune_group(
    X_male, y_male_enc, y_male_lbl,
    label="MALE", num_class=num_class, trials=TRIALS_MALE,
)
best_female = tune_group(
    X_female, y_female_enc, y_female_lbl,
    label="FEMALE", num_class=num_class, trials=TRIALS_FEMALE,
)

# ==============================================================
# Train with best params and predict (5-fold avg, weighted)
# ==============================================================

pred_male   = train_cv_predict(X_male, y_male_enc, y_male_lbl, test_male,   best_male,   num_class, "MALE")
pred_female = train_cv_predict(X_female, y_female_enc, y_female_lbl, test_female, best_female, num_class, "FEMALE")

# Merge predictions back into the original test row order.
final_proba = np.zeros((len(test_X), num_class), dtype=np.float32)
final_proba[test_male_mask.values]   = pred_male
final_proba[test_female_mask.values] = pred_female

# Test rows that matched neither gender mask have no model output. Left as
# all zeros, argmax would silently assign them class index 0, so fall back
# to the training class frequencies (i.e. the most frequent class) and say so.
unassigned = ~(test_male_mask | test_female_mask).values
if unassigned.any():
    class_prior = np.bincount(y_enc, minlength=num_class) / len(y_enc)
    final_proba[unassigned] = class_prior.astype(np.float32)
    print(f"[WARN] {int(unassigned.sum())} test rows have an unrecognized gender value; "
          f"predicting the most frequent training class for them.")

final_pred = le.inverse_transform(np.argmax(final_proba, axis=1))

# Build and save the submission file.
sub = pd.DataFrame({ID_COL: test_ids, TARGET_COL: final_pred})
sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv ✅")
print(sub.head())

# Final GPU summary
if GPU_STATUS["requested"]:
    print(f"\n[SUMMARY] GPU requested ✔ | GPU actually used at least once: {GPU_STATUS['used_any']}")
else:
    print("\n[SUMMARY] GPU not requested (forced CPU).")
