In [None]:
# ==============================================
# End-to-end (Gender-Specific Models) + BMI + Randomized Search:
# Load → Detect ID/Target/Gender → Drop MTRANS/SCC → +BMI →
# Split by Gender → (Per-gender) Randomized Search + 5-Fold XGB (ES) →
# Predict test → submission.csv → Evaluate on Kaggle_test.csv
# ==============================================

# -------- Imports --------
import os
import math
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.base import clone

import xgboost as xgb

# -------- Paths --------
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
KAGGLE_TEST_PATH = "Kaggle_test.csv"   # must contain WeightCategory ground truth

# -------- Seeds / folds / speed --------
RANDOM_STATE = 42
N_FOLDS = 5
N_JOBS = -1
NUM_CLASSES_EXPECTED = 7

# GPU toggle: set to True to request GPU tree construction.
# This is the single definition of the flag. It used to be assigned twice
# (True, then False further down), so only the later False ever took effect;
# that effective value is kept here.
USE_GPU = False

# -------- Randomized Search Config --------
N_TRIALS = 30            # increase (e.g., 60–100) for more thorough tuning
EARLY_STOP = 200         # early-stopping patience, in boosting rounds
MAX_BOOST_ROUND = 20000  # upper bound on boosting rounds
BASE_SEED = 1337

# Seed both RNGs so the randomized search draws are repeatable.
random.seed(BASE_SEED)
np.random.seed(BASE_SEED)

# -------- Helpers --------
def norm_col(s: str) -> str:
    """Return a canonical form of a column header for loose matching.

    The value is converted to ``str``, any byte-order mark is removed,
    surrounding whitespace is trimmed and the result is lower-cased.
    ``None`` is returned unchanged.
    """
    if s is None:
        return s
    text = str(s)
    without_bom = text.replace("\ufeff", "")
    return without_bom.strip().lower()

def build_norm_map(cols):
    """Build lookup tables between original and normalised column names.

    Returns:
        A ``(forward, reverse)`` pair. ``forward`` maps each original name to
        its ``norm_col`` form; ``reverse`` maps each normalised name back to
        the first original name that produced it.
    """
    forward = {}
    reverse = {}
    for original in cols:
        key = norm_col(original)
        forward[original] = key
        # First occurrence wins when several headers normalise identically.
        reverse.setdefault(key, original)
    return forward, reverse

def find_id_and_label(sample_sub, train, test):
    """Guess which columns hold the row id and the prediction label.

    Column names are compared in normalised form (see ``norm_col``). The
    search proceeds through progressively weaker heuristics:

    1. A two-column sample submission where one column also appears in
       ``test`` (the id) and the other does not (the label).
    2. Well-known id names present in both ``train`` and ``test``.
    3. The single non-id column of the sample submission.
    4. Well-known label names present in ``train``.
    5. The last ``train`` column that is not the id.

    Returns:
        A dict with the normalised id/label names plus the original header
        spelling of each in ``train``, ``test`` and ``sample_sub`` (``None``
        where the column is absent).
    """
    ss_fwd, ss_rev = build_norm_map(sample_sub.columns)
    tr_fwd, tr_rev = build_norm_map(train.columns)
    te_fwd, te_rev = build_norm_map(test.columns)

    sample_names = [ss_fwd[c] for c in sample_sub.columns]
    train_names = [tr_fwd[c] for c in train.columns]
    test_names = [te_fwd[c] for c in test.columns]

    id_norm = label_norm = None

    # Heuristic 1: classic "<id>,<label>" sample submission layout.
    if len(sample_names) == 2:
        first, second = sample_names
        first_in_test = first in test_names
        second_in_test = second in test_names
        if first_in_test and not second_in_test:
            id_norm, label_norm = first, second
        elif second_in_test and not first_in_test:
            id_norm, label_norm = second, first
        elif first_in_test and first in train_names:
            id_norm, label_norm = first, second
        elif second_in_test and second in train_names:
            id_norm, label_norm = second, first

    # Heuristic 2: conventional id names shared by train and test.
    if id_norm is None:
        id_norm = next(
            (
                cand
                for cand in ("id", "row_id", "index", "sample_id")
                if cand in test_names and cand in train_names
            ),
            None,
        )

    # Heuristic 3: the only sample-submission column that is not the id.
    if label_norm is None:
        leftovers = [c for c in sample_names if c != id_norm]
        if len(leftovers) == 1:
            label_norm = leftovers[0]

    # Heuristic 4: conventional label names found in train.
    if label_norm is None:
        label_norm = next(
            (
                cand
                for cand in ("label", "target", "class", "y", "weightcategory", "nobeyesdad")
                if cand in train_names and cand != id_norm
            ),
            None,
        )

    # Heuristic 5: last train column that is not the id.
    if label_norm is None:
        label_norm = next((c for c in reversed(train_names) if c != id_norm), None)

    return {
        "id_norm": id_norm,
        "label_norm": label_norm,
        "id_in_train": tr_rev.get(id_norm, None),
        "id_in_test": te_rev.get(id_norm, None),
        "id_in_sample": ss_rev.get(id_norm, None),
        "label_in_train": tr_rev.get(label_norm, None),
        "label_in_sample": ss_rev.get(label_norm, None),
    }

def infer_feature_types(df):
    """Partition the columns of ``df`` by dtype.

    Returns:
        ``(num_cols, cat_cols)``: names of numeric columns, then names of
        object / category / bool columns, each in frame order.
    """
    categorical_kinds = ["object", "category", "bool"]
    numeric_frame = df.select_dtypes(include=[np.number])
    categorical_frame = df.select_dtypes(include=categorical_kinds)
    return numeric_frame.columns.tolist(), categorical_frame.columns.tolist()

def detect_gender_column(df):
    """Locate the column that encodes gender, or return ``None``.

    A column whose normalised name is ``gender`` or ``sex`` wins outright.
    Otherwise the first column with two or three distinct non-null values,
    at least one starting with "m" and one with "f" (case-insensitive), is
    returned.
    """
    for name in df.columns:
        if norm_col(name) in {"gender", "sex"}:
            return name

    for name in df.columns:
        cleaned = df[name].dropna().astype(str).str.lower().str.strip()
        distinct = pd.Series(cleaned).unique()
        if len(distinct) not in (2, 3):
            continue
        has_male_like = any(v.startswith("m") for v in distinct)
        has_female_like = any(v.startswith("f") for v in distinct)
        if has_male_like and has_female_like:
            return name

    return None

def split_by_gender(series):
    """Return boolean ``(male_mask, female_mask)`` Series for ``series``.

    Values are compared as trimmed, lower-cased strings. Male rows start with
    "m", "1" or "true"; female rows start with "f", "0" or "false". When no
    row matches either pattern, the most frequent value is treated as male
    and the second most frequent as female (if at least two values exist).
    """
    normalised = series.astype(str).str.lower().str.strip()
    male_mask = normalised.str.startswith(("m", "1", "true"))
    female_mask = normalised.str.startswith(("f", "0", "false"))

    nothing_matched = not male_mask.any() and not female_mask.any()
    if nothing_matched:
        # Unknown coding: fall back to the two most common values.
        ranked = normalised.value_counts().index.tolist()
        if len(ranked) >= 2:
            male_mask = normalised == ranked[0]
            female_mask = normalised == ranked[1]

    return male_mask, female_mask

def add_bmi(df):
    """Add a ``BMI`` column (weight / height_m**2) to ``df`` and return ``df``.

    Nothing happens unless both ``Weight`` and ``Height`` columns exist.
    Heights are taken to be centimetres when their median exceeds 3.0 and are
    divided by 100 in that case. Infinite results are replaced with NaN. The
    frame is modified in place.
    """
    if "Weight" not in df.columns or "Height" not in df.columns:
        return df

    heights = pd.to_numeric(df["Height"], errors="coerce")
    looks_like_cm = np.nanmedian(heights) > 3.0
    height_m = np.where(looks_like_cm, heights / 100.0, heights)

    with np.errstate(divide="ignore", invalid="ignore"):
        weights = pd.to_numeric(df["Weight"], errors="coerce")
        # The tiny epsilon keeps a zero height from dividing by exactly zero.
        ratio = weights / (np.power(height_m, 2) + 1e-12)

    df["BMI"] = pd.Series(ratio).replace([np.inf, -np.inf], np.nan)
    return df

def round_age_inplace(df):
    """Round the ``Age`` column of ``df`` to nullable integers, in place.

    Non-numeric entries are coerced to missing values first. Frames without
    an ``Age`` column are left untouched. Returns ``None``.
    """
    if "Age" not in df.columns:
        return
    ages = pd.to_numeric(df["Age"], errors="coerce")
    df["Age"] = ages.round().astype("Int64")


# ---- Randomized Search sampling helpers ----
def sample_uniform(a, b):
    """Draw a float from ``a`` towards ``b`` using the ``random`` module."""
    fraction = random.random()
    span = b - a
    return a + span * fraction

def sample_int(a, b):
    """Draw an integer from the closed interval ``[a, b]``."""
    # randrange excludes its stop value, hence the +1 to keep ``b`` reachable.
    return random.randrange(a, b + 1)

def sample_loguniform(a, b):
    """Draw a float between ``a`` and ``b``, uniform in log space.

    Both bounds must be positive because their logarithms are taken.
    """
    log_low = math.log(a)
    log_high = math.log(b)
    exponent = log_low + (log_high - log_low) * random.random()
    return math.exp(exponent)

# ---- Preprocessor factory ----
def make_preprocessor(X_frame):
    """Build an unfitted ``ColumnTransformer`` matched to ``X_frame``'s dtypes.

    Numeric columns are median-imputed and scaled without centring.
    Categorical columns are mode-imputed and one-hot encoded, with unseen
    categories ignored at transform time. Columns of any other dtype are
    dropped, and the output is kept sparse.
    """
    num_cols, cat_cols = infer_feature_types(X_frame)

    # Newer scikit-learn spells the flag ``sparse_output``; older releases
    # only accept ``sparse`` and raise TypeError for the new keyword.
    try:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        encoder = OneHotEncoder(handle_unknown="ignore", sparse=True)

    numeric_steps = [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False)),
    ]
    categorical_steps = [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", encoder),
    ]

    return ColumnTransformer(
        transformers=[
            ("num", Pipeline(steps=numeric_steps), num_cols),
            ("cat", Pipeline(steps=categorical_steps), cat_cols),
        ],
        remainder="drop",
        sparse_threshold=1.0,
    )

# ---- Base XGB params (CPU/GPU aware) ----
def make_xgb_base():
    """Return the XGBoost parameters shared by every model in this script.

    The dict always carries the multi-class soft-probability objective, the
    ``mlogloss`` metric, the thread count and the seed. ``num_class`` is left
    as ``None`` and must be filled in by the caller.

    With ``USE_GPU`` off, the histogram tree method is used. With it on, the
    GPU settings are chosen by xgboost version: ``device="cuda"`` with
    ``hist`` for 2.0.0 and later, ``gpu_hist`` with ``gpu_predictor`` before
    that, and plain ``gpu_hist`` when the version cannot be determined.
    ``max_bin`` is set to 512 on the GPU path.

    NOTE: GPU availability is not probed here. If ``USE_GPU`` is set on a
    machine without a usable GPU, the failure will surface when training
    starts, not in this function.
    """
    base = {
        "objective": "multi:softprob",
        "num_class": None,        # fill later
        "eval_metric": "mlogloss",
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
        "tree_method": "hist",
    }
    if not USE_GPU:
        return base

    try:
        from packaging import version
        is_v2_or_later = version.parse(xgb.__version__) >= version.parse("2.0.0")
    except (ImportError, ValueError):
        # ``packaging`` missing or version string unparsable: use the legacy
        # GPU tree method on its own.
        base.update({"tree_method": "gpu_hist"})
    else:
        if is_v2_or_later:
            base.update({"device": "cuda", "tree_method": "hist"})
        else:
            base.update({"tree_method": "gpu_hist", "predictor": "gpu_predictor"})

    base.update({"max_bin": 512})
    return base

BASE_XGB = make_xgb_base()

def sample_params():
    """Draw one random XGBoost hyper-parameter set for the randomized search.

    Draws happen in a fixed order so a seeded ``random`` module reproduces
    the same sequence of candidates.
    """
    learning_rate = sample_uniform(0.02, 0.07)
    depth = sample_int(4, 10)
    child_weight = sample_int(1, 8)
    row_fraction = sample_uniform(0.6, 1.0)
    column_fraction = sample_uniform(0.6, 1.0)
    l2_penalty = sample_loguniform(1e-3, 10.0)
    l1_penalty = sample_loguniform(1e-3, 5.0)
    split_loss = sample_uniform(0.0, 5.0)
    # "max_delta_step" (sample_int(0, 3)) is an optional stabilizer that is
    # intentionally not sampled.
    return {
        "eta": learning_rate,
        "max_depth": depth,
        "min_child_weight": child_weight,
        "subsample": row_fraction,
        "colsample_bytree": column_fraction,
        "lambda": l2_penalty,
        "alpha": l1_penalty,
        "gamma": split_loss,
    }

def cv_score_for_params(Xg, yg, params, classes, group_name):
    """Score one hyper-parameter set by stratified K-fold macro F1.

    Args:
        Xg: Feature frame for the group; rows are selected with ``.iloc``.
        yg: Integer-encoded labels, indexed positionally by the fold indices.
        params: Candidate XGBoost parameters layered over ``BASE_XGB``.
        classes: Class names; only their count is used (``num_class``).
        group_name: Accepted for signature symmetry; not used here.

    Returns:
        Mean macro-F1 across the ``N_FOLDS`` validation folds, as a float.
    """
    template = make_preprocessor(Xg)
    splitter = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

    # The booster configuration is the same for every fold.
    booster_params = {**BASE_XGB, "num_class": len(classes), **params}

    fold_scores = []
    for train_rows, valid_rows in splitter.split(Xg, yg):
        # Fit the preprocessing on the training part of the fold only.
        fold_prep = clone(template)
        train_features = fold_prep.fit_transform(Xg.iloc[train_rows])
        valid_features = fold_prep.transform(Xg.iloc[valid_rows])
        valid_labels = yg[valid_rows]

        train_matrix = xgb.DMatrix(train_features, label=yg[train_rows])
        valid_matrix = xgb.DMatrix(valid_features, label=valid_labels)

        booster = xgb.train(
            params=booster_params,
            dtrain=train_matrix,
            num_boost_round=MAX_BOOST_ROUND,
            # The validation set is listed last in evals.
            evals=[(train_matrix, "train"), (valid_matrix, "valid")],
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False,
        )

        stop_round = int(booster.best_iteration + 1)
        valid_proba = booster.predict(valid_matrix, iteration_range=(0, stop_round))
        valid_pred = np.argmax(valid_proba, axis=1)
        fold_scores.append(f1_score(valid_labels, valid_pred, average="macro"))

    return float(np.mean(fold_scores))

def randomized_search_xgb(Xg, yg, classes, group_name, n_trials=N_TRIALS):
    """Run a randomized hyper-parameter search and return the best draw.

    Each trial samples a parameter set, scores it with
    ``cv_score_for_params`` and logs the result. The set with the highest
    score is returned; ``None`` is returned if no trial beats the initial
    score of -1.0 (for example when ``n_trials`` is 0).
    """
    best_score, best_params = -1.0, None

    for index in range(n_trials):
        t = index + 1
        params = sample_params()
        score = cv_score_for_params(Xg, yg, params, classes, group_name)
        print(f"[TUNE {group_name}] Trial {t:02d}/{n_trials} | macroF1={score:.4f} | params={params}")
        if score > best_score:
            best_score, best_params = score, params

    print(f"[TUNE {group_name}] Best macroF1={best_score:.4f}")
    print(f"[TUNE {group_name}] Best params={best_params}")
    return best_params

def train_group_and_predict_with_params(X_grp, y_enc_grp, test_grp, group_name, classes, best_params, gender_col):
    """Run K-fold XGBoost training for one gender group and predict its test rows.

    Args:
        X_grp: Training features for the group (may still contain ``gender_col``).
        y_enc_grp: Integer-encoded labels aligned positionally with ``X_grp``.
        test_grp: Test features for the same group.
        group_name: Tag used in progress messages.
        classes: Class names; their count sets ``num_class`` and output width.
        best_params: Tuned XGBoost parameters layered over ``BASE_XGB``.
        gender_col: Column removed from the features before training.

    Returns:
        ``(oof_group, test_group_pred)``: out-of-fold class probabilities for
        the training rows, and test probabilities averaged over the folds.
        Both are float32 arrays with one column per class.

    An empty training group returns all-zero arrays. An empty test group
    skips test inference and returns an empty prediction array. This matches
    the empty-input guard in ``train_full_and_predict_with_params``; without
    it, transforming a zero-row frame raises inside scikit-learn.
    """
    n_classes = len(classes)
    cols_to_use = [c for c in X_grp.columns if c != gender_col]
    Xg = X_grp[cols_to_use].copy()
    Xtestg = test_grp[cols_to_use].copy()

    oof_group = np.zeros((len(Xg), n_classes), dtype=np.float32)
    test_group_pred = np.zeros((len(Xtestg), n_classes), dtype=np.float32)

    if len(Xg) == 0:
        print(f"[{group_name}] No training rows; returning all-zero probabilities.")
        return oof_group, test_group_pred

    has_test_rows = len(Xtestg) > 0
    if not has_test_rows:
        print(f"[{group_name}] No test rows; skipping test inference.")

    preprocessor = make_preprocessor(Xg)
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    fold_best = []

    xgb_params = dict(BASE_XGB)
    xgb_params["num_class"] = n_classes
    xgb_params.update(best_params)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, y_enc_grp), start=1):
        print(f"\n[{group_name}] Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = y_enc_grp[tr_idx], y_enc_grp[va_idx]

        # Fit preprocessing on the training part of the fold only.
        prep = clone(preprocessor)
        Xtr = prep.fit_transform(X_tr)
        Xva = prep.transform(X_va)

        dtrain = xgb.DMatrix(Xtr, label=y_tr)
        dval = xgb.DMatrix(Xva, label=y_va)

        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=MAX_BOOST_ROUND,
            # The validation set is listed last in evals.
            evals=[(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False
        )
        best_round = int(bst.best_iteration + 1)
        fold_best.append(best_round)
        print(f"[{group_name}] Best iteration: {best_round}")

        oof_group[va_idx] = bst.predict(dval, iteration_range=(0, best_round))

        # Average the fold models' test predictions.
        if has_test_rows:
            Xtest_tf = prep.transform(Xtestg)
            dtest = xgb.DMatrix(Xtest_tf)
            test_group_pred += bst.predict(dtest, iteration_range=(0, best_round)) / N_FOLDS

    oof_labels = np.argmax(oof_group, axis=1)
    acc_g = accuracy_score(y_enc_grp, oof_labels)
    f1_g = f1_score(y_enc_grp, oof_labels, average="macro")
    print(f"\n[{group_name}] OOF Accuracy: {acc_g:.4f} | Macro F1: {f1_g:.4f}")
    print(f"[{group_name}] Best iterations: {fold_best} | Median: {int(np.median(fold_best))}")

    return oof_group, test_group_pred

# -------- Load data --------
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# Remove the columns this pipeline does not use, where present.
for frame in (train, test):
    unused = [c for c in ("MTRANS", "SCC") if c in frame.columns]
    if unused:
        frame.drop(columns=unused, inplace=True)

# Derive the BMI feature, then round Age to whole years on both frames.
train = add_bmi(train)
test = add_bmi(test)
for frame in (train, test):
    round_age_inplace(frame)


# Detect the id / label headers across train, test and the sample submission.
info = find_id_and_label(sample_sub, train, test)
ID_COL_TRAIN, ID_COL_TEST, ID_COL_SAMPLE = (
    info["id_in_train"],
    info["id_in_test"],
    info["id_in_sample"],
)
TARGET_COL = info["label_in_train"]
LABEL_COL_SAMP = info["label_in_sample"]

if TARGET_COL is None:
    raise ValueError("Could not detect the target column. Ensure sample_submission and train headers align.")

if LABEL_COL_SAMP is None:
    # Fall back to the single sample-submission column that is not the id.
    others = [c for c in sample_sub.columns if c != ID_COL_SAMPLE]
    if len(others) != 1:
        raise ValueError("Could not detect label header in sample_submission.csv")
    LABEL_COL_SAMP = others[0]

print(f"[Detected] Target in train: '{TARGET_COL}', Label in sample_sub: '{LABEL_COL_SAMP}'")
if ID_COL_TRAIN and ID_COL_TEST:
    print(f"[Detected] ID in train: '{ID_COL_TRAIN}', ID in test: '{ID_COL_TEST}'")

# Target / Features
# Separate the label from the training features and remove the id column,
# which is an identifier and not a predictor.
y = train[TARGET_COL].copy()
X = train.drop(columns=[TARGET_COL]).copy()
if ID_COL_TRAIN in X.columns:
    X.drop(columns=[ID_COL_TRAIN], inplace=True)

# Same treatment for the test frame. The ids are kept in ``test_ids``, or
# replaced by a 0..n-1 range when no id column was detected.
# NOTE(review): ``test_ids`` is not read again in the visible code; the
# submission is built from ``test[ID_COL_TEST]`` directly.
test_features = test.copy()
if ID_COL_TEST in test_features.columns:
    test_ids = test_features[ID_COL_TEST].copy()
    test_features.drop(columns=[ID_COL_TEST], inplace=True)
else:
    test_ids = pd.Series(np.arange(len(test_features)), name="id")

# Label encode target
# ``classes`` keeps the original label values in encoder order, so index i
# of a probability row corresponds to ``classes[i]``.
le = LabelEncoder()
y_enc = le.fit_transform(y)
classes = list(le.classes_)
if len(classes) != NUM_CLASSES_EXPECTED:
    # A mismatch only produces a warning; training still proceeds.
    print(f"[Warn] Expected {NUM_CLASSES_EXPECTED} classes but found {len(classes)}. Proceeding.")

# Detect gender & split
# The column is detected on train and test features together so both frames
# agree on which column encodes gender.
gender_col = detect_gender_column(pd.concat([X, test_features], axis=0))
if gender_col is None:
    raise ValueError("Could not detect a gender column (e.g., 'Gender' or 'SEX').")

male_mask, female_mask = split_by_gender(train[gender_col])
test_male_mask, test_female_mask = split_by_gender(test_features[gender_col])
print(f"[Info] Train male rows: {int(male_mask.sum())} | female rows: {int(female_mask.sum())}")
print(f"[Info] Test  male rows: {int(test_male_mask.sum())} | female rows: {int(test_female_mask.sum())}")

# Rows matched by neither mask (e.g. a missing or unexpected gender value)
# are not seen by either model. Their probability rows stay all-zero in the
# combined arrays, so argmax assigns them the first class. Report them so
# this does not go unnoticed.
n_unassigned_train = int((~(male_mask | female_mask)).sum())
n_unassigned_test = int((~(test_male_mask | test_female_mask)).sum())
if n_unassigned_train or n_unassigned_test:
    print(
        f"[Warn] Rows with unrecognised gender: train={n_unassigned_train}, "
        f"test={n_unassigned_test}. They are excluded from training and will "
        f"be predicted as '{classes[0]}'."
    )

# Per-gender frames
# Indices are reset so the positional fold indices used later line up with
# the label arrays.
X_male = X[male_mask].reset_index(drop=True)
y_male = y_enc[male_mask]
X_female = X[female_mask].reset_index(drop=True)
y_female = y_enc[female_mask]
test_male = test_features[test_male_mask].reset_index(drop=True)
test_female = test_features[test_female_mask].reset_index(drop=True)

# ---- Randomized search per gender ----
# The two searches run back to back and draw from the same seeded ``random``
# stream, so their order determines which candidates each group sees.
# NOTE(review): the search is given X_male / X_female as-is, gender column
# included, whereas the training function below removes ``gender_col`` before
# fitting. Confirm that tuning on the extra column is intended.
print("\n=== Randomized Search: MALE ===")
best_params_male = randomized_search_xgb(X_male, y_male, classes, "MALE", n_trials=N_TRIALS)
print("\n=== Randomized Search: FEMALE ===")
best_params_female = randomized_search_xgb(X_female, y_female, classes, "FEMALE", n_trials=N_TRIALS)

# ---- Train per gender using tuned params ----
# Each call returns out-of-fold probabilities for the group's training rows
# and fold-averaged probabilities for the group's test rows.
male_oof, male_test_pred = train_group_and_predict_with_params(
    X_male, y_male, test_male, "MALE", classes, best_params_male, gender_col
)
female_oof, female_test_pred = train_group_and_predict_with_params(
    X_female, y_female, test_female, "FEMALE", classes, best_params_female, gender_col
)

# ---- Combine OOF, print overall ----
# Scatter each group's out-of-fold probabilities back to the original row
# positions; rows belonging to neither group keep all-zero probabilities.
oof_full = np.zeros((len(X), len(classes)), dtype=np.float32)
for group_mask, group_oof in ((male_mask, male_oof), (female_mask, female_oof)):
    oof_full[group_mask.values] = group_oof

oof_labels = oof_full.argmax(axis=1)
oof_acc = accuracy_score(y_enc, oof_labels)
oof_f1 = f1_score(y_enc, oof_labels, average="macro")

print("\n========== OVERALL OOF ==========")
print(f"OOF Accuracy: {oof_acc:.4f} | OOF Macro F1: {oof_f1:.4f}")
try:
    oof_report = classification_report(y_enc, oof_labels, target_names=classes, zero_division=0)
    print("\nOOF Classification Report:\n", oof_report)
except Exception as e:
    # The report is informational only; a failure here must not stop the run.
    print(f"[Info] Could not print classification report: {e}")

# ---- Predict test & build submission ----
# Scatter the per-gender test probabilities back to the original test row
# order, then decode the arg-max class index to the original label values.
test_pred_proba = np.zeros((len(test_features), len(classes)), dtype=np.float32)
test_pred_proba[test_male_mask.values] = male_test_pred
test_pred_proba[test_female_mask.values] = female_test_pred
test_pred_int = np.argmax(test_pred_proba, axis=1)
test_pred_labels = le.inverse_transform(test_pred_int)

# Reuse the sample submission's headers so the output has the same layout.
ss_cols = list(sample_sub.columns)
ID_HEADER = ID_COL_SAMPLE if (ID_COL_SAMPLE in sample_sub.columns) else None
LABEL_HEADER = LABEL_COL_SAMP

sub = pd.DataFrame()
if ID_HEADER is not None and (ID_COL_TEST in test.columns):
    # Normal case: carry the ids over from the test file.
    sub[ID_HEADER] = test[ID_COL_TEST].values
elif ID_HEADER is not None:
    # The sample has an id column but the test file does not: use 0..n-1.
    sub[ID_HEADER] = np.arange(len(test_features))
sub[LABEL_HEADER] = test_pred_labels

# Any other sample-submission column is filled with the sample's first value
# (or None when the sample is empty), then columns are put in sample order.
for c in ss_cols:
    if c not in sub.columns:
        sub[c] = sample_sub[c].iloc[0] if len(sample_sub[c]) else None
sub = sub[ss_cols]
sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head(10))

# ==============================================
# Evaluate on Kaggle_test.csv (with ground truth)
# ==============================================
# This step is mandatory: the script stops here if the file is missing or
# lacks the ground-truth column. submission.csv has already been written.
if not os.path.exists(KAGGLE_TEST_PATH):
    raise FileNotFoundError(f"{KAGGLE_TEST_PATH} not found")

kdf = pd.read_csv(KAGGLE_TEST_PATH)
if "WeightCategory" not in kdf.columns:
    raise KeyError("Kaggle_test.csv must contain 'WeightCategory' as ground truth.")

# Ground-truth labels and the remaining columns used as features.
y_true = kdf["WeightCategory"].copy()
X_k = kdf.drop(columns=["WeightCategory"], errors="ignore").copy()

# drop same cols + add BMI (align to train processing)
for c in ["MTRANS","SCC"]:
    if c in X_k.columns: X_k.drop(columns=[c], inplace=True)
X_k = add_bmi(X_k)
round_age_inplace(X_k)


# detect gender on Kaggle_test
# km_k / kf_k are the boolean male / female row masks for the Kaggle set.
gender_col_k = detect_gender_column(X_k)
if gender_col_k is None:
    raise ValueError("Could not detect a gender column in Kaggle_test.csv")
km_k, kf_k = split_by_gender(X_k[gender_col_k])

# Helper: train on full group & infer on Kaggle subset using tuned params
def _pick_boost_rounds(Xf, y_arr, xgb_params):
    """Return the early-stopped round count measured on one held-out stratified fold."""
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    tr_idx, va_idx = next(skf.split(Xf, y_arr))

    prep = make_preprocessor(Xf)
    dtr = xgb.DMatrix(prep.fit_transform(Xf.iloc[tr_idx]), label=y_arr[tr_idx])
    dva = xgb.DMatrix(prep.transform(Xf.iloc[va_idx]), label=y_arr[va_idx])

    probe = xgb.train(
        params=xgb_params,
        dtrain=dtr,
        num_boost_round=MAX_BOOST_ROUND,
        # The held-out fold is listed last in evals.
        evals=[(dtr, "train"), (dva, "valid")],
        early_stopping_rounds=EARLY_STOP,
        verbose_eval=False,
    )
    return int(probe.best_iteration + 1)


def train_full_and_predict_with_params(X_full, y_full, X_eval, name, params):
    """Fit one model on a whole training group and predict ``X_eval``.

    Args:
        X_full: Training features for the group.
        y_full: Integer-encoded labels aligned positionally with ``X_full``.
        X_eval: Rows to predict; must contain every feature column of
            ``X_full`` other than the gender column.
        name: Tag used in the progress message.
        params: Tuned XGBoost parameters layered over ``BASE_XGB``.

    Returns:
        A float array of class probabilities, one row per ``X_eval`` row and
        one column per class. If either frame is empty, an all-zero float32
        array of that shape is returned.

    Reads the module-level ``classes``, ``gender_col`` and ``BASE_XGB``.

    The number of boosting rounds is chosen by early stopping on a held-out
    stratified fold, and the final model is then refit on every row for that
    many rounds. Previously early stopping watched only the training set, so
    the round count was driven by training loss and not by generalisation.
    """
    if len(X_eval) == 0 or len(X_full) == 0:
        return np.zeros((len(X_eval), len(classes)), dtype=np.float32)

    cols_to_use = [c for c in X_full.columns if c != gender_col]
    Xf = X_full[cols_to_use].copy()
    Xe = X_eval[cols_to_use].copy()
    y_arr = np.asarray(y_full)

    xgb_params = dict(BASE_XGB)
    xgb_params["num_class"] = len(classes)
    xgb_params.update(params)

    best_round = _pick_boost_rounds(Xf, y_arr, xgb_params)

    # Final fit: preprocessing and booster both see the full training group.
    prep = make_preprocessor(Xf)
    Xtr = prep.fit_transform(Xf)
    Xev = prep.transform(Xe)

    dtrain = xgb.DMatrix(Xtr, label=y_arr)
    dtest = xgb.DMatrix(Xev)

    bst = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=best_round,
        verbose_eval=False,
    )
    proba = bst.predict(dtest)
    print(f"[Kaggle] Trained {name} on {Xf.shape[0]} rows; infer {Xev.shape[0]} rows. Best iters={best_round}")
    return proba

# split Kaggle set by gender
# Indices are reset so each subset is addressed positionally.
Xk_male   = X_k.loc[km_k].reset_index(drop=True)
Xk_female = X_k.loc[kf_k].reset_index(drop=True)

# train on full train-group and predict Kaggle subsets
# Predictions are written back through the boolean masks so they land in the
# original Kaggle row order; rows in neither mask keep all-zero probabilities.
kaggle_pred_proba = np.zeros((len(X_k), len(classes)), dtype=np.float32)
kaggle_pred_proba[km_k.values] = train_full_and_predict_with_params(X_male, y_male, Xk_male, "MALE (full)", best_params_male)
kaggle_pred_proba[kf_k.values] = train_full_and_predict_with_params(X_female, y_female, Xk_female, "FEMALE (full)", best_params_female)

# Decode the arg-max class index back to the original label values.
kaggle_pred_idx = np.argmax(kaggle_pred_proba, axis=1)
y_pred = le.inverse_transform(kaggle_pred_idx)

In [None]:
# -------- Accuracy & confusion matrix --------
overall_acc = accuracy_score(y_true, y_pred)
print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

# Fixed class ordering used for the rows and columns of the report.
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]
cm = confusion_matrix(y_true, y_pred, labels=order)
# Row-normalise; the epsilon avoids dividing by zero for absent classes.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = cm.astype(float) / (row_totals + 1e-12)

print("\n=== Confusion Matrix (counts) ===")
print("Predicted →")
print("True ↓")
for true_class, counts in zip(order, cm):
    row = " | ".join(f"{value:4d}" for value in counts)
    print(f"{true_class:<22}: {row}")

print("\n=== Confusion Matrix (row-normalized) ===")
for true_class, shares in zip(order, cm_norm):
    row = " | ".join(f"{value:.2f}" for value in shares)
    print(f"{true_class:<22}: {row}")

print("\n=== Per-class metrics ===")
try:
    per_class_report = classification_report(
        y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0
    )
    print(per_class_report)
except Exception as e:
    # Retry without the fixed label ordering if the ordered report fails.
    print(f"[Info] classification_report fallback: {e}")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))