In [1]:
# ==============================================
# Gender-specific XGB + BMI + targeted class boost + Kaggle_test eval
# ==============================================
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.base import clone
import xgboost as xgb

# -------- Paths --------
# Relative CSV paths read with pd.read_csv further down in this cell.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
KAGGLE_TEST_PATH = "Kaggle_test.csv"  # has WeightCategory ground truth

# Seed shared by StratifiedKFold, the per-fold weight-jitter RNG and XGBoost ("seed").
RANDOM_STATE = 42
# Number of stratified folds; test predictions are averaged over this many models.
N_FOLDS = 5
# Forwarded to XGBoost as "nthread".
N_JOBS = -1

# -------- Helpers --------
def norm_col(s: str) -> str:
    """Return a canonical spelling of a column name for loose matching.

    The value is converted to ``str``, every U+FEFF (BOM) character is
    removed, surrounding whitespace is trimmed and the text is lower-cased.
    ``None`` is handed back unchanged.
    """
    if s is not None:
        without_bom = str(s).replace("\ufeff", "")
        return without_bom.strip().lower()
    return s

def infer_feature_types(df):
    """Partition the columns of ``df`` by dtype.

    Returns a ``(num_cols, cat_cols)`` pair of column-name lists: numeric
    dtypes go to the first list; object, category and bool dtypes go to the
    second. Columns of any other dtype appear in neither list.
    """
    categorical_kinds = ["object", "category", "bool"]
    numeric_frame = df.select_dtypes(include=[np.number])
    categorical_frame = df.select_dtypes(include=categorical_kinds)
    return numeric_frame.columns.tolist(), categorical_frame.columns.tolist()

def detect_gender_column(df):
    """Return the name of the column that most likely holds gender, or ``None``.

    Pass 1: the first column whose normalised name is "gender" or "sex".
    Pass 2 (only if pass 1 finds nothing): the first column with two or three
    distinct non-null values (compared as trimmed, lower-cased strings) where
    at least one starts with "m" and at least one starts with "f".
    """
    known_names = {"gender", "sex"}
    by_name = next((col for col in df.columns if norm_col(col) in known_names), None)
    if by_name is not None:
        return by_name

    # Heuristic fallback: look for a low-cardinality column with M*/F* values.
    for col in df.columns:
        distinct = df[col].dropna().astype(str).str.lower().str.strip().unique()
        if len(distinct) not in (2, 3):
            continue
        if any(v.startswith("m") for v in distinct) and any(v.startswith("f") for v in distinct):
            return col
    return None

def split_by_gender(series):
    """Build boolean male/female masks from a gender-like column.

    Values are compared as trimmed, lower-cased strings. A value is male when
    it starts with "m", "1" or "true" and female when it starts with "f", "0"
    or "false". If neither rule matches any row, the most frequent value is
    treated as male and the second most frequent as female (when at least two
    distinct values exist).

    Returns ``(male_mask, female_mask)``; rows matching neither are False in both.
    """
    normalized = series.astype(str).str.lower().str.strip()
    male_prefixes = ("m", "1", "true")
    female_prefixes = ("f", "0", "false")
    male_mask = normalized.str.startswith(male_prefixes)
    female_mask = normalized.str.startswith(female_prefixes)

    if male_mask.sum() != 0 or female_mask.sum() != 0:
        return male_mask, female_mask

    # Nothing recognisable: fall back to the two most common values.
    by_frequency = normalized.value_counts().index.tolist()
    if len(by_frequency) >= 2:
        male_mask = normalized == by_frequency[0]
        female_mask = normalized == by_frequency[1]
    return male_mask, female_mask

def add_bmi(df):
    """Add a ``BMI`` column (Weight / Height_m**2) to ``df`` and return it.

    The frame is modified in place and also returned. If either "Weight" or
    "Height" is missing, ``df`` is returned untouched.

    Heights are assumed to be in centimetres when their median exceeds 3.0 and
    are then converted to metres; otherwise they are used as-is. Values that
    cannot be parsed as numbers become NaN, and any non-finite result (for
    example from a zero height) is stored as NaN so it can be imputed later.
    """
    if ("Weight" not in df.columns) or ("Height" not in df.columns):
        return df

    # Coerce instead of astype(float) so stray text yields NaN rather than a crash.
    height = pd.to_numeric(df["Height"], errors="coerce")
    weight = pd.to_numeric(df["Weight"], errors="coerce")
    if height.median(skipna=True) > 3.0:
        height = height / 100.0

    # No epsilon in the denominator: a zero height must produce inf so that the
    # replace() below turns it into NaN instead of an absurdly large BMI.
    with np.errstate(divide="ignore", invalid="ignore"):
        bmi = weight / height.pow(2)
    df["BMI"] = bmi.replace([np.inf, -np.inf], np.nan)
    return df

# -------- Load data --------
train, test, sample_sub = (
    pd.read_csv(path) for path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)
)

# Remove the columns excluded from this run; names that are absent are ignored.
for frame in (train, test):
    frame.drop(columns=["MTRANS", "SMOKE"], errors="ignore", inplace=True)

# Feature engineering: derive BMI from Weight/Height on both frames.
train = add_bmi(train)
test = add_bmi(test)

# Detect ID/Target from files (simple logic)
# The ID column is the first candidate present in BOTH train and test.
id_col = next(
    (name for name in ("id", "row_id", "index", "sample_id")
     if name in train.columns and name in test.columns),
    None,
)

# The target is the first candidate present in train.
target_col = next(
    (name for name in ("WeightCategory", "NObeyesdad", "label", "target", "class", "y")
     if name in train.columns),
    None,
)
if target_col is None:
    raise ValueError("Could not detect target column in train.csv")

# Build X/y: features are everything except the target and (if found) the ID.
y = train[target_col].copy()
non_feature_cols = [target_col] + ([id_col] if id_col else [])
X = train.drop(columns=non_feature_cols).copy()

test_features = test.copy()
if id_col and id_col in test_features.columns:
    # pop() returns the ID column and removes it from the feature frame.
    test_ids = test_features.pop(id_col)
else:
    test_ids = pd.Series(np.arange(len(test_features)), name="id")

# Label encode target (fit once, then map y to integer codes)
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
classes = list(le.classes_)
print(f"[Info] Classes: {classes}")

# Detect gender and split
# Detection looks at train and test features together.
all_features = pd.concat([X, test_features], axis=0)
gender_col = detect_gender_column(all_features)
if gender_col is None:
    raise ValueError("Could not detect a gender column (e.g., 'Gender'/'SEX').")

test_male_mask, test_female_mask = split_by_gender(test_features[gender_col])
male_mask, female_mask = split_by_gender(train[gender_col])
n_train_male, n_train_female = int(male_mask.sum()), int(female_mask.sum())
n_test_male, n_test_female = int(test_male_mask.sum()), int(test_female_mask.sum())
print(f"[Info] Train male={n_train_male}, female={n_train_female}")
print(f"[Info] Test  male={n_test_male}, female={n_test_female}")

# -------- Training function (gender-specific) with class boosting for two classes --------
def _build_preprocessor(num_cols, cat_cols):
    """Return an unfitted ColumnTransformer for the given column lists.

    Numeric columns are median-imputed and scaled to unit variance without
    centering; categorical columns are imputed with the most frequent value
    and one-hot encoded with unknown categories ignored. Any other column is
    dropped.
    """
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False))
    ])
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        # scikit-learn releases without ``sparse_output`` take ``sparse`` instead.
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe)
    ])
    return ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=1.0
    )


def _boosted_sample_weights(y_tr, boost_ids, base_boost, jitter_amp, rng):
    """Return per-row training weights: 1.0, or ``base_boost`` ± jitter for boosted classes."""
    weights = np.ones_like(y_tr, dtype=float)
    # Iterate in the caller's order so the RNG draw sequence is reproducible.
    for cls_id in boost_ids:
        rows = np.where(y_tr == cls_id)[0]
        if rows.size > 0:
            jitter = rng.uniform(-jitter_amp, jitter_amp, size=rows.size)
            weights[rows] = base_boost + jitter
    return weights


def train_group_and_predict(X_grp, y_enc_grp, test_grp, group_name,
                            boost_targets=("Overweight_Level_I","Overweight_Level_II"),
                            base_boost=1.50, jitter_amp=0.10):
    """Cross-validate an XGBoost classifier on one gender group and predict its test rows.

    Reads the module-level ``gender_col``, ``classes``, ``N_FOLDS``, ``N_JOBS``
    and ``RANDOM_STATE``.

    Args:
        X_grp: feature frame for the group, with a fresh 0..n-1 index.
        y_enc_grp: integer-encoded labels, positionally aligned with ``X_grp``
            (indexed with integer arrays, so a NumPy array is expected).
        test_grp: frame to predict; must contain every non-gender column of ``X_grp``.
        group_name: tag used in progress messages.
        boost_targets: class names whose training rows get a larger sample weight.
        base_boost: centre of the boosted weight.
        jitter_amp: half-width of the uniform jitter added to ``base_boost``.

    Returns:
        ``(oof_group, test_group_pred)``: float32 class-probability arrays of
        shape ``(len(X_grp), len(classes))`` and ``(len(test_grp), len(classes))``.
        Test probabilities are the mean over the fold models; an empty
        ``test_grp`` yields an empty array instead of an error.
    """
    # Drop gender column inside a group (constant after split)
    cols_to_use = [c for c in X_grp.columns if c != gender_col]
    Xg = X_grp[cols_to_use].copy()
    Xtestg = test_grp[cols_to_use].copy()
    # Transformers reject zero-row input, so test prediction is skipped when empty.
    has_test_rows = len(Xtestg) > 0

    num_cols, cat_cols = infer_feature_types(Xg)
    preprocessor = _build_preprocessor(num_cols, cat_cols)

    # XGB params
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": len(classes),
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "max_depth": 6,
        "min_child_weight": 2,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "lambda": 1.0,
        "alpha": 0.0,
        "eta": 0.03,
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
    }
    NUM_BOOST_ROUND = 20000
    EARLY_STOP = 200

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof_group = np.zeros((len(Xg), len(classes)), dtype=np.float32)
    test_group_pred = np.zeros((len(Xtestg), len(classes)), dtype=np.float32)
    fold_best = []

    # Encoded ids of the classes to boost, in the order given by the caller.
    cls_to_idx = {c: i for i, c in enumerate(classes)}
    boost_ids = [cls_to_idx[t] for t in boost_targets if t in cls_to_idx]

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, y_enc_grp), start=1):
        print(f"\n[{group_name}] Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = y_enc_grp[tr_idx], y_enc_grp[va_idx]

        # Fit preprocessing on the training fold only to avoid leakage.
        prep = clone(preprocessor)
        Xtr = prep.fit_transform(X_tr)
        Xva = prep.transform(X_va)

        # ---- RANDOM (non-count) WEIGHTS to gently boost two classes ----
        rng = np.random.default_rng(RANDOM_STATE + fold)  # deterministic per fold
        w_tr = _boosted_sample_weights(y_tr, boost_ids, base_boost, jitter_amp, rng)
        # Validation rows stay unweighted so the early-stopping metric is plain mlogloss.
        w_va = np.ones_like(y_va, dtype=float)

        dtrain = xgb.DMatrix(Xtr, label=y_tr, weight=w_tr)
        dval   = xgb.DMatrix(Xva, label=y_va, weight=w_va)

        # Early stopping follows the last entry of ``evals`` (the validation fold).
        # ``feval`` is intentionally not passed: it was only ever None here and the
        # keyword is deprecated in XGBoost.
        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            evals=[(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False
        )
        best_round = int(bst.best_iteration + 1)
        fold_best.append(best_round)
        print(f"[{group_name}] Best iteration: {best_round}")

        oof_proba = bst.predict(dval, iteration_range=(0, best_round))
        oof_group[va_idx] = oof_proba

        # test preds for this fold (averaged across folds)
        if has_test_rows:
            Xtest_tf = prep.transform(Xtestg)
            dtest = xgb.DMatrix(Xtest_tf)
            test_group_pred += bst.predict(dtest, iteration_range=(0, best_round)) / N_FOLDS

    # OOF summary for the group
    oof_labels = np.argmax(oof_group, axis=1)
    acc_g = accuracy_score(y_enc_grp, oof_labels)
    f1_g = f1_score(y_enc_grp, oof_labels, average="macro")
    print(f"\n[{group_name}] OOF Accuracy: {acc_g:.4f} | Macro F1: {f1_g:.4f}")
    print(f"[{group_name}] Best iterations: {fold_best} | Median: {int(np.median(fold_best))}")

    return oof_group, test_group_pred

# -------- Train per-gender and predict full test --------
# Each group gets a fresh 0..n-1 index; labels are selected with the same mask.
X_male = X.loc[male_mask].reset_index(drop=True)
X_female = X.loc[female_mask].reset_index(drop=True)
y_male_enc = y_enc[male_mask.values]
y_female_enc = y_enc[female_mask.values]
test_male = test_features.loc[test_male_mask].reset_index(drop=True)
test_female = test_features.loc[test_female_mask].reset_index(drop=True)

male_oof, male_test_pred = train_group_and_predict(X_male, y_male_enc, test_male, "MALE")
female_oof, female_test_pred = train_group_and_predict(X_female, y_female_enc, test_female, "FEMALE")

# Combine OOF: scatter each group's probabilities back to its original row positions.
oof_full = np.zeros((len(X), len(classes)), dtype=np.float32)
oof_full[male_mask.values] = male_oof
oof_full[female_mask.values] = female_oof

oof_labels = oof_full.argmax(axis=1)
oof_acc = accuracy_score(y_enc, oof_labels)
oof_f1 = f1_score(y_enc, oof_labels, average="macro")
print("\n========== OVERALL OOF ==========")
print(f"OOF Accuracy: {oof_acc:.4f} | OOF Macro F1: {oof_f1:.4f}")
try:
    oof_report = classification_report(y_enc, oof_labels, target_names=classes)
    print("\nOOF Classification Report:\n", oof_report)
except Exception as e:
    print(f"[Info] Could not print classification report: {e}")

# Build full test predictions (for Kaggle submission use-case)
# Rows are filled from the per-gender predictions by mask position.
test_pred_proba = np.zeros((len(test_features), len(classes)), dtype=np.float32)
test_pred_proba[test_male_mask.values] = male_test_pred
test_pred_proba[test_female_mask.values] = female_test_pred

test_pred_int = test_pred_proba.argmax(axis=1)
test_pred_labels = le.inverse_transform(test_pred_int)

# Submission
ss_cols = list(sample_sub.columns)
ID_HEADER = None
LABEL_HEADER = None
if len(ss_cols) == 2:
    # The ID header is whichever of the two sample columns also exists in test.
    c1, c2 = ss_cols
    c1_in_test = c1 in test.columns
    c2_in_test = c2 in test.columns
    if c1_in_test and not c2_in_test:
        ID_HEADER, LABEL_HEADER = c1, c2
    elif c2_in_test and not c1_in_test:
        ID_HEADER, LABEL_HEADER = c2, c1
if ID_HEADER is None:
    # Ambiguous layout: assume (id, label) column order.
    ID_HEADER = ss_cols[0]
    LABEL_HEADER = ss_cols[1]

sub = pd.DataFrame()
sub[ID_HEADER] = (
    test[ID_HEADER].values if ID_HEADER in test.columns
    else np.arange(len(test_features))
)
sub[LABEL_HEADER] = test_pred_labels

# Ensure column order; any extra sample column is filled with the sample's first value.
for c in ss_cols:
    if c in sub.columns:
        continue
    sub[c] = sample_sub[c].iloc[0] if len(sample_sub[c]) else None
sub = sub[ss_cols]

sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head(5))

# ==============================================
# Evaluate on Kaggle_test.csv (with ground truth)
# ==============================================
kdf = pd.read_csv(KAGGLE_TEST_PATH)
if "WeightCategory" not in kdf.columns:
    raise KeyError("Kaggle_test.csv must contain 'WeightCategory'.")

y_true = kdf["WeightCategory"].copy()

# Same drops as for train/test (target, ID, excluded features), then BMI.
kaggle_drop_cols = ["WeightCategory"]
if id_col:
    kaggle_drop_cols.append(id_col)
kaggle_drop_cols.extend(["MTRANS", "SMOKE"])
X_k = kdf.drop(columns=kaggle_drop_cols, errors="ignore").copy()
X_k = add_bmi(X_k)

# detect gender and split for Kaggle set
gender_col_k = detect_gender_column(X_k)
if gender_col_k is None:
    raise ValueError("Could not detect a gender column in Kaggle_test.csv")
km_k, kf_k = split_by_gender(X_k[gender_col_k])

# Predict on Kaggle by reusing the same training procedure (per gender);
# each call re-runs the full cross-validation for that group.
kaggle_pred_proba = np.zeros((len(X_k), len(classes)), dtype=np.float32)

if len(X_male) > 0 and km_k.sum() > 0:
    kaggle_male = X_k.loc[km_k].reset_index(drop=True)
    _, male_k_pred = train_group_and_predict(X_male, y_male_enc, kaggle_male, "MALE (Kaggle)")
    kaggle_pred_proba[km_k.values] = male_k_pred
if len(X_female) > 0 and kf_k.sum() > 0:
    kaggle_female = X_k.loc[kf_k].reset_index(drop=True)
    _, female_k_pred = train_group_and_predict(X_female, y_female_enc, kaggle_female, "FEMALE (Kaggle)")
    kaggle_pred_proba[kf_k.values] = female_k_pred

kaggle_pred_idx = kaggle_pred_proba.argmax(axis=1)
y_pred = le.inverse_transform(kaggle_pred_idx)

# -------- Overall accuracy to 5 decimals --------
overall_acc = accuracy_score(y_true, y_pred)
print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

# -------- Text-only error analysis (custom order) --------
# Row/column order used for every table below.
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred, labels=order)
row_totals = cm.sum(axis=1, keepdims=True)
# The epsilon keeps rows with no true samples at 0 instead of dividing by zero.
cm_norm = cm.astype(float) / (row_totals + 1e-12)

print("\n=== Confusion Matrix (counts) ===")
print("Predicted →")
print("True ↓")
for true_class, count_row in zip(order, cm):
    row = " | ".join(f"{v:4d}" for v in count_row)
    print(f"{true_class:<22}: {row}")

print("\n=== Confusion Matrix (row-normalized) ===")
for true_class, share_row in zip(order, cm_norm):
    row = " | ".join(f"{v:.2f}" for v in share_row)
    print(f"{true_class:<22}: {row}")

print("\n=== Per-class metrics ===")
try:
    print(classification_report(y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0))
except Exception as e:
    print(f"[Info] classification_report fallback: {e}")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

print("\n=== Per-class accuracy (diagonal/row total) ===")
for c, correct, total in zip(order, cm.diagonal(), cm.sum(axis=1)):
    acc = correct / total if total > 0 else 0.0
    print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

print("\n=== Most common confusions (true → predicted) ===")
# Collect every non-empty off-diagonal cell, then rank by count and row share.
pairs = [
    (cm[i, j], t, p, cm_norm[i, j])
    for i, t in enumerate(order)
    for j, p in enumerate(order)
    if i != j and cm[i, j] != 0
]
pairs.sort(key=lambda item: (-item[0], -item[3]))
for cnt, true_label, pred_label, norm_val in pairs[:10]:
    print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")

print("\n=== Sample of misclassified rows (first 10) ===")
mis_idx = np.where(np.asarray(y_true) != np.asarray(y_pred))[0]
if mis_idx.size == 0:
    print("🎉 No misclassifications!")
else:
    for idx in mis_idx[:10]:
        true_lab = y_true.iloc[idx] if hasattr(y_true, "iloc") else y_true[idx]
        pred_lab = y_pred[idx]
        row_proba = kaggle_pred_proba[idx]
        conf = float(row_proba.max())
        # Runner-up class: second entry of the descending probability ranking.
        rank = np.argsort(-row_proba)
        second_idx = rank[min(1, rank.size - 1)]
        second_lab = le.inverse_transform([second_idx])[0]
        second_conf = float(row_proba[second_idx])
        print(f"Row {idx:4d}: true={true_lab:<22} pred={pred_lab:<22} conf={conf:.3f} 2nd={second_lab:<22}({second_conf:.3f})")


[Info] Classes: ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II']
[Info] Train male=7783, female=7750
[Info] Test  male=10336, female=10422

[MALE] Fold 1/5




[MALE] Best iteration: 336

[MALE] Fold 2/5
[MALE] Best iteration: 328

[MALE] Fold 3/5
[MALE] Best iteration: 398

[MALE] Fold 4/5
[MALE] Best iteration: 354

[MALE] Fold 5/5
[MALE] Best iteration: 332

[MALE] OOF Accuracy: 0.8876 | Macro F1: 0.7513
[MALE] Best iterations: [336, 328, 398, 354, 332] | Median: 336

[FEMALE] Fold 1/5
[FEMALE] Best iteration: 360

[FEMALE] Fold 2/5
[FEMALE] Best iteration: 334

[FEMALE] Fold 3/5
[FEMALE] Best iteration: 242

[FEMALE] Fold 4/5
[FEMALE] Best iteration: 368

[FEMALE] Fold 5/5
[FEMALE] Best iteration: 363

[FEMALE] OOF Accuracy: 0.9163 | Macro F1: 0.7487
[FEMALE] Best iterations: [360, 334, 242, 368, 363] | Median: 360

OOF Accuracy: 0.9019 | OOF Macro F1: 0.8925

OOF Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.93      0.94      0.93      1870
      Normal_Weight       0.89      0.88      0.88      2345
     Obesity_Type_I       0.89      0.87      0.88      2207
    Obesit



[MALE (Kaggle)] Best iteration: 336

[MALE (Kaggle)] Fold 2/5
[MALE (Kaggle)] Best iteration: 328

[MALE (Kaggle)] Fold 3/5
[MALE (Kaggle)] Best iteration: 398

[MALE (Kaggle)] Fold 4/5
[MALE (Kaggle)] Best iteration: 354

[MALE (Kaggle)] Fold 5/5
[MALE (Kaggle)] Best iteration: 332

[MALE (Kaggle)] OOF Accuracy: 0.8876 | Macro F1: 0.7513
[MALE (Kaggle)] Best iterations: [336, 328, 398, 354, 332] | Median: 336

[FEMALE (Kaggle)] Fold 1/5
[FEMALE (Kaggle)] Best iteration: 360

[FEMALE (Kaggle)] Fold 2/5
[FEMALE (Kaggle)] Best iteration: 334

[FEMALE (Kaggle)] Fold 3/5
[FEMALE (Kaggle)] Best iteration: 242

[FEMALE (Kaggle)] Fold 4/5
[FEMALE (Kaggle)] Best iteration: 368

[FEMALE (Kaggle)] Fold 5/5
[FEMALE (Kaggle)] Best iteration: 363

[FEMALE (Kaggle)] OOF Accuracy: 0.9163 | Macro F1: 0.7487
[FEMALE (Kaggle)] Best iterations: [360, 334, 242, 368, 363] | Median: 360

✅ Overall Accuracy on Kaggle_test: 0.90947

=== Confusion Matrix (counts) ===
Predicted →
True ↓
Insufficient_Weight   : 

In [2]:
# -------- Overall accuracy to 5 decimals --------
overall_acc = accuracy_score(y_true, y_pred)
print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

# -------- Text-only error analysis (custom order) --------
# Row/column order used for every table below.
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred, labels=order)
row_totals = cm.sum(axis=1, keepdims=True)
# The epsilon keeps rows with no true samples at 0 instead of dividing by zero.
cm_norm = cm.astype(float) / (row_totals + 1e-12)

print("\n=== Confusion Matrix (counts) ===")
print("Predicted →")
print("True ↓")
for true_class, count_row in zip(order, cm):
    row = " | ".join(f"{v:4d}" for v in count_row)
    print(f"{true_class:<22}: {row}")

print("\n=== Confusion Matrix (row-normalized) ===")
for true_class, share_row in zip(order, cm_norm):
    row = " | ".join(f"{v:.2f}" for v in share_row)
    print(f"{true_class:<22}: {row}")

print("\n=== Per-class metrics ===")
try:
    print(classification_report(y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0))
except Exception as e:
    print(f"[Info] classification_report fallback: {e}")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

print("\n=== Per-class accuracy (diagonal/row total) ===")
for c, correct, total in zip(order, cm.diagonal(), cm.sum(axis=1)):
    acc = correct / total if total > 0 else 0.0
    print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

print("\n=== Most common confusions (true → predicted) ===")
# Collect every non-empty off-diagonal cell, then rank by count and row share.
pairs = [
    (cm[i, j], t, p, cm_norm[i, j])
    for i, t in enumerate(order)
    for j, p in enumerate(order)
    if i != j and cm[i, j] != 0
]
pairs.sort(key=lambda item: (-item[0], -item[3]))
for cnt, true_label, pred_label, norm_val in pairs[:10]:
    print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")

print("\n=== Sample of misclassified rows (first 10) ===")
mis_idx = np.where(np.asarray(y_true) != np.asarray(y_pred))[0]
if mis_idx.size == 0:
    print("🎉 No misclassifications!")
else:
    for idx in mis_idx[:10]:
        true_lab = y_true.iloc[idx] if hasattr(y_true, "iloc") else y_true[idx]
        pred_lab = y_pred[idx]
        row_proba = kaggle_pred_proba[idx]
        conf = float(row_proba.max())
        # Runner-up class: second entry of the descending probability ranking.
        rank = np.argsort(-row_proba)
        second_idx = rank[min(1, rank.size - 1)]
        second_lab = le.inverse_transform([second_idx])[0]
        second_conf = float(row_proba[second_idx])
        print(f"Row {idx:4d}: true={true_lab:<22} pred={pred_lab:<22} conf={conf:.3f} 2nd={second_lab:<22}({second_conf:.3f})")


✅ Overall Accuracy on Kaggle_test: 0.90947

=== Confusion Matrix (counts) ===
Predicted →
True ↓
Insufficient_Weight   :  623 |   27 |    3 |    0 |    0 |    0 |    0
Normal_Weight         :   42 |  648 |   38 |    8 |    1 |    0 |    0
Overweight_Level_I    :    4 |   49 |  453 |   68 |    9 |    0 |    0
Overweight_Level_II   :    0 |   16 |   51 |  525 |   45 |    4 |    0
Obesity_Type_I        :    1 |    1 |   11 |   48 |  623 |   17 |    2
Obesity_Type_II       :    0 |    0 |    2 |    5 |   19 |  819 |    0
Obesity_Type_III      :    0 |    0 |    1 |    0 |    1 |    0 | 1061

=== Confusion Matrix (row-normalized) ===
Insufficient_Weight   : 0.95 | 0.04 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00
Normal_Weight         : 0.06 | 0.88 | 0.05 | 0.01 | 0.00 | 0.00 | 0.00
Overweight_Level_I    : 0.01 | 0.08 | 0.78 | 0.12 | 0.02 | 0.00 | 0.00
Overweight_Level_II   : 0.00 | 0.02 | 0.08 | 0.82 | 0.07 | 0.01 | 0.00
Obesity_Type_I        : 0.00 | 0.00 | 0.02 | 0.07 | 0.89 | 0.02 | 0.00
Obesity

In [11]:
# ==============================================
# Gender-specific XGB + BMI + targeted class boost + Kaggle_test eval
# Cost-sensitive training + weighted-F1 early stopping for Overweight I/II
# ==============================================
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.base import clone
import xgboost as xgb
import os

# -------- Paths --------
# Relative CSV paths read with pd.read_csv in a later cell.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
KAGGLE_TEST_PATH = "Kaggle_test.csv"  # must contain WeightCategory for eval

# Used as the XGBoost "seed" in the training function below.
# NOTE(review): the first cell of this notebook used 42 — confirm the change is intended.
RANDOM_STATE = 27
N_FOLDS = 5
# Forwarded to XGBoost as "nthread".
N_JOBS = -1

# ---- Cost-sensitive knobs ----
BOOST_CLASSES = ("Overweight_Level_I", "Overweight_Level_II")  # classes to emphasize
# Default for the training function's base_boost argument.
TRAIN_WEIGHT_MULT = 2.25  # sample-weight multiplier for boosted classes (1.5–3.0 typical)
# Per-class weight applied to the boosted classes when building the F1 weight vector.
F1_WEIGHT_FOR_BOOST = 2.0  # eval-time weight on boosted classes (1.5–3.0 typical)


In [12]:
# ==============================================
# Helpers
# ==============================================
def norm_col(s: str) -> str:
    """Return a canonical spelling of a column name for loose matching.

    The value is converted to ``str``, every U+FEFF (BOM) character is
    removed, surrounding whitespace is trimmed and the text is lower-cased.
    ``None`` is handed back unchanged.
    """
    if s is not None:
        without_bom = str(s).replace("\ufeff", "")
        return without_bom.strip().lower()
    return s

def infer_feature_types(df):
    """Partition the columns of ``df`` by dtype.

    Returns a ``(num_cols, cat_cols)`` pair of column-name lists: numeric
    dtypes go to the first list; object, category and bool dtypes go to the
    second. Columns of any other dtype appear in neither list.
    """
    categorical_kinds = ["object", "category", "bool"]
    numeric_frame = df.select_dtypes(include=[np.number])
    categorical_frame = df.select_dtypes(include=categorical_kinds)
    return numeric_frame.columns.tolist(), categorical_frame.columns.tolist()

def detect_gender_column(df):
    """Return the name of the column that most likely holds gender, or ``None``.

    Pass 1: the first column whose normalised name is "gender" or "sex".
    Pass 2 (only if pass 1 finds nothing): the first column with two or three
    distinct non-null values (compared as trimmed, lower-cased strings) where
    at least one starts with "m" and at least one starts with "f".
    """
    known_names = {"gender", "sex"}
    by_name = next((col for col in df.columns if norm_col(col) in known_names), None)
    if by_name is not None:
        return by_name

    # Heuristic fallback: look for a low-cardinality column with M*/F* values.
    for col in df.columns:
        distinct = df[col].dropna().astype(str).str.lower().str.strip().unique()
        if len(distinct) not in (2, 3):
            continue
        if any(v.startswith("m") for v in distinct) and any(v.startswith("f") for v in distinct):
            return col
    return None

def split_by_gender(series):
    """Build boolean male/female masks from a gender-like column.

    Values are compared as trimmed, lower-cased strings. A value is male when
    it starts with "m", "1" or "true" and female when it starts with "f", "0"
    or "false". If neither rule matches any row, the most frequent value is
    treated as male and the second most frequent as female (when at least two
    distinct values exist).

    Returns ``(male_mask, female_mask)``; rows matching neither are False in both.
    """
    normalized = series.astype(str).str.lower().str.strip()
    male_prefixes = ("m", "1", "true")
    female_prefixes = ("f", "0", "false")
    male_mask = normalized.str.startswith(male_prefixes)
    female_mask = normalized.str.startswith(female_prefixes)

    if male_mask.sum() != 0 or female_mask.sum() != 0:
        return male_mask, female_mask

    # Nothing recognisable: fall back to the two most common values.
    by_frequency = normalized.value_counts().index.tolist()
    if len(by_frequency) >= 2:
        male_mask = normalized == by_frequency[0]
        female_mask = normalized == by_frequency[1]
    return male_mask, female_mask

def add_bmi(df):
    """Add a ``BMI`` column (Weight / Height_m**2) to ``df`` and return it.

    The frame is modified in place and also returned. If either "Weight" or
    "Height" is missing, ``df`` is returned untouched.

    Heights are assumed to be in centimetres when their median exceeds 3.0 and
    are then converted to metres; otherwise they are used as-is. Values that
    cannot be parsed as numbers become NaN, and any non-finite result (for
    example from a zero height) is stored as NaN so it can be imputed later.
    """
    if ("Weight" not in df.columns) or ("Height" not in df.columns):
        return df

    height = pd.to_numeric(df["Height"], errors="coerce")
    weight = pd.to_numeric(df["Weight"], errors="coerce")
    if height.median(skipna=True) > 3.0:
        height = height / 100.0

    # No epsilon in the denominator: a zero height must produce inf so that the
    # replace() below turns it into NaN instead of an absurdly large BMI.
    with np.errstate(divide="ignore", invalid="ignore"):
        bmi = weight / height.pow(2)
    df["BMI"] = bmi.replace([np.inf, -np.inf], np.nan)
    return df


In [13]:

# ==============================================
# Load data
# ==============================================
train, test, sample_sub = (
    pd.read_csv(path) for path in (TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)
)

# Optional drops: remove excluded columns; names that are absent are ignored.
for frame in (train, test):
    frame.drop(columns=["MTRANS", "SMOKE"], errors="ignore", inplace=True)

# Feature engineering: derive BMI from Weight/Height on both frames.
train = add_bmi(train)
test = add_bmi(test)

# Detect ID/Target
# The ID column is the first candidate present in BOTH train and test.
id_col = next(
    (name for name in ("id", "row_id", "index", "sample_id")
     if name in train.columns and name in test.columns),
    None,
)

# The target is the first candidate present in train.
target_col = next(
    (name for name in ("WeightCategory", "NObeyesdad", "label", "target", "class", "y")
     if name in train.columns),
    None,
)
if target_col is None:
    raise ValueError("Could not detect target column in train.csv")

# Build X/y: features are everything except the target and (if found) the ID.
y = train[target_col].copy()
non_feature_cols = [target_col] + ([id_col] if id_col else [])
X = train.drop(columns=non_feature_cols).copy()

test_features = test.copy()
if id_col and id_col in test_features.columns:
    # pop() returns the ID column and removes it from the feature frame.
    test_ids = test_features.pop(id_col)
else:
    test_ids = pd.Series(np.arange(len(test_features)), name="id")

# Label encode target (fit once, then map y to integer codes)
le = LabelEncoder().fit(y)
y_enc = le.transform(y)
classes = list(le.classes_)
print(f"[Info] Classes: {classes}")

# Detect gender and split
# Detection looks at train and test features together.
all_features = pd.concat([X, test_features], axis=0)
gender_col = detect_gender_column(all_features)
if gender_col is None:
    raise ValueError("Could not detect a gender column (e.g., 'Gender'/'SEX').")

test_male_mask, test_female_mask = split_by_gender(test_features[gender_col])
male_mask, female_mask = split_by_gender(train[gender_col])
n_train_male, n_train_female = int(male_mask.sum()), int(female_mask.sum())
n_test_male, n_test_female = int(test_male_mask.sum()), int(test_female_mask.sum())
print(f"[Info] Train male={n_train_male}, female={n_train_female}")
print(f"[Info] Test  male={n_test_male}, female={n_test_female}")


[Info] Classes: ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II']
[Info] Train male=7783, female=7750
[Info] Test  male=10336, female=10422


In [14]:
# ==============================================
# Training function (gender-specific)
# Cost-sensitive training + weighted-F1 early stopping
# ==============================================
def train_group_and_predict(
    X_grp,
    y_enc_grp,
    test_grp,
    group_name,
    boost_targets=BOOST_CLASSES,
    base_boost=TRAIN_WEIGHT_MULT,
):
    """Run stratified K-fold XGBoost training for one gender group.

    For every fold a fresh clone of the preprocessing pipeline is fitted on the
    training part, a ``multi:softprob`` booster is trained with larger sample
    weights on the ``boost_targets`` classes, and early stopping follows a
    class-weighted mean of per-class F1 computed on the validation part.

    Reads the module-level names ``gender_col``, ``classes``,
    ``F1_WEIGHT_FOR_BOOST``, ``N_FOLDS``, ``N_JOBS`` and ``RANDOM_STATE``.

    Args:
        X_grp: Feature frame holding the group's training rows.
        y_enc_grp: Integer-encoded labels, positionally aligned with ``X_grp``.
        test_grp: Frame to predict; must contain every non-gender column of ``X_grp``.
        group_name: Tag used in the progress messages.
        boost_targets: Class names to emphasise; names absent from ``classes`` are ignored.
        base_boost: Sample weight assigned to training rows of the emphasised classes.

    Returns:
        ``(oof_group, test_group_pred)``: float32 arrays of class probabilities
        with shapes ``(len(X_grp), len(classes))`` and
        ``(len(test_grp), len(classes))``. The test array is the mean of the
        per-fold predictions.
    """
    import inspect  # local import: only needed to probe the installed xgboost API

    # Drop gender column inside a group (constant after split)
    cols_to_use = [c for c in X_grp.columns if c != gender_col]
    Xg = X_grp[cols_to_use].copy()
    Xtestg = test_grp[cols_to_use].copy()

    num_cols, cat_cols = infer_feature_types(Xg)

    # Preprocessor
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False))
    ])
    # The sparse flag was renamed across scikit-learn releases; try the newer name first
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe)
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=1.0
    )

    # XGB params
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": len(classes),
        "eval_metric": "mlogloss",  # keep for logging; early stopping uses the custom metric
        "tree_method": "hist",
        "max_depth": 6,
        "min_child_weight": 2,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "lambda": 1.0,
        "alpha": 0.0,
        "eta": 0.03,
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
    }
    NUM_BOOST_ROUND = 20000
    EARLY_STOP = 200

    # ---- Build cost maps (name -> index) ----
    cls_to_idx = {c: i for i, c in enumerate(classes)}
    boost_idx = {cls_to_idx[c] for c in boost_targets if c in cls_to_idx}

    # weighted macro-F1 eval that emphasizes the boosted classes
    f1_weights = np.array([
        (F1_WEIGHT_FOR_BOOST if i in boost_idx else 1.0) for i in range(len(classes))
    ], dtype=float)

    def weighted_macro_f1_eval(preds: np.ndarray, dmatrix: xgb.DMatrix):
        """Return ("wF1_boost", weighted mean of per-class F1) for xgboost."""
        y_true = dmatrix.get_label().astype(int)
        # reshape copes with both flat and 2-D prediction layouts; argmax gives
        # the same label for raw margins and for softmax probabilities
        proba = preds.reshape(-1, len(classes))
        y_hat = np.argmax(proba, axis=1)
        per_class_f1 = f1_score(
            y_true, y_hat,
            labels=np.arange(len(classes)),
            average=None,
            zero_division=0
        )
        score = float(np.average(per_class_f1, weights=f1_weights))
        return ("wF1_boost", score)

    # `feval` has been deprecated since xgboost 1.6 in favour of `custom_metric`.
    # Use whichever keyword the installed xgb.train actually accepts so the call
    # neither warns on current releases nor breaks on older ones.
    train_signature = inspect.signature(xgb.train).parameters
    metric_kwarg = "custom_metric" if "custom_metric" in train_signature else "feval"

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof_group = np.zeros((len(Xg), len(classes)), dtype=np.float32)
    test_group_pred = np.zeros((len(Xtestg), len(classes)), dtype=np.float32)
    fold_best = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, y_enc_grp), start=1):
        print(f"\n[{group_name}] Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = y_enc_grp[tr_idx], y_enc_grp[va_idx]

        # Fit preprocessing on the training part only to avoid leaking validation stats
        prep = clone(preprocessor)
        Xtr = prep.fit_transform(X_tr)
        Xva = prep.transform(X_va)

        # ---- Cost-sensitive sample weights (deterministic) ----
        w_tr = np.ones_like(y_tr, dtype=float)
        for idx in boost_idx:
            w_tr[y_tr == idx] = base_boost
        w_va = np.ones_like(y_va, dtype=float)  # keep validation unweighted

        dtrain = xgb.DMatrix(Xtr, label=y_tr, weight=w_tr)
        dval = xgb.DMatrix(Xva, label=y_va, weight=w_va)

        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            evals=[(dtrain, "train"), (dval, "valid")],
            maximize=True,                 # higher F1 is better
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False,
            **{metric_kwarg: weighted_macro_f1_eval}
        )
        # best_iteration is 0-based; iteration_range's end is exclusive
        best_round = int(bst.best_iteration + 1)
        fold_best.append(best_round)
        print(f"[{group_name}] Best iteration (by wF1_boost): {best_round}")

        # OOF store
        oof_proba = bst.predict(dval, iteration_range=(0, best_round))
        oof_group[va_idx] = oof_proba

        # Quick fold report for the emphasized classes
        y_hat = np.argmax(oof_proba, axis=1)
        for cname in boost_targets:
            if cname in cls_to_idx:
                cidx = cls_to_idx[cname]
                f1_c = f1_score(y_va, y_hat, labels=[cidx], average="macro", zero_division=0)
                print(f"[{group_name}] Fold {fold} F1({cname}): {f1_c:.4f}")

        # Fold predictions on this group's test slice (averaged over folds)
        Xtest_tf = prep.transform(Xtestg)
        dtest = xgb.DMatrix(Xtest_tf)
        test_group_pred += bst.predict(dtest, iteration_range=(0, best_round)) / N_FOLDS

    # OOF summary for the group
    oof_labels = np.argmax(oof_group, axis=1)
    acc_g = accuracy_score(y_enc_grp, oof_labels)
    f1_g = f1_score(y_enc_grp, oof_labels, average="macro")
    print(f"\n[{group_name}] OOF Accuracy: {acc_g:.4f} | Macro F1: {f1_g:.4f}")
    print(f"[{group_name}] Best iterations: {fold_best} | Median: {int(np.median(fold_best))}")

    return oof_group, test_group_pred


In [15]:
# ==============================================
# Train per-gender and predict full test
# ==============================================
# Slice features, labels and test rows per gender; frames get a fresh 0..n-1 index
X_male = X.loc[male_mask].reset_index(drop=True)
X_female = X.loc[female_mask].reset_index(drop=True)
y_male_enc = y_enc[male_mask]
y_female_enc = y_enc[female_mask]
test_male = test_features.loc[test_male_mask].reset_index(drop=True)
test_female = test_features.loc[test_female_mask].reset_index(drop=True)

# Male model first, then female (keeps the progress log in the same order)
male_oof, male_test_pred = train_group_and_predict(X_male, y_male_enc, test_male, "MALE")
female_oof, female_test_pred = train_group_and_predict(X_female, y_female_enc, test_female, "FEMALE")

# Combine OOF: scatter each group's rows back to their original positions
oof_full = np.zeros((len(X), len(classes)), dtype=np.float32)
for grp_mask, grp_oof in ((male_mask, male_oof), (female_mask, female_oof)):
    oof_full[grp_mask.values] = grp_oof

oof_labels = oof_full.argmax(axis=1)
oof_acc = accuracy_score(y_enc, oof_labels)
oof_f1 = f1_score(y_enc, oof_labels, average="macro")
print("\n========== OVERALL OOF ==========")
print(f"OOF Accuracy: {oof_acc:.4f} | OOF Macro F1: {oof_f1:.4f}")
try:
    oof_report = classification_report(y_enc, oof_labels, target_names=classes, zero_division=0)
    print("\nOOF Classification Report:\n", oof_report)
except Exception as e:
    # The report is informational only; never let it abort the run
    print(f"[Info] Could not print classification report: {e}")





[MALE] Fold 1/5
[MALE] Best iteration (by wF1_boost): 532
[MALE] Fold 1 F1(Overweight_Level_I): 0.8019
[MALE] Fold 1 F1(Overweight_Level_II): 0.8191

[MALE] Fold 2/5




[MALE] Best iteration (by wF1_boost): 280
[MALE] Fold 2 F1(Overweight_Level_I): 0.8028
[MALE] Fold 2 F1(Overweight_Level_II): 0.8492

[MALE] Fold 3/5




[MALE] Best iteration (by wF1_boost): 506
[MALE] Fold 3 F1(Overweight_Level_I): 0.8266
[MALE] Fold 3 F1(Overweight_Level_II): 0.8510

[MALE] Fold 4/5




[MALE] Best iteration (by wF1_boost): 422
[MALE] Fold 4 F1(Overweight_Level_I): 0.7857
[MALE] Fold 4 F1(Overweight_Level_II): 0.8220

[MALE] Fold 5/5




[MALE] Best iteration (by wF1_boost): 522
[MALE] Fold 5 F1(Overweight_Level_I): 0.7981
[MALE] Fold 5 F1(Overweight_Level_II): 0.8282

[MALE] OOF Accuracy: 0.8903 | Macro F1: 0.7534
[MALE] Best iterations: [532, 280, 506, 422, 522] | Median: 506

[FEMALE] Fold 1/5




[FEMALE] Best iteration (by wF1_boost): 299
[FEMALE] Fold 1 F1(Overweight_Level_I): 0.8012
[FEMALE] Fold 1 F1(Overweight_Level_II): 0.7895

[FEMALE] Fold 2/5




[FEMALE] Best iteration (by wF1_boost): 230
[FEMALE] Fold 2 F1(Overweight_Level_I): 0.7492
[FEMALE] Fold 2 F1(Overweight_Level_II): 0.7401

[FEMALE] Fold 3/5




[FEMALE] Best iteration (by wF1_boost): 337
[FEMALE] Fold 3 F1(Overweight_Level_I): 0.7803
[FEMALE] Fold 3 F1(Overweight_Level_II): 0.7706

[FEMALE] Fold 4/5




[FEMALE] Best iteration (by wF1_boost): 788
[FEMALE] Fold 4 F1(Overweight_Level_I): 0.8024
[FEMALE] Fold 4 F1(Overweight_Level_II): 0.7249

[FEMALE] Fold 5/5




[FEMALE] Best iteration (by wF1_boost): 380
[FEMALE] Fold 5 F1(Overweight_Level_I): 0.7771
[FEMALE] Fold 5 F1(Overweight_Level_II): 0.7887

[FEMALE] OOF Accuracy: 0.9169 | Macro F1: 0.7510
[FEMALE] Best iterations: [299, 230, 337, 788, 380] | Median: 337

OOF Accuracy: 0.9036 | OOF Macro F1: 0.8945

OOF Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.92      0.93      0.93      1870
      Normal_Weight       0.89      0.87      0.88      2345
     Obesity_Type_I       0.91      0.86      0.88      2207
    Obesity_Type_II       0.97      0.97      0.97      2403
   Obesity_Type_III       1.00      1.00      1.00      2983
 Overweight_Level_I       0.79      0.80      0.79      1844
Overweight_Level_II       0.79      0.84      0.81      1881

           accuracy                           0.90     15533
          macro avg       0.89      0.89      0.89     15533
       weighted avg       0.90      0.90      0.90     15533

In [16]:

# Assemble class probabilities for the whole test set from the two gender slices
test_pred_proba = np.zeros((len(test_features), len(classes)), dtype=np.float32)
for slice_mask, slice_pred in (
    (test_male_mask, male_test_pred),
    (test_female_mask, female_test_pred),
):
    test_pred_proba[slice_mask.values] = slice_pred

# Hard labels: most probable class, mapped back to the original class names
test_pred_int = test_pred_proba.argmax(axis=1)
test_pred_labels = le.inverse_transform(test_pred_int)

# Submission: work out which sample-submission column is the ID and which the label
ss_cols = sample_sub.columns.tolist()
ID_HEADER, LABEL_HEADER = None, None
if len(ss_cols) == 2:
    first_col, second_col = ss_cols
    first_in_test = first_col in test.columns
    second_in_test = second_col in test.columns
    # The ID is the column that also exists in test; the label is the one that does not
    if first_in_test and not second_in_test:
        ID_HEADER, LABEL_HEADER = first_col, second_col
    elif second_in_test and not first_in_test:
        ID_HEADER, LABEL_HEADER = second_col, first_col
if ID_HEADER is None:
    # Ambiguous layout: assume (id, label) column order
    ID_HEADER = ss_cols[0]
    LABEL_HEADER = ss_cols[1]

if ID_HEADER in test.columns:
    id_values = test[ID_HEADER].values
else:
    id_values = np.arange(len(test_features))
sub = pd.DataFrame({ID_HEADER: id_values, LABEL_HEADER: test_pred_labels})

# Ensure column order; columns the sample has but we lack are filled with its first value
for col in ss_cols:
    if col in sub.columns:
        continue
    sub[col] = sample_sub[col].iloc[0] if len(sample_sub[col]) else None
sub = sub[ss_cols]

sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head(5))



Saved submission.csv
   id       WeightCategory
0   0  Overweight_Level_II
1   1        Normal_Weight
2   2  Insufficient_Weight
3   3     Obesity_Type_III
4   4  Overweight_Level_II


In [17]:

# ==============================================
# Evaluate on Kaggle_test.csv (with ground truth)
# ==============================================
# Evaluate against the labelled Kaggle_test file when it is available; otherwise
# just report that the evaluation is skipped.
if not os.path.exists(KAGGLE_TEST_PATH):
    print(f"\n[Warn] {KAGGLE_TEST_PATH} not found. Skipping Kaggle_test evaluation.")
else:
    kdf = pd.read_csv(KAGGLE_TEST_PATH)
    if "WeightCategory" not in kdf.columns:
        raise KeyError("Kaggle_test.csv must contain 'WeightCategory'.")

    # Ground truth stays as string labels; features drop the target and the ID column
    y_true = kdf["WeightCategory"].copy()
    X_k = kdf.drop(columns=["WeightCategory"], errors="ignore").copy()
    if id_col and id_col in X_k.columns:
        X_k.drop(columns=[id_col], inplace=True)

    # same drops + BMI
    for c in ["MTRANS", "SMOKE"]:
        if c in X_k.columns:
            X_k.drop(columns=[c], inplace=True)
    X_k = add_bmi(X_k)

    # detect gender and split for Kaggle set
    gender_col_k = detect_gender_column(X_k)
    if gender_col_k is None:
        raise ValueError("Could not detect a gender column in Kaggle_test.csv")
    km_k, kf_k = split_by_gender(X_k[gender_col_k])

    # Predict on Kaggle by reusing the same training procedure (per gender)
    # NOTE: this re-runs the full K-fold training for each gender with the Kaggle
    # rows as the test slice. Rows matched by neither mask keep an all-zero
    # probability row, so argmax below maps them to class index 0.
    kaggle_pred_proba = np.zeros((len(X_k), len(classes)), dtype=np.float32)

    if X_male.shape[0] > 0 and km_k.sum() > 0:
        _, male_k_pred = train_group_and_predict(X_male, y_male_enc, X_k[km_k].reset_index(drop=True), "MALE (Kaggle)")
        kaggle_pred_proba[km_k.values] = male_k_pred
    if X_female.shape[0] > 0 and kf_k.sum() > 0:
        _, female_k_pred = train_group_and_predict(X_female, y_female_enc, X_k[kf_k].reset_index(drop=True), "FEMALE (Kaggle)")
        kaggle_pred_proba[kf_k.values] = female_k_pred

    # Most probable class per row, converted back to the original label strings
    kaggle_pred_idx = np.argmax(kaggle_pred_proba, axis=1)
    y_pred = le.inverse_transform(kaggle_pred_idx)

    # -------- Overall accuracy to 5 decimals --------
    overall_acc = accuracy_score(y_true, y_pred)
    print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

    # -------- Text-only error analysis (custom order) --------
    # Classes listed from lightest to heaviest category rather than in
    # LabelEncoder (alphabetical) order, so neighbouring cells are neighbouring classes.
    order = [
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III'
    ]

    # Rows are true classes, columns are predictions; the epsilon keeps empty rows from dividing by zero
    cm = confusion_matrix(y_true, y_pred, labels=order)
    cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-12)

    print("\n=== Confusion Matrix (counts) ===")
    print("Predicted →")
    print("True ↓")
    for i, true_class in enumerate(order):
        row = " | ".join(f"{cm[i, j]:4d}" for j in range(len(order)))
        print(f"{true_class:<22}: {row}")

    print("\n=== Confusion Matrix (row-normalized) ===")
    for i, true_class in enumerate(order):
        row = " | ".join(f"{cm_norm[i, j]:.2f}" for j in range(len(order)))
        print(f"{true_class:<22}: {row}")

    print("\n=== Per-class metrics ===")
    try:
        print(classification_report(y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0))
    except Exception as e:
        # Fall back to the default label set if the custom ordering is rejected
        print(f"[Info] classification_report fallback: {e}")
        print(classification_report(y_true, y_pred, digits=4, zero_division=0))

    print("\n=== Per-class accuracy (diagonal/row total) ===")
    for i, c in enumerate(order):
        total = cm[i].sum()
        correct = cm[i, i]
        acc = correct / total if total > 0 else 0.0
        print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

    print("\n=== Most common confusions (true → predicted) ===")
    # Collect every non-empty off-diagonal cell as (count, true, predicted, row share)
    pairs = []
    for i, t in enumerate(order):
        for j, p in enumerate(order):
            if i == j or cm[i, j] == 0:
                continue
            pairs.append((cm[i, j], t, p, cm_norm[i, j]))
    # Largest count first; ties broken by the larger row share
    pairs = sorted(pairs, key=lambda x: (-x[0], -x[3]))
    for cnt, true_label, pred_label, norm_val in pairs[:10]:
        print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")

    print("\n=== Sample of misclassified rows (first 10) ===")
    mis_idx = np.where(np.asarray(y_true) != np.asarray(y_pred))[0]
    if len(mis_idx) == 0:
        print("🎉 No misclassifications!")
    else:
        for idx in mis_idx[:10]:
            true_lab = y_true.iloc[idx] if hasattr(y_true, "iloc") else y_true[idx]
            pred_lab = y_pred[idx]
            conf = float(np.max(kaggle_pred_proba[idx]))
            # Class indices sorted by descending probability; position 1 is the runner-up
            rank = np.argsort(-kaggle_pred_proba[idx])
            second_idx = rank[1] if rank.size > 1 else rank[0]
            second_lab = le.inverse_transform([second_idx])[0]
            second_conf = float(kaggle_pred_proba[idx][second_idx])
            print(f"Row {idx:4d}: true={true_lab:<22} pred={pred_lab:<22} conf={conf:.3f} 2nd={second_lab:<22}({second_conf:.3f})")


[MALE (Kaggle)] Fold 1/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 532
[MALE (Kaggle)] Fold 1 F1(Overweight_Level_I): 0.8019
[MALE (Kaggle)] Fold 1 F1(Overweight_Level_II): 0.8191

[MALE (Kaggle)] Fold 2/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 280
[MALE (Kaggle)] Fold 2 F1(Overweight_Level_I): 0.8028
[MALE (Kaggle)] Fold 2 F1(Overweight_Level_II): 0.8492

[MALE (Kaggle)] Fold 3/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 506
[MALE (Kaggle)] Fold 3 F1(Overweight_Level_I): 0.8266
[MALE (Kaggle)] Fold 3 F1(Overweight_Level_II): 0.8510

[MALE (Kaggle)] Fold 4/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 422
[MALE (Kaggle)] Fold 4 F1(Overweight_Level_I): 0.7857
[MALE (Kaggle)] Fold 4 F1(Overweight_Level_II): 0.8220

[MALE (Kaggle)] Fold 5/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 522
[MALE (Kaggle)] Fold 5 F1(Overweight_Level_I): 0.7981
[MALE (Kaggle)] Fold 5 F1(Overweight_Level_II): 0.8282

[MALE (Kaggle)] OOF Accuracy: 0.8903 | Macro F1: 0.7534
[MALE (Kaggle)] Best iterations: [532, 280, 506, 422, 522] | Median: 506

[FEMALE (Kaggle)] Fold 1/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 299
[FEMALE (Kaggle)] Fold 1 F1(Overweight_Level_I): 0.8012
[FEMALE (Kaggle)] Fold 1 F1(Overweight_Level_II): 0.7895

[FEMALE (Kaggle)] Fold 2/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 230
[FEMALE (Kaggle)] Fold 2 F1(Overweight_Level_I): 0.7492
[FEMALE (Kaggle)] Fold 2 F1(Overweight_Level_II): 0.7401

[FEMALE (Kaggle)] Fold 3/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 337
[FEMALE (Kaggle)] Fold 3 F1(Overweight_Level_I): 0.7803
[FEMALE (Kaggle)] Fold 3 F1(Overweight_Level_II): 0.7706

[FEMALE (Kaggle)] Fold 4/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 788
[FEMALE (Kaggle)] Fold 4 F1(Overweight_Level_I): 0.8024
[FEMALE (Kaggle)] Fold 4 F1(Overweight_Level_II): 0.7249

[FEMALE (Kaggle)] Fold 5/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 380
[FEMALE (Kaggle)] Fold 5 F1(Overweight_Level_I): 0.7771
[FEMALE (Kaggle)] Fold 5 F1(Overweight_Level_II): 0.7887

[FEMALE (Kaggle)] OOF Accuracy: 0.9169 | Macro F1: 0.7510
[FEMALE (Kaggle)] Best iterations: [299, 230, 337, 788, 380] | Median: 337

✅ Overall Accuracy on Kaggle_test: 0.90813

=== Confusion Matrix (counts) ===
Predicted →
True ↓
Insufficient_Weight   :  622 |   28 |    3 |    0 |    0 |    0 |    0
Normal_Weight         :   41 |  643 |   44 |    8 |    1 |    0 |    0
Overweight_Level_I    :    4 |   44 |  454 |   72 |    9 |    0 |    0
Overweight_Level_II   :    0 |   15 |   54 |  526 |   42 |    4 |    0
Obesity_Type_I        :    1 |    1 |   11 |   50 |  622 |   16 |    2
Obesity_Type_II       :    0 |    0 |    1 |    6 |   21 |  817 |    0
Obesity_Type_III      :    0 |    0 |    1 |    0 |    1 |    0 | 1061

=== Confusion Matrix (row-normalized) ===
Insufficient_Weight   : 0.95 | 0.04 | 0.00 | 0.00 | 0.00 | 0.0

In [18]:
# ==============================================
# Gender-specific XGB + BMI + targeted class boost
# + Pairwise Expert (Overweight I vs II) re-ranking
# + Kaggle_test evaluation & confusion analysis
# ==============================================
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.base import clone
import xgboost as xgb

# -------- Paths --------
# Relative paths: they resolve against the current working directory.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
KAGGLE_TEST_PATH = "Kaggle_test.csv"  # must contain WeightCategory

# -------- Globals --------
RANDOM_STATE = 42   # seed for the CV shuffle and the XGBoost boosters
N_FOLDS = 5         # number of stratified CV folds per gender group
N_JOBS = -1         # forwarded to XGBoost as "nthread"

# ---- Cost-sensitive knobs ----
BOOST_CLASSES = ("Overweight_Level_I", "Overweight_Level_II")
TRAIN_WEIGHT_MULT = 2.0        # weight multiplier for emphasized classes
F1_WEIGHT_FOR_BOOST = 1.75     # evaluation-time weight for emphasized classes

# ---- Pairwise Overweight I vs II expert ----
USE_PAIRWISE_EXPERT = True     # set False to skip training/applying the binary expert
PAIR_CLASSES = ("Overweight_Level_I", "Overweight_Level_II")
TAU_MARGIN = 0.08   # route to expert if |p1 - p2| < TAU (try 0.05–0.12)

# ---- Multiclass XGB defaults ----
NUM_BOOST_ROUND = 20000        # upper bound on rounds; early stopping normally ends sooner
EARLY_STOP = 200               # rounds without improvement before training stops

# ==============================================
# Helpers
# ==============================================
def norm_col(s: str) -> str:
    """Return a comparison-friendly form of a column name.

    The value is stringified, any byte-order mark (U+FEFF) is removed, and the
    result is stripped of surrounding whitespace and lower-cased. ``None`` is
    passed through unchanged.
    """
    if s is None:
        return s
    without_bom = str(s).replace("\ufeff", "")
    return without_bom.strip().lower()

def infer_feature_types(df):
    """Split the columns of ``df`` into numeric and categorical name lists.

    Returns:
        ``(num_cols, cat_cols)``: columns with a NumPy numeric dtype, and
        columns whose dtype is object, category or bool. Columns of any other
        dtype appear in neither list.
    """
    categorical_kinds = ["object", "category", "bool"]
    numeric_names = df.select_dtypes(include=[np.number]).columns
    categorical_names = df.select_dtypes(include=categorical_kinds).columns
    return numeric_names.tolist(), categorical_names.tolist()

def detect_gender_column(df):
    """Find the column of ``df`` that holds gender, or return ``None``.

    A column whose normalised name is "gender" or "sex" wins outright.
    Otherwise the first column with two or three distinct non-null values,
    among them one starting with "m" and one starting with "f" (after
    lower-casing and stripping), is returned.
    """
    by_name = next((col for col in df.columns if norm_col(col) in {"gender", "sex"}), None)
    if by_name is not None:
        return by_name

    # fallback: detect M/F-ish column by its values
    for col in df.columns:
        tokens = df[col].dropna().astype(str).str.lower().str.strip().unique()
        if len(tokens) not in (2, 3):
            continue
        has_m = any(tok.startswith("m") for tok in tokens)
        has_f = any(tok.startswith("f") for tok in tokens)
        if has_m and has_f:
            return col
    return None

def split_by_gender(series):
    """Build boolean male/female masks from a gender-like column.

    Values are stringified, lower-cased and stripped. Values starting with
    "m", "1" or "true" count as male; those starting with "f", "0" or "false"
    count as female. If nothing matches either rule, the most frequent value
    is treated as male and the second most frequent as female (when at least
    two distinct values exist).

    Returns:
        ``(male_mask, female_mask)`` as boolean Series aligned with ``series``.
    """
    normalized = series.astype(str).str.lower().str.strip()
    is_male = normalized.str.startswith(("m", "1", "true"))
    is_female = normalized.str.startswith(("f", "0", "false"))

    # At least one recognised value: the prefix rules are authoritative
    if is_male.any() or is_female.any():
        return is_male, is_female

    # Nothing recognised: fall back to the two most frequent raw values
    by_frequency = normalized.value_counts().index.tolist()
    if len(by_frequency) < 2:
        return is_male, is_female
    return normalized == by_frequency[0], normalized == by_frequency[1]

def add_bmi(df):
    """Add a "BMI" column (weight / height_m**2) to ``df`` and return ``df``.

    Nothing is added unless both "Weight" and "Height" are present. Heights
    are taken to be centimetres, and divided by 100, when their median
    exceeds 3; otherwise they are used as metres. Non-numeric entries become
    NaN, and infinite results are replaced by NaN. The frame is modified in
    place.
    """
    if not {"Weight", "Height"}.issubset(df.columns):
        return df

    height = pd.to_numeric(df["Height"], errors="coerce")
    heights_in_cm = np.nanmedian(height) > 3.0
    height_m = np.where(heights_in_cm, height / 100.0, height)

    with np.errstate(divide="ignore", invalid="ignore"):
        weight = pd.to_numeric(df["Weight"], errors="coerce")
        # the tiny epsilon keeps a zero height from dividing by exactly zero
        ratio = weight / (np.power(height_m, 2) + 1e-12)

    df["BMI"] = pd.Series(ratio).replace([np.inf, -np.inf], np.nan)
    return df

# ==============================================
# Load data
# ==============================================
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# Optional drops: remove these columns from whichever frame has them
for c in ["MTRANS", "SMOKE"]:
    if c in train.columns:
        train.drop(columns=[c], inplace=True)
    if c in test.columns:
        test.drop(columns=[c], inplace=True)

# Feature engineering: add the BMI feature to both splits
train = add_bmi(train)
test = add_bmi(test)

# Detect ID/Target
# The ID column must be present in BOTH frames; the first candidate that is wins.
id_col = None
for cand in ["id", "row_id", "index", "sample_id"]:
    if cand in train.columns and cand in test.columns:
        id_col = cand
        break

# The target only has to exist in train; candidates are tried in this order.
target_col = None
for cand in ["WeightCategory", "NObeyesdad", "label", "target", "class", "y"]:
    if cand in train.columns:
        target_col = cand
        break
if target_col is None:
    raise ValueError("Could not detect target column in train.csv")

# Build X/y (features exclude the target and, when detected, the ID column)
y = train[target_col].copy()
X = train.drop(columns=[target_col]).copy()
if id_col and id_col in X.columns:
    X.drop(columns=[id_col], inplace=True)

test_features = test.copy()
if id_col and id_col in test_features.columns:
    test_ids = test_features[id_col].copy()
    test_features.drop(columns=[id_col], inplace=True)
else:
    # No shared ID column: fall back to positional ids
    test_ids = pd.Series(np.arange(len(test_features)), name="id")

# Label encode target
le = LabelEncoder()
y_enc = le.fit_transform(y)
classes = list(le.classes_)
print(f"[Info] Classes: {classes}")

# Detect gender and split
gender_col = detect_gender_column(pd.concat([X, test_features], axis=0))
if gender_col is None:
    raise ValueError("Could not detect a gender column (e.g., 'Gender'/'SEX').")
male_mask, female_mask = split_by_gender(train[gender_col])
test_male_mask, test_female_mask = split_by_gender(test_features[gender_col])
print(f"[Info] Train male={int(male_mask.sum())}, female={int(female_mask.sum())}")
print(f"[Info] Test  male={int(test_male_mask.sum())}, female={int(test_female_mask.sum())}")

# Rows matched by neither mask are used by no gender model. For test rows this
# means their probability row stays all-zero downstream, so argmax silently
# labels them as class index 0. Surface that instead of hiding it.
for split_name, m_mask, f_mask in (
    ("Train", male_mask, female_mask),
    ("Test", test_male_mask, test_female_mask),
):
    n_unassigned = int((~(m_mask | f_mask)).sum())
    if n_unassigned:
        print(f"[Warn] {split_name}: {n_unassigned} rows match neither gender group "
              f"and will not be handled by any gender-specific model.")

# ==============================================
# Training function (gender-specific)
# - Cost-sensitive training
# - Weighted-F1 early stopping emphasizing BOOST_CLASSES
# - Pairwise expert for Overweight I vs II (OOF + Test re-ranking)
# ==============================================
def _rerank_pair_with_expert(proba, feats, pair_bst, idx_a, idx_b, tau):
    """Let a binary expert settle close calls between two classes, in place.

    A row is re-ranked only when its two most probable classes are exactly
    ``idx_a`` and ``idx_b`` and the gap between them is below ``tau``. For
    those rows the two columns are replaced by ``1 - p`` and ``p`` (``p`` being
    the expert's prediction) and the row is rescaled to sum to one.

    Args:
        proba: 2-D class-probability array; modified in place.
        feats: Transformed feature matrix whose rows correspond to ``proba``.
        pair_bst: Binary booster whose prediction is the probability of ``idx_b``.
        idx_a: Column index of the expert's class 0.
        idx_b: Column index of the expert's class 1.
        tau: Margin threshold below which a row is handed to the expert.

    Returns:
        ``proba`` (the same array object).
    """
    row_ids = np.arange(len(proba))
    top2 = np.argsort(-proba, axis=1)[:, :2]
    margins = proba[row_ids, top2[:, 0]] - proba[row_ids, top2[:, 1]]
    mask_candidates = np.logical_and(
        np.isin(top2, [idx_a, idx_b]).sum(axis=1) == 2,
        margins < tau
    )
    if not mask_candidates.any():
        return proba

    p_b = pair_bst.predict(xgb.DMatrix(feats[mask_candidates]))
    p_a = 1.0 - p_b
    rows = np.where(mask_candidates)[0]
    proba[rows, idx_a] = p_a
    proba[rows, idx_b] = p_b
    # The pair now carries mass 1 on top of the other classes: renormalise the row
    row_sum = proba[rows].sum(axis=1, keepdims=True)
    proba[rows] /= (row_sum + 1e-12)
    return proba


def train_group_and_predict(
    X_grp,
    y_enc_grp,
    test_grp,
    group_name,
    boost_targets=BOOST_CLASSES,
    base_boost=TRAIN_WEIGHT_MULT,
):
    """Run stratified K-fold XGBoost training for one gender group.

    For every fold a fresh clone of the preprocessing pipeline is fitted on the
    training part, a ``multi:softprob`` booster is trained with larger sample
    weights on the ``boost_targets`` classes, and early stopping follows a
    class-weighted mean of per-class F1 on the validation part. When
    ``USE_PAIRWISE_EXPERT`` is set, a binary booster trained on the fold's
    ``PAIR_CLASSES`` rows re-ranks close calls between those two classes in
    both the out-of-fold and the test predictions.

    Reads the module-level names ``gender_col``, ``classes``, ``PAIR_CLASSES``,
    ``USE_PAIRWISE_EXPERT``, ``TAU_MARGIN``, ``F1_WEIGHT_FOR_BOOST``,
    ``NUM_BOOST_ROUND``, ``EARLY_STOP``, ``N_FOLDS``, ``N_JOBS`` and
    ``RANDOM_STATE``.

    Args:
        X_grp: Feature frame holding the group's training rows.
        y_enc_grp: Integer-encoded labels, positionally aligned with ``X_grp``.
        test_grp: Frame to predict; must contain every non-gender column of ``X_grp``.
        group_name: Tag used in the progress messages.
        boost_targets: Class names to emphasise; names absent from ``classes`` are ignored.
        base_boost: Sample weight assigned to training rows of the emphasised classes.

    Returns:
        ``(oof_group, test_group_pred)``: float32 arrays of class probabilities
        with shapes ``(len(X_grp), len(classes))`` and
        ``(len(test_grp), len(classes))``. The test array is the mean of the
        per-fold predictions.
    """
    import inspect  # local import: only needed to probe the installed xgboost API

    # Drop gender column inside a group (constant after split)
    cols_to_use = [c for c in X_grp.columns if c != gender_col]
    Xg = X_grp[cols_to_use].copy()
    Xtestg = test_grp[cols_to_use].copy()

    num_cols, cat_cols = infer_feature_types(Xg)

    # Preprocessor
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False))
    ])
    # The sparse flag was renamed across scikit-learn releases; try the newer name first
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe)
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=1.0
    )

    # XGB (multiclass) params
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": len(classes),
        "eval_metric": "mlogloss",  # logloss for logging; early stopping uses the custom metric
        "tree_method": "hist",
        "max_depth": 6,
        "min_child_weight": 2,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "lambda": 1.0,
        "alpha": 0.0,
        "eta": 0.03,
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
    }

    # ---- indices
    cls_to_idx = {c: i for i, c in enumerate(classes)}
    boost_idx = {cls_to_idx[c] for c in boost_targets if c in cls_to_idx}
    idx_I = cls_to_idx.get(PAIR_CLASSES[0], None)
    idx_II = cls_to_idx.get(PAIR_CLASSES[1], None)

    # weighted macro-F1 early stopping emphasizing boost classes
    f1_weights = np.array([(F1_WEIGHT_FOR_BOOST if i in boost_idx else 1.0)
                           for i in range(len(classes))], dtype=float)

    def weighted_macro_f1_eval(preds, dmatrix):
        """Return ("wF1_boost", weighted mean of per-class F1) for xgboost."""
        y_true = dmatrix.get_label().astype(int)
        # reshape copes with both flat and 2-D prediction layouts; argmax gives
        # the same label for raw margins and for softmax probabilities
        proba = preds.reshape(-1, len(classes))
        y_hat = np.argmax(proba, axis=1)
        per_class_f1 = f1_score(y_true, y_hat,
                                labels=np.arange(len(classes)),
                                average=None, zero_division=0)
        return ("wF1_boost", float(np.average(per_class_f1, weights=f1_weights)))

    # `feval` has been deprecated since xgboost 1.6 in favour of `custom_metric`.
    # Use whichever keyword the installed xgb.train actually accepts so the call
    # neither warns on current releases nor breaks on older ones.
    train_signature = inspect.signature(xgb.train).parameters
    metric_kwarg = "custom_metric" if "custom_metric" in train_signature else "feval"

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof_group = np.zeros((len(Xg), len(classes)), dtype=np.float32)
    test_group_pred = np.zeros((len(Xtestg), len(classes)), dtype=np.float32)
    fold_best = []

    # -------- TRAIN + OOF + Test accumulation per fold
    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, y_enc_grp), start=1):
        print(f"\n[{group_name}] Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = y_enc_grp[tr_idx], y_enc_grp[va_idx]

        # Fit preprocessing on the training part only to avoid leaking validation stats
        prep = clone(preprocessor)
        Xtr = prep.fit_transform(X_tr)
        Xva = prep.transform(X_va)
        Xte = prep.transform(Xtestg)

        # cost-sensitive sample weights (validation stays unweighted)
        w_tr = np.ones_like(y_tr, dtype=float)
        for idx in boost_idx:
            w_tr[y_tr == idx] = base_boost
        w_va = np.ones_like(y_va, dtype=float)

        dtrain = xgb.DMatrix(Xtr, label=y_tr, weight=w_tr)
        dval   = xgb.DMatrix(Xva, label=y_va, weight=w_va)
        dtest  = xgb.DMatrix(Xte)

        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            evals=[(dtrain, "train"), (dval, "valid")],
            maximize=True,
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False,
            **{metric_kwarg: weighted_macro_f1_eval}
        )
        # best_iteration is 0-based; iteration_range's end is exclusive
        best_round = int(bst.best_iteration + 1)
        fold_best.append(best_round)
        print(f"[{group_name}] Best iteration (by wF1_boost): {best_round}")

        # OOF proba from main model
        oof_proba = bst.predict(dval, iteration_range=(0, best_round))

        # ---- Train pairwise expert on this fold (y ∈ {I, II})
        # pair_bst stays None when the expert is disabled, a pair class is
        # unknown, or the fold has too few pair rows.
        pair_bst = None
        if USE_PAIRWISE_EXPERT and idx_I is not None and idx_II is not None:
            mask_pair_tr = np.isin(y_tr, [idx_I, idx_II])
            if mask_pair_tr.sum() >= 20:  # enough samples
                y_tr_bin = (y_tr[mask_pair_tr] == idx_II).astype(int)  # 1=II, 0=I
                Xtr_pair = Xtr[mask_pair_tr]
                dtr_pair = xgb.DMatrix(Xtr_pair, label=y_tr_bin)
                pair_bst = xgb.train(
                    params={
                        "objective": "binary:logistic",
                        "eval_metric": "logloss",
                        "tree_method": "hist",
                        "max_depth": 5,
                        "min_child_weight": 2,
                        "subsample": 0.9,
                        "colsample_bytree": 0.9,
                        "eta": 0.03,
                        "nthread": N_JOBS,
                        "seed": RANDOM_STATE + 137 * fold,
                    },
                    dtrain=dtr_pair,
                    num_boost_round=4000,
                    verbose_eval=False
                )

        # ---- OOF re-ranking with expert when top-2 are I & II and margin is small
        if pair_bst is not None:
            oof_proba = _rerank_pair_with_expert(
                oof_proba, Xva, pair_bst, idx_I, idx_II, TAU_MARGIN
            )

        # store OOF
        oof_group[va_idx] = oof_proba

        # quick per-fold check on emphasized classes
        y_hat = np.argmax(oof_proba, axis=1)
        for cname in boost_targets:
            if cname in cls_to_idx:
                cidx = cls_to_idx[cname]
                f1_c = f1_score(y_va, y_hat, labels=[cidx], average="macro", zero_division=0)
                print(f"[{group_name}] Fold {fold} F1({cname}): {f1_c:.4f}")

        # ---- Test predictions for this fold + expert re-ranking
        proba_fold = bst.predict(dtest, iteration_range=(0, best_round))
        if pair_bst is not None:
            proba_fold = _rerank_pair_with_expert(
                proba_fold, Xte, pair_bst, idx_I, idx_II, TAU_MARGIN
            )

        test_group_pred += proba_fold / N_FOLDS

    # ---- OOF summary
    oof_labels = np.argmax(oof_group, axis=1)
    acc_g = accuracy_score(y_enc_grp, oof_labels)
    f1_g = f1_score(y_enc_grp, oof_labels, average="macro")
    print(f"\n[{group_name}] OOF Accuracy: {acc_g:.4f} | Macro F1: {f1_g:.4f}")
    print(f"[{group_name}] Best iterations: {list(map(int, fold_best))} | Median: {int(np.median(fold_best))}")

    return oof_group, test_group_pred

# ==============================================
# Train per-gender and predict full test
# ==============================================
# Per-gender training inputs. Each feature subset gets a fresh 0..n-1 index via
# reset_index(drop=True); the encoded labels are filtered with the same boolean mask.
X_male = X[male_mask].reset_index(drop=True)
y_male_enc = y_enc[male_mask]
test_male = test_features[test_male_mask].reset_index(drop=True)

X_female = X[female_mask].reset_index(drop=True)
y_female_enc = y_enc[female_mask]
test_female = test_features[test_female_mask].reset_index(drop=True)

# Each call returns (out-of-fold probabilities for the group's training rows,
# fold-averaged probabilities for the group's test rows).
male_oof, male_test_pred = train_group_and_predict(X_male, y_male_enc, test_male, "MALE")
female_oof, female_test_pred = train_group_and_predict(X_female, y_female_enc, test_female, "FEMALE")

# Combine OOF: scatter each group's out-of-fold probabilities back to the
# original training row positions selected by that group's mask.
# NOTE(review): rows covered by neither mask keep their all-zero row, so the
# argmax below assigns them class index 0.
oof_full = np.zeros((len(X), len(classes)), dtype=np.float32)
oof_full[male_mask.values] = male_oof
oof_full[female_mask.values] = female_oof

# Hard labels from the merged OOF probabilities, scored against the encoded target.
oof_labels = np.argmax(oof_full, axis=1)
oof_acc = accuracy_score(y_enc, oof_labels)
oof_f1 = f1_score(y_enc, oof_labels, average="macro")
print("\n========== OVERALL OOF ==========")
print(f"OOF Accuracy: {oof_acc:.4f} | OOF Macro F1: {oof_f1:.4f}")
# The report is best-effort: any failure is printed instead of aborting the cell.
try:
    print("\nOOF Classification Report:\n",
          classification_report(y_enc, oof_labels, target_names=classes, zero_division=0))
except Exception as e:
    print(f"[Info] Could not print classification report: {e}")

# ==============================================
# Build submission
# ==============================================
# Scatter the per-gender test probabilities back into full test-row order.
# NOTE(review): test rows covered by neither mask keep an all-zero row, so the
# argmax below assigns them class index 0.
test_pred_proba = np.zeros((len(test_features), len(classes)), dtype=np.float32)
test_pred_proba[test_male_mask.values] = male_test_pred
test_pred_proba[test_female_mask.values] = female_test_pred

# Hard class index per row, mapped back to the original string labels.
test_pred_int = np.argmax(test_pred_proba, axis=1)
test_pred_labels = le.inverse_transform(test_pred_int)

# Work out which sample-submission column is the ID and which is the label:
# for a two-column file, the ID is the column that also exists in test.csv.
ss_cols = list(sample_sub.columns)
ID_HEADER = None
LABEL_HEADER = None
if len(ss_cols) == 2:
    c1, c2 = ss_cols
    if c1 in test.columns and c2 not in test.columns:
        ID_HEADER, LABEL_HEADER = c1, c2
    elif c2 in test.columns and c1 not in test.columns:
        ID_HEADER, LABEL_HEADER = c2, c1
# Fallback: assume the first column is the ID and the second is the label.
if ID_HEADER is None:
    ID_HEADER = ss_cols[0]
    LABEL_HEADER = ss_cols[1]

sub = pd.DataFrame()
# Use the real IDs when test.csv carries them, otherwise a 0..n-1 range.
if ID_HEADER in test.columns:
    sub[ID_HEADER] = test[ID_HEADER].values
else:
    sub[ID_HEADER] = np.arange(len(test_features))
sub[LABEL_HEADER] = test_pred_labels

# Any additional sample-submission column is filled with that column's first
# sample value (or None if the sample file has no rows), then columns are
# put in the sample file's order.
for c in ss_cols:
    if c not in sub.columns:
        sub[c] = sample_sub[c].iloc[0] if len(sample_sub[c]) else None
sub = sub[ss_cols]

sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head(5))

[Info] Classes: ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II']
[Info] Train male=7783, female=7750
[Info] Test  male=10336, female=10422

[MALE] Fold 1/5




[MALE] Best iteration (by wF1_boost): 487
[MALE] Fold 1 F1(Overweight_Level_I): 0.8117
[MALE] Fold 1 F1(Overweight_Level_II): 0.8324

[MALE] Fold 2/5




[MALE] Best iteration (by wF1_boost): 513
[MALE] Fold 2 F1(Overweight_Level_I): 0.8224
[MALE] Fold 2 F1(Overweight_Level_II): 0.8297

[MALE] Fold 3/5




[MALE] Best iteration (by wF1_boost): 232
[MALE] Fold 3 F1(Overweight_Level_I): 0.7700
[MALE] Fold 3 F1(Overweight_Level_II): 0.8244

[MALE] Fold 4/5




[MALE] Best iteration (by wF1_boost): 422
[MALE] Fold 4 F1(Overweight_Level_I): 0.7982
[MALE] Fold 4 F1(Overweight_Level_II): 0.8294

[MALE] Fold 5/5




[MALE] Best iteration (by wF1_boost): 234
[MALE] Fold 5 F1(Overweight_Level_I): 0.7692
[MALE] Fold 5 F1(Overweight_Level_II): 0.8195

[MALE] OOF Accuracy: 0.8874 | Macro F1: 0.7514
[MALE] Best iterations: [487, 513, 232, 422, 234] | Median: 422

[FEMALE] Fold 1/5




[FEMALE] Best iteration (by wF1_boost): 67
[FEMALE] Fold 1 F1(Overweight_Level_I): 0.7937
[FEMALE] Fold 1 F1(Overweight_Level_II): 0.7542

[FEMALE] Fold 2/5




[FEMALE] Best iteration (by wF1_boost): 96
[FEMALE] Fold 2 F1(Overweight_Level_I): 0.7674
[FEMALE] Fold 2 F1(Overweight_Level_II): 0.7431

[FEMALE] Fold 3/5




[FEMALE] Best iteration (by wF1_boost): 643
[FEMALE] Fold 3 F1(Overweight_Level_I): 0.7601
[FEMALE] Fold 3 F1(Overweight_Level_II): 0.7196

[FEMALE] Fold 4/5




[FEMALE] Best iteration (by wF1_boost): 309
[FEMALE] Fold 4 F1(Overweight_Level_I): 0.7673
[FEMALE] Fold 4 F1(Overweight_Level_II): 0.7773

[FEMALE] Fold 5/5




[FEMALE] Best iteration (by wF1_boost): 217
[FEMALE] Fold 5 F1(Overweight_Level_I): 0.8092
[FEMALE] Fold 5 F1(Overweight_Level_II): 0.7966

[FEMALE] OOF Accuracy: 0.9166 | Macro F1: 0.7504
[FEMALE] Best iterations: [67, 96, 643, 309, 217] | Median: 217

OOF Accuracy: 0.9020 | OOF Macro F1: 0.8929

OOF Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.93      0.93      0.93      1870
      Normal_Weight       0.89      0.87      0.88      2345
     Obesity_Type_I       0.90      0.86      0.88      2207
    Obesity_Type_II       0.97      0.97      0.97      2403
   Obesity_Type_III       1.00      1.00      1.00      2983
 Overweight_Level_I       0.79      0.79      0.79      1844
Overweight_Level_II       0.78      0.84      0.81      1881

           accuracy                           0.90     15533
          macro avg       0.89      0.89      0.89     15533
       weighted avg       0.90      0.90      0.90     15533


In [19]:
# ==============================================
# Evaluate on Kaggle_test.csv (with ground truth)
# ==============================================
# Evaluate the per-gender models on a labelled hold-out file, if it is present.
if not os.path.exists(KAGGLE_TEST_PATH):
    print(f"\n[Warn] {KAGGLE_TEST_PATH} not found. Skipping Kaggle_test evaluation.")
else:
    kdf = pd.read_csv(KAGGLE_TEST_PATH)
    if "WeightCategory" not in kdf.columns:
        raise KeyError("Kaggle_test.csv must contain 'WeightCategory'.")

    # Separate ground truth from features and drop the ID column, if any.
    y_true = kdf["WeightCategory"].copy()
    X_k = kdf.drop(columns=["WeightCategory"], errors="ignore").copy()
    if id_col and id_col in X_k.columns:
        X_k.drop(columns=[id_col], inplace=True)

    # Drop the MTRANS/SMOKE columns when present, then add the BMI feature.
    for c in ["MTRANS", "SMOKE"]:
        if c in X_k.columns:
            X_k.drop(columns=[c], inplace=True)
    X_k = add_bmi(X_k)

    gender_col_k = detect_gender_column(X_k)
    if gender_col_k is None:
        raise ValueError("Could not detect a gender column in Kaggle_test.csv")
    km_k, kf_k = split_by_gender(X_k[gender_col_k])

    # Predict on Kaggle by reusing the same training procedure (per gender)
    # NOTE: each call below re-runs the full cross-validated training for that
    # gender; only the second return value (test-side probabilities) is used.
    kaggle_pred_proba = np.zeros((len(X_k), len(classes)), dtype=np.float32)

    if X_male.shape[0] > 0 and km_k.sum() > 0:
        _, male_k_pred = train_group_and_predict(X_male, y_male_enc, X_k[km_k].reset_index(drop=True), "MALE (Kaggle)")
        kaggle_pred_proba[km_k.values] = male_k_pred
    if X_female.shape[0] > 0 and kf_k.sum() > 0:
        _, female_k_pred = train_group_and_predict(X_female, y_female_enc, X_k[kf_k].reset_index(drop=True), "FEMALE (Kaggle)")
        kaggle_pred_proba[kf_k.values] = female_k_pred

    # Rows in neither gender mask keep all-zero probabilities -> class index 0.
    kaggle_pred_idx = np.argmax(kaggle_pred_proba, axis=1)
    y_pred = le.inverse_transform(kaggle_pred_idx)

    # -------- Overall accuracy to 5 decimals --------
    overall_acc = accuracy_score(y_true, y_pred)
    print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

    # -------- Text-only error analysis (custom order) --------
    # Classes listed from lightest to heaviest category rather than in
    # LabelEncoder (alphabetical) order, so neighbouring classes are adjacent.
    order = [
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III'
    ]

    # Rows = true class, columns = predicted class; cm_norm divides each row by
    # its total (epsilon guards against empty rows).
    cm = confusion_matrix(y_true, y_pred, labels=order)
    cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-12)

    print("\n=== Confusion Matrix (counts) ===")
    print("Predicted →")
    print("True ↓")
    for i, true_class in enumerate(order):
        row = " | ".join(f"{cm[i, j]:4d}" for j in range(len(order)))
        print(f"{true_class:<22}: {row}")

    print("\n=== Confusion Matrix (row-normalized) ===")
    for i, true_class in enumerate(order):
        row = " | ".join(f"{cm_norm[i, j]:.2f}" for j in range(len(order)))
        print(f"{true_class:<22}: {row}")

    print("\n=== Per-class metrics ===")
    # If the ordered report fails, fall back to the default label ordering.
    try:
        print(classification_report(y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0))
    except Exception as e:
        print(f"[Info] classification_report fallback: {e}")
        print(classification_report(y_true, y_pred, digits=4, zero_division=0))

    print("\n=== Per-class accuracy (diagonal/row total) ===")
    for i, c in enumerate(order):
        total = cm[i].sum()
        correct = cm[i, i]
        acc = correct / total if total > 0 else 0.0
        print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

    print("\n=== Most common confusions (true → predicted) ===")
    # Collect every non-zero off-diagonal cell as (count, true, predicted, row share).
    pairs = []
    for i, t in enumerate(order):
        for j, p in enumerate(order):
            if i == j or cm[i, j] == 0:
                continue
            pairs.append((cm[i, j], t, p, cm_norm[i, j]))
    # Largest count first; ties broken by the larger row share.
    pairs = sorted(pairs, key=lambda x: (-x[0], -x[3]))
    for cnt, true_label, pred_label, norm_val in pairs[:10]:
        print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")

    print("\n=== Sample of misclassified rows (first 10) ===")
    mis_idx = np.where(np.asarray(y_true) != np.asarray(y_pred))[0]
    if len(mis_idx) == 0:
        print("🎉 No misclassifications!")
    else:
        for idx in mis_idx[:10]:
            true_lab = y_true.iloc[idx] if hasattr(y_true, "iloc") else y_true[idx]
            pred_lab = y_pred[idx]
            # Top probability and the runner-up class for this row.
            conf = float(np.max(kaggle_pred_proba[idx]))
            rank = np.argsort(-kaggle_pred_proba[idx])
            second_idx = rank[1] if rank.size > 1 else rank[0]
            second_lab = le.inverse_transform([second_idx])[0]
            second_conf = float(kaggle_pred_proba[idx][second_idx])
            print(f"Row {idx:4d}: true={true_lab:<22} pred={pred_lab:<22} conf={conf:.3f} 2nd={second_lab:<22}({second_conf:.3f})")




[MALE (Kaggle)] Fold 1/5
[MALE (Kaggle)] Best iteration (by wF1_boost): 487
[MALE (Kaggle)] Fold 1 F1(Overweight_Level_I): 0.8117
[MALE (Kaggle)] Fold 1 F1(Overweight_Level_II): 0.8324

[MALE (Kaggle)] Fold 2/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 513
[MALE (Kaggle)] Fold 2 F1(Overweight_Level_I): 0.8224
[MALE (Kaggle)] Fold 2 F1(Overweight_Level_II): 0.8297

[MALE (Kaggle)] Fold 3/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 232
[MALE (Kaggle)] Fold 3 F1(Overweight_Level_I): 0.7700
[MALE (Kaggle)] Fold 3 F1(Overweight_Level_II): 0.8244

[MALE (Kaggle)] Fold 4/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 422
[MALE (Kaggle)] Fold 4 F1(Overweight_Level_I): 0.7982
[MALE (Kaggle)] Fold 4 F1(Overweight_Level_II): 0.8294

[MALE (Kaggle)] Fold 5/5




[MALE (Kaggle)] Best iteration (by wF1_boost): 234
[MALE (Kaggle)] Fold 5 F1(Overweight_Level_I): 0.7692
[MALE (Kaggle)] Fold 5 F1(Overweight_Level_II): 0.8195

[MALE (Kaggle)] OOF Accuracy: 0.8874 | Macro F1: 0.7514
[MALE (Kaggle)] Best iterations: [487, 513, 232, 422, 234] | Median: 422

[FEMALE (Kaggle)] Fold 1/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 67
[FEMALE (Kaggle)] Fold 1 F1(Overweight_Level_I): 0.7937
[FEMALE (Kaggle)] Fold 1 F1(Overweight_Level_II): 0.7542

[FEMALE (Kaggle)] Fold 2/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 96
[FEMALE (Kaggle)] Fold 2 F1(Overweight_Level_I): 0.7674
[FEMALE (Kaggle)] Fold 2 F1(Overweight_Level_II): 0.7431

[FEMALE (Kaggle)] Fold 3/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 643
[FEMALE (Kaggle)] Fold 3 F1(Overweight_Level_I): 0.7601
[FEMALE (Kaggle)] Fold 3 F1(Overweight_Level_II): 0.7196

[FEMALE (Kaggle)] Fold 4/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 309
[FEMALE (Kaggle)] Fold 4 F1(Overweight_Level_I): 0.7673
[FEMALE (Kaggle)] Fold 4 F1(Overweight_Level_II): 0.7773

[FEMALE (Kaggle)] Fold 5/5




[FEMALE (Kaggle)] Best iteration (by wF1_boost): 217
[FEMALE (Kaggle)] Fold 5 F1(Overweight_Level_I): 0.8092
[FEMALE (Kaggle)] Fold 5 F1(Overweight_Level_II): 0.7966

[FEMALE (Kaggle)] OOF Accuracy: 0.9166 | Macro F1: 0.7504
[FEMALE (Kaggle)] Best iterations: [67, 96, 643, 309, 217] | Median: 217

✅ Overall Accuracy on Kaggle_test: 0.90928

=== Confusion Matrix (counts) ===
Predicted →
True ↓
Insufficient_Weight   :  619 |   31 |    3 |    0 |    0 |    0 |    0
Normal_Weight         :   39 |  646 |   46 |    5 |    1 |    0 |    0
Overweight_Level_I    :    3 |   44 |  454 |   72 |   10 |    0 |    0
Overweight_Level_II   :    0 |   16 |   51 |  529 |   41 |    4 |    0
Obesity_Type_I        :    1 |    1 |   11 |   48 |  622 |   18 |    2
Obesity_Type_II       :    0 |    0 |    2 |    5 |   18 |  820 |    0
Obesity_Type_III      :    0 |    0 |    1 |    0 |    1 |    0 | 1061

=== Confusion Matrix (row-normalized) ===
Insufficient_Weight   : 0.95 | 0.05 | 0.00 | 0.00 | 0.00 | 0.00 

In [20]:
# ==============================================
# XGB (Stratified CV) with BMI + Gender & Age-group routing (4 models)
# Groups: Male<24, Male>=24, Female<24, Female>=24
# Light class-boost for Overweight I/II + Kaggle_test evaluation
# ==============================================
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.base import clone
import xgboost as xgb

# -------- Paths --------
# Input CSVs, read with pd.read_csv further down in this notebook.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
KAGGLE_TEST_PATH = "Kaggle_test.csv"  # must contain WeightCategory

# -------- Globals --------
RANDOM_STATE = 42   # seed for StratifiedKFold shuffling and the XGBoost "seed" param
N_FOLDS = 5         # number of stratified CV folds; test predictions are averaged over them
N_JOBS = -1         # passed to XGBoost as "nthread"

# ---- Emphasize the tricky classes a bit ----
# Training rows whose label is in BOOST_CLASSES get sample weight
# TRAIN_WEIGHT_MULT; all other rows (and all validation rows) get weight 1.
BOOST_CLASSES = ("Overweight_Level_I", "Overweight_Level_II")
TRAIN_WEIGHT_MULT = 1.75      # 1.5–2.5 is a sensible range
NUM_BOOST_ROUND = 20000       # upper bound on boosting rounds (early stopping cuts it short)
EARLY_STOP = 200              # early_stopping_rounds passed to xgb.train

# ==============================================
# Helpers
# ==============================================
def norm_col(s: str) -> str:
    """Return a canonical form of a column name for loose matching.

    The value is converted to ``str``, any byte-order-mark character
    (U+FEFF) is removed, surrounding whitespace is stripped and the result
    is lower-cased. ``None`` is passed through unchanged.
    """
    if s is not None:
        return str(s).replace("\ufeff", "").strip().lower()
    return s

def infer_feature_types(df):
    """Partition the columns of *df* by dtype.

    Returns:
        A ``(num_cols, cat_cols)`` tuple of column-name lists: ``num_cols``
        holds columns with a NumPy numeric dtype, ``cat_cols`` holds
        ``object``, ``category`` and ``bool`` columns.
    """
    numeric_names = list(df.select_dtypes(include=[np.number]).columns)
    categorical_names = list(
        df.select_dtypes(include=["object", "category", "bool"]).columns
    )
    return numeric_names, categorical_names

def detect_gender_column(df):
    """Locate the column of *df* that holds gender information.

    A column whose normalized name is ``gender`` or ``sex`` wins. Otherwise
    the first column with two or three distinct non-null values, at least
    one starting with "m" and one starting with "f" (case-insensitive), is
    returned. Returns ``None`` when nothing matches.
    """
    # Pass 1: match on the column name itself.
    for col in df.columns:
        if norm_col(col) in {"gender", "sex"}:
            return col

    # Pass 2: match on the values (something that looks like M/F coding).
    for col in df.columns:
        distinct = pd.Series(df[col].dropna().astype(str).str.lower().str.strip()).unique()
        if len(distinct) not in (2, 3):
            continue
        has_m = any(val.startswith("m") for val in distinct)
        if has_m and any(val.startswith("f") for val in distinct):
            return col
    return None

def detect_age_column(df):
    """Locate the column of *df* that holds age in years.

    A column whose normalized name is ``age``, ``years`` or ``age_years``
    wins. Otherwise every numeric column is screened with a percentile
    heuristic and the first one that passes is returned. Returns ``None``
    when nothing matches.
    """
    for col in df.columns:
        if norm_col(col) in {"age", "years", "age_years"}:
            return col

    def _plausible_age(col):
        # Heuristic: the 1st percentile lies in [5, 60] OR the 99th lies in [5, 100].
        # NOTE(review): because the two tests are OR-ed this is very loose;
        # other numeric columns with values in those ranges also pass — confirm intent.
        if not pd.api.types.is_numeric_dtype(df[col]):
            return False
        values = pd.to_numeric(df[col], errors="coerce")
        if values.notna().any():
            low, high = np.nanpercentile(values, [1, 99])
        else:
            low, high = (np.nan, np.nan)
        return 5 <= low <= 60 or 5 <= high <= 100  # very loose

    # Screen every column (as the name-based pass failed) and keep the first hit.
    matches = [col for col in df.columns if _plausible_age(col)]
    return matches[0] if matches else None

def split_by_gender(series):
    """Build boolean male/female masks from a gender-like Series.

    Values are stringified, lower-cased and stripped. A value counts as male
    when it starts with "m", "1" or "true", and as female when it starts with
    "f", "0" or "false". If no value matches either rule, the most frequent
    value is treated as male and the second most frequent as female (only
    when at least two distinct values exist).

    Returns:
        ``(male_mask, female_mask)`` boolean Series aligned with *series*.
    """
    normalized = series.astype(str).str.lower().str.strip()
    is_male = normalized.str.startswith(("m", "1", "true"))
    is_female = normalized.str.startswith(("f", "0", "false"))

    # Prefix rules recognised at least one row: done.
    if not (is_male.sum() == 0 and is_female.sum() == 0):
        return is_male, is_female

    # Unknown coding: fall back to the two most common values.
    by_frequency = normalized.value_counts().index.tolist()
    if len(by_frequency) >= 2:
        is_male = normalized == by_frequency[0]
        is_female = normalized == by_frequency[1]
    return is_male, is_female

def add_bmi(df):
    """Add a ``BMI`` column (weight / height_m**2) to *df* in place and return it.

    Nothing is added unless both ``Weight`` and ``Height`` columns exist.
    Both columns are coerced to numbers (unparseable entries become NaN).
    If the median height exceeds 3, heights are assumed to be centimetres
    and are converted to metres.

    Rows whose height is zero, negative or missing get ``BMI = NaN``.
    (Previously the ``1e-12`` added to the denominator turned a zero height
    into a huge finite BMI that the inf-to-NaN replacement never caught.)

    Args:
        df: DataFrame to extend; it is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    if ("Weight" in df.columns) and ("Height" in df.columns):
        height = pd.to_numeric(df["Height"], errors="coerce").astype(float)
        weight = pd.to_numeric(df["Weight"], errors="coerce").astype(float)

        # Skip nanmedian on an all-NaN column (it would only emit a warning);
        # with no usable heights the unit does not matter.
        looks_like_cm = bool(height.notna().any()) and float(np.nanmedian(height)) > 3.0
        height_m = height / 100.0 if looks_like_cm else height

        # A non-positive height is physically meaningless: propagate NaN.
        height_m = height_m.where(height_m > 0)

        with np.errstate(divide="ignore", invalid="ignore"):
            # The epsilon is kept so valid rows give the same value as before.
            bmi = weight / (np.power(height_m.to_numpy(), 2) + 1e-12)
        df["BMI"] = bmi.replace([np.inf, -np.inf], np.nan)
    return df

def add_age_features(df, age_col):
    """Add ``AgeRounded`` and ``AgeGroup`` columns to *df* in place and return it.

    ``AgeRounded`` is the age rounded with ``np.rint`` and stored as float32.
    ``AgeGroup`` is the string "<24" where the age is below 24 and ">=24"
    everywhere else, which includes rows whose age is missing or unparseable.

    Raises:
        ValueError: if *age_col* is ``None`` or not a column of *df*.
    """
    if age_col is None or age_col not in df.columns:
        raise ValueError("Age column not found; cannot create AgeGroup split.")

    ages = pd.to_numeric(df[age_col], errors="coerce")
    under_24 = ages < 24  # NaN compares False, so missing ages land in ">=24"

    # Kept numeric (float32) so the model can use it as a regular feature.
    df["AgeRounded"] = np.rint(ages).astype("float32")
    df["AgeGroup"] = np.where(under_24, "<24", ">=24")
    return df

# ==============================================
# Load data
# ==============================================
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# Optional drops
# (removed from both frames when present, so train and test stay column-aligned)
for c in ["MTRANS", "SMOKE"]:
    if c in train.columns:
        train.drop(columns=[c], inplace=True)
    if c in test.columns:
        test.drop(columns=[c], inplace=True)

# Feature engineering
# add_bmi adds a "BMI" column when both "Weight" and "Height" exist.
train = add_bmi(train)
test = add_bmi(test)

# Detect ID/Target
# The ID column must exist in BOTH train and test; first candidate wins.
id_col = None
for cand in ["id", "row_id", "index", "sample_id"]:
    if cand in train.columns and cand in test.columns:
        id_col = cand
        break

# The target is the first of these candidate names found in train.
target_col = None
for cand in ["WeightCategory", "NObeyesdad", "label", "target", "class", "y"]:
    if cand in train.columns:
        target_col = cand
        break
if target_col is None:
    raise ValueError("Could not detect target column in train.csv")

# Build X/y
y = train[target_col].copy()
X = train.drop(columns=[target_col]).copy()
if id_col and id_col in X.columns:
    X.drop(columns=[id_col], inplace=True)

# Test features = test without the ID column; the IDs are kept separately
# (a 0..n-1 range is used when test has no ID column).
test_features = test.copy()
if id_col and id_col in test_features.columns:
    test_ids = test_features[id_col].copy()
    test_features.drop(columns=[id_col], inplace=True)
else:
    test_ids = pd.Series(np.arange(len(test_features)), name="id")

# Label encode target
# `classes` lists the label strings in encoder order, i.e. index i <-> classes[i].
le = LabelEncoder()
y_enc = le.fit_transform(y)
classes = list(le.classes_)
print(f"[Info] Classes: {classes}")

# Detect gender + age and create age groups
# Detection runs on train and test features stacked together.
gender_col = detect_gender_column(pd.concat([X, test_features], axis=0))
if gender_col is None:
    raise ValueError("Could not detect a gender column (e.g., 'Gender'/'SEX').")

age_col = detect_age_column(pd.concat([X, test_features], axis=0))
if age_col is None:
    raise ValueError("Could not detect an Age column.")

# Add AgeRounded & AgeGroup in BOTH train/test (and DO NOT drop originals)
# Note: on the test side the columns are added to test_features, not to `test`.
train = add_age_features(train, age_col)
test_features = add_age_features(test_features, age_col)

# Rebuild X (because train got new columns)
X = train.drop(columns=[target_col]).copy()
if id_col and id_col in X.columns:
    X.drop(columns=[id_col], inplace=True)

# Split gender masks
train_male_mask, train_female_mask = split_by_gender(train[gender_col])
test_male_mask, test_female_mask = split_by_gender(test_features[gender_col])

# Split age-group masks
# "old" is defined as "not <24", so every row falls in exactly one age group.
train_young_mask = train["AgeGroup"] == "<24"
train_old_mask = train["AgeGroup"] != "<24"
test_young_mask = test_features["AgeGroup"] == "<24"
test_old_mask = test_features["AgeGroup"] != "<24"

# Row counts of the four gender x age-group cells, for train and test.
print(f"[Info] Train M<24={int((train_male_mask & train_young_mask).sum())}, "
      f"M>=24={int((train_male_mask & train_old_mask).sum())}, "
      f"F<24={int((train_female_mask & train_young_mask).sum())}, "
      f"F>=24={int((train_female_mask & train_old_mask).sum())}")

print(f"[Info] Test  M<24={int((test_male_mask & test_young_mask).sum())}, "
      f"M>=24={int((test_male_mask & test_old_mask).sum())}, "
      f"F<24={int((test_female_mask & test_young_mask).sum())}, "
      f"F>=24={int((test_female_mask & test_old_mask).sum())}")

[Info] Classes: ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II']
[Info] Train M<24=4649, M>=24=3134, F<24=4786, F>=24=2964
[Info] Test  M<24=6130, M>=24=4206, F<24=6420, F>=24=4002


In [21]:
# ==============================================
# Training function (group-specific)
# Stratified 5-fold + light class-boost on Overweight I/II
# ==============================================
def train_group_and_predict(X_grp, y_enc_grp, test_grp, group_name,
                            boost_targets=BOOST_CLASSES, base_boost=TRAIN_WEIGHT_MULT):
    """Train a stratified K-fold XGBoost ensemble for one group and predict its test rows.

    For each of the ``N_FOLDS`` stratified folds a fresh preprocessor
    (median-impute + scale numerics, mode-impute + one-hot categoricals) is
    fitted on the training part, an ``multi:softprob`` booster is trained
    with early stopping, the validation part is scored (out-of-fold), and
    the group's test rows are scored and averaged across folds.

    Relies on notebook globals: ``gender_col``, ``classes``, ``N_FOLDS``,
    ``N_JOBS``, ``RANDOM_STATE``, ``NUM_BOOST_ROUND`` and ``EARLY_STOP``.

    Args:
        X_grp: Training features of the group (DataFrame).
        y_enc_grp: Integer-encoded labels for ``X_grp``; indexed positionally
            with the fold indices, so it is expected to be array-like.
        test_grp: Test features of the group; must contain the same columns
            as ``X_grp``. May be empty, in which case test inference is
            skipped and an array with zero rows is returned for it.
        group_name: Label used as a prefix in the progress output.
        boost_targets: Class names whose training rows get extra weight.
        base_boost: Sample weight applied to those training rows.

    Returns:
        ``(oof_group, test_group_pred)``: float32 probability arrays of shape
        ``(len(X_grp), len(classes))`` and ``(len(test_grp), len(classes))``.
    """
    # The gender column is dropped explicitly (this function is used on
    # gender-specific groups). AgeGroup is left in even though it may be
    # constant within a group.
    cols_to_use = [c for c in X_grp.columns if c != gender_col]
    Xg = X_grp[cols_to_use].copy()
    Xtestg = test_grp[cols_to_use].copy()

    # The preprocessor cannot transform a frame with zero rows, so test
    # inference is skipped entirely for a group without test rows.
    has_test_rows = len(Xtestg) > 0
    if not has_test_rows:
        print(f"[{group_name}] No test rows for this group; skipping test inference.")

    num_cols, cat_cols = infer_feature_types(Xg)

    # Preprocessor
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False))  # no centering keeps sparse input sparse
    ])
    # `sparse_output` replaced `sparse` in newer scikit-learn; support both.
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe)
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=1.0
    )

    # XGB params
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": len(classes),  # always the global class count, even if a group lacks some classes
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "max_depth": 6,
        "min_child_weight": 2,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "lambda": 1.0,
        "alpha": 0.0,
        "eta": 0.03,
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
    }

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof_group = np.zeros((len(Xg), len(classes)), dtype=np.float32)
    test_group_pred = np.zeros((len(Xtestg), len(classes)), dtype=np.float32)
    fold_best = []

    # map class name -> index
    cls_to_idx = {c: i for i, c in enumerate(classes)}
    boost_idx = {cls_to_idx[c] for c in boost_targets if c in cls_to_idx}

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, y_enc_grp), start=1):
        print(f"\n[{group_name}] Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = y_enc_grp[tr_idx], y_enc_grp[va_idx]

        # Fit the preprocessor on the training part only to avoid leakage.
        prep = clone(preprocessor)
        Xtr = prep.fit_transform(X_tr)
        Xva = prep.transform(X_va)

        # deterministic class-boost weights (no jitter); validation stays unweighted
        w_tr = np.ones_like(y_tr, dtype=float)
        for idx in boost_idx:
            w_tr[y_tr == idx] = base_boost
        w_va = np.ones_like(y_va, dtype=float)

        dtrain = xgb.DMatrix(Xtr, label=y_tr, weight=w_tr)
        dval   = xgb.DMatrix(Xva, label=y_va, weight=w_va)

        # "valid" is listed last in `evals`, so it is the set that drives
        # early stopping.
        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            evals=[(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False
        )
        # best_iteration is 0-based; +1 turns it into a tree count for iteration_range.
        best_round = int(bst.best_iteration + 1)
        fold_best.append(best_round)
        print(f"[{group_name}] Best iteration: {best_round}")

        # OOF store
        oof_proba = bst.predict(dval, iteration_range=(0, best_round))
        oof_group[va_idx] = oof_proba

        # Test preds for this fold (averaged over folds)
        if has_test_rows:
            Xtest_tf = prep.transform(Xtestg)
            dtest = xgb.DMatrix(Xtest_tf)
            test_group_pred += bst.predict(dtest, iteration_range=(0, best_round)) / N_FOLDS

    # OOF summary
    oof_labels = np.argmax(oof_group, axis=1)
    acc_g = accuracy_score(y_enc_grp, oof_labels)
    f1_g = f1_score(y_enc_grp, oof_labels, average="macro")
    print(f"\n[{group_name}] OOF Accuracy: {acc_g:.4f} | Macro F1: {f1_g:.4f}")
    print(f"[{group_name}] Best iterations: {fold_best} | Median: {int(np.median(fold_best))}")

    return oof_group, test_group_pred



In [22]:

# ==============================================
# Build the four groups and train
# ==============================================
def pick_cols(df):
    """Return a copy of *df* limited to, and ordered like, the columns of the global ``X``."""
    # X already has the id column removed, so group frames built from it
    # match the training feature set.
    feature_names = X.columns.tolist()
    return df[feature_names].copy()

# Routing table: group name -> (training-row mask, test-row mask).
groups = {
    "MALE_<24":   (train_male_mask & train_young_mask,  test_male_mask & test_young_mask),
    "MALE_>=24":  (train_male_mask & train_old_mask,    test_male_mask & test_old_mask),
    "FEMALE_<24": (train_female_mask & train_young_mask,test_female_mask & test_young_mask),
    "FEMALE_>=24":(train_female_mask & train_old_mask,  test_female_mask & test_old_mask),
}

# Full-size probability buffers; each group writes only its own rows.
# NOTE(review): rows that belong to no group keep an all-zero row, so a later
# argmax assigns them class index 0.
oof_full = np.zeros((len(X), len(classes)), dtype=np.float32)
test_pred_proba = np.zeros((len(test_features), len(classes)), dtype=np.float32)

for gname, (tr_mask, te_mask) in groups.items():
    # Only the training mask is checked here; an empty test mask is passed
    # through to train_group_and_predict as an empty frame.
    if tr_mask.sum() == 0:
        print(f"[Warn] No training rows for group {gname}; skipping.")
        continue
    Xg = pick_cols(train.loc[tr_mask])
    yg = y_enc[tr_mask]
    Xtg = pick_cols(test_features.loc[te_mask])

    oof_g, test_g = train_group_and_predict(Xg, yg, Xtg, gname)
    # np.where(mask)[0] gives the positional row numbers of the group, in the
    # same order as the rows that were handed to the training function.
    oof_full[np.where(tr_mask)[0]] = oof_g
    test_pred_proba[np.where(te_mask)[0]] = test_g

# ==============================================
# OOF summary (all groups combined)
# ==============================================
oof_labels = np.argmax(oof_full, axis=1)
oof_acc = accuracy_score(y_enc, oof_labels)
oof_f1 = f1_score(y_enc, oof_labels, average="macro")
print("\n========== OVERALL OOF ==========")
print(f"OOF Accuracy: {oof_acc:.4f} | OOF Macro F1: {oof_f1:.4f}")
# The report is best-effort: any failure is printed instead of aborting the cell.
try:
    print("\nOOF Classification Report:\n",
          classification_report(y_enc, oof_labels, target_names=classes, zero_division=0))
except Exception as e:
    print(f"[Info] Could not print classification report: {e}")






[MALE_<24] Fold 1/5
[MALE_<24] Best iteration: 277

[MALE_<24] Fold 2/5
[MALE_<24] Best iteration: 268

[MALE_<24] Fold 3/5
[MALE_<24] Best iteration: 313

[MALE_<24] Fold 4/5
[MALE_<24] Best iteration: 329

[MALE_<24] Fold 5/5
[MALE_<24] Best iteration: 282

[MALE_<24] OOF Accuracy: 0.8692 | Macro F1: 0.7504
[MALE_<24] Best iterations: [277, 268, 313, 329, 282] | Median: 282

[MALE_>=24] Fold 1/5




[MALE_>=24] Best iteration: 261

[MALE_>=24] Fold 2/5
[MALE_>=24] Best iteration: 198

[MALE_>=24] Fold 3/5
[MALE_>=24] Best iteration: 224

[MALE_>=24] Fold 4/5
[MALE_>=24] Best iteration: 238

[MALE_>=24] Fold 5/5
[MALE_>=24] Best iteration: 242

[MALE_>=24] OOF Accuracy: 0.9017 | Macro F1: 0.6018
[MALE_>=24] Best iterations: [261, 198, 224, 238, 242] | Median: 238

[FEMALE_<24] Fold 1/5




[FEMALE_<24] Best iteration: 265

[FEMALE_<24] Fold 2/5
[FEMALE_<24] Best iteration: 391

[FEMALE_<24] Fold 3/5
[FEMALE_<24] Best iteration: 306

[FEMALE_<24] Fold 4/5
[FEMALE_<24] Best iteration: 310

[FEMALE_<24] Fold 5/5
[FEMALE_<24] Best iteration: 299

[FEMALE_<24] OOF Accuracy: 0.9070 | Macro F1: 0.7537
[FEMALE_<24] Best iterations: [265, 391, 306, 310, 299] | Median: 306

[FEMALE_>=24] Fold 1/5
[FEMALE_>=24] Best iteration: 270

[FEMALE_>=24] Fold 2/5
[FEMALE_>=24] Best iteration: 224

[FEMALE_>=24] Fold 3/5
[FEMALE_>=24] Best iteration: 216

[FEMALE_>=24] Fold 4/5
[FEMALE_>=24] Best iteration: 232

[FEMALE_>=24] Fold 5/5
[FEMALE_>=24] Best iteration: 201

[FEMALE_>=24] OOF Accuracy: 0.9288 | Macro F1: 0.6814
[FEMALE_>=24] Best iterations: [270, 224, 216, 232, 201] | Median: 224

OOF Accuracy: 0.8988 | OOF Macro F1: 0.8892

OOF Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.92      0.93      0.93      1870
      

In [23]:
# ==============================================
# Build submission from test_pred_proba
# ==============================================
# Hard class index per test row, mapped back to the original string labels.
test_pred_int = np.argmax(test_pred_proba, axis=1)
test_pred_labels = le.inverse_transform(test_pred_int)

# Work out which sample-submission column is the ID and which is the label:
# for a two-column file, the ID is the column that also exists in test.csv.
ss_cols = list(sample_sub.columns)
ID_HEADER = None
LABEL_HEADER = None
if len(ss_cols) == 2:
    c1, c2 = ss_cols
    if c1 in test.columns and c2 not in test.columns:
        ID_HEADER, LABEL_HEADER = c1, c2
    elif c2 in test.columns and c1 not in test.columns:
        ID_HEADER, LABEL_HEADER = c2, c1
# Fallback: assume the first column is the ID and the second is the label.
if ID_HEADER is None:
    ID_HEADER = ss_cols[0]
    LABEL_HEADER = ss_cols[1]

sub = pd.DataFrame()
# Use the real IDs when test.csv carries them, otherwise a 0..n-1 range.
if ID_HEADER in test.columns:
    sub[ID_HEADER] = test[ID_HEADER].values
else:
    sub[ID_HEADER] = np.arange(len(test_features))
sub[LABEL_HEADER] = test_pred_labels

# Any additional sample-submission column is filled with that column's first
# sample value (or None if the sample file has no rows), then columns are
# put in the sample file's order.
for c in ss_cols:
    if c not in sub.columns:
        sub[c] = sample_sub[c].iloc[0] if len(sample_sub[c]) else None
sub = sub[ss_cols]
sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head(5))



Saved submission.csv
   id       WeightCategory
0   0  Overweight_Level_II
1   1        Normal_Weight
2   2  Insufficient_Weight
3   3     Obesity_Type_III
4   4  Overweight_Level_II


In [24]:
# ==============================================
# Evaluate on Kaggle_test.csv (with ground truth)
# ==============================================
if not os.path.exists(KAGGLE_TEST_PATH):
    print(f"\n[Warn] {KAGGLE_TEST_PATH} not found. Skipping Kaggle_test evaluation.")
else:
    kdf = pd.read_csv(KAGGLE_TEST_PATH)
    if "WeightCategory" not in kdf.columns:
        raise KeyError("Kaggle_test.csv must contain 'WeightCategory'.")

    y_true = kdf["WeightCategory"].copy()
    X_k = kdf.drop(columns=["WeightCategory"], errors="ignore").copy()
    if id_col and id_col in X_k.columns:
        X_k.drop(columns=[id_col], inplace=True)

    # same drops + BMI + Age features
    for c in ["MTRANS", "SMOKE"]:
        if c in X_k.columns:
            X_k.drop(columns=[c], inplace=True)
    X_k = add_bmi(X_k)

    # detect gender/age and add age features in Kaggle set
    gender_col_k = detect_gender_column(X_k)
    if gender_col_k is None:
        raise ValueError("Could not detect a gender column in Kaggle_test.csv")
    age_col_k = detect_age_column(X_k)
    if age_col_k is None:
        raise ValueError("Could not detect an Age column in Kaggle_test.csv")
    X_k = add_age_features(X_k, age_col_k)

    km_k, kf_k = split_by_gender(X_k[gender_col_k])
    ky_k = X_k["AgeGroup"] == "<24"
    ko_k = ~ky_k

    # Reconstruct by routing through the same 4 groups
    kaggle_pred_proba = np.zeros((len(X_k), len(classes)), dtype=np.float32)

    def pick_cols_k(df):
        """Return *df* reshaped to the exact column set and order of the training frame ``X``.

        Training columns absent from *df* are added as all-NaN columns so
        the fitted preprocessor sees every column it expects.
        """
        wanted = X.columns.tolist()
        aligned = df[[col for col in wanted if col in df.columns]].copy()
        for col in wanted:
            if col not in df.columns:
                aligned[col] = np.nan
        # Final selection puts the columns in training order.
        return aligned[wanted]

    # helper to train and infer per group (reusing the same routine)
    def infer_group(train_mask, kaggle_mask, name):
        """Train on one group's training rows and store its Kaggle-set predictions.

        Does nothing (beyond a log line) when either side of the group is
        empty. Otherwise the fold-averaged probabilities are written into the
        enclosing ``kaggle_pred_proba`` at the group's row positions.
        """
        group_is_empty = train_mask.sum() == 0 or kaggle_mask.sum() == 0
        if group_is_empty:
            print(f"[Kaggle] Skip group {name}: train={int(train_mask.sum())}, eval={int(kaggle_mask.sum())}")
            return

        eval_features = pick_cols_k(X_k.loc[kaggle_mask, :])
        # Only the test-side output is needed; the OOF result is discarded.
        _, group_pred = train_group_and_predict(
            X.loc[train_mask, :],
            y_enc[train_mask],
            eval_features,
            f"{name} (Kaggle)",
        )
        target_rows = np.where(kaggle_mask)[0]
        kaggle_pred_proba[target_rows] = group_pred

    infer_group(train_male_mask & train_young_mask, km_k & ky_k, "MALE_<24")
    infer_group(train_male_mask & train_old_mask,   km_k & ko_k, "MALE_>=24")
    infer_group(train_female_mask & train_young_mask, kf_k & ky_k, "FEMALE_<24")
    infer_group(train_female_mask & train_old_mask,   kf_k & ko_k, "FEMALE_>=24")

    kaggle_pred_idx = np.argmax(kaggle_pred_proba, axis=1)
    y_pred = le.inverse_transform(kaggle_pred_idx)

        # -------- Overall accuracy to 5 decimals --------
    overall_acc = accuracy_score(y_true, y_pred)
    print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

    # -------- Detailed analysis --------
    order = [
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III'
    ]

    cm = confusion_matrix(y_true, y_pred, labels=order)
    cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-12)

    print("\n=== Confusion Matrix (counts) ===")
    print("Predicted →")
    print("True ↓")
    for i, true_class in enumerate(order):
        row = " | ".join(f"{cm[i, j]:4d}" for j in range(len(order)))
        print(f"{true_class:<22}: {row}")

    print("\n=== Confusion Matrix (row-normalized) ===")
    for i, true_class in enumerate(order):
        row = " | ".join(f"{cm_norm[i, j]:.2f}" for j in range(len(order)))
        print(f"{true_class:<22}: {row}")

    print("\n=== Per-class metrics ===")
    try:
        print(classification_report(y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0))
    except Exception as e:
        print(f"[Info] classification_report fallback: {e}")
        print(classification_report(y_true, y_pred, digits=4, zero_division=0))

    print("\n=== Per-class accuracy (diagonal/row total) ===")
    for i, c in enumerate(order):
        total = cm[i].sum()
        correct = cm[i, i]
        acc = correct / total if total > 0 else 0.0
        print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

    print("\n=== Most common confusions (true → predicted) ===")
    pairs = []
    for i, t in enumerate(order):
        for j, p in enumerate(order):
            if i == j or cm[i, j] == 0:
                continue
            pairs.append((cm[i, j], t, p, cm_norm[i, j]))
    pairs = sorted(pairs, key=lambda x: (-x[0], -x[3]))
    for cnt, true_label, pred_label, norm_val in pairs[:10]:
        print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")
        



[MALE_<24 (Kaggle)] Fold 1/5




[MALE_<24 (Kaggle)] Best iteration: 277

[MALE_<24 (Kaggle)] Fold 2/5
[MALE_<24 (Kaggle)] Best iteration: 268

[MALE_<24 (Kaggle)] Fold 3/5
[MALE_<24 (Kaggle)] Best iteration: 313

[MALE_<24 (Kaggle)] Fold 4/5
[MALE_<24 (Kaggle)] Best iteration: 329

[MALE_<24 (Kaggle)] Fold 5/5
[MALE_<24 (Kaggle)] Best iteration: 282

[MALE_<24 (Kaggle)] OOF Accuracy: 0.8692 | Macro F1: 0.7504
[MALE_<24 (Kaggle)] Best iterations: [277, 268, 313, 329, 282] | Median: 282

[MALE_>=24 (Kaggle)] Fold 1/5




[MALE_>=24 (Kaggle)] Best iteration: 261

[MALE_>=24 (Kaggle)] Fold 2/5
[MALE_>=24 (Kaggle)] Best iteration: 198

[MALE_>=24 (Kaggle)] Fold 3/5
[MALE_>=24 (Kaggle)] Best iteration: 224

[MALE_>=24 (Kaggle)] Fold 4/5
[MALE_>=24 (Kaggle)] Best iteration: 238

[MALE_>=24 (Kaggle)] Fold 5/5
[MALE_>=24 (Kaggle)] Best iteration: 242

[MALE_>=24 (Kaggle)] OOF Accuracy: 0.9017 | Macro F1: 0.6018
[MALE_>=24 (Kaggle)] Best iterations: [261, 198, 224, 238, 242] | Median: 238

[FEMALE_<24 (Kaggle)] Fold 1/5




[FEMALE_<24 (Kaggle)] Best iteration: 265

[FEMALE_<24 (Kaggle)] Fold 2/5
[FEMALE_<24 (Kaggle)] Best iteration: 391

[FEMALE_<24 (Kaggle)] Fold 3/5
[FEMALE_<24 (Kaggle)] Best iteration: 306

[FEMALE_<24 (Kaggle)] Fold 4/5
[FEMALE_<24 (Kaggle)] Best iteration: 310

[FEMALE_<24 (Kaggle)] Fold 5/5
[FEMALE_<24 (Kaggle)] Best iteration: 299

[FEMALE_<24 (Kaggle)] OOF Accuracy: 0.9070 | Macro F1: 0.7537
[FEMALE_<24 (Kaggle)] Best iterations: [265, 391, 306, 310, 299] | Median: 306

[FEMALE_>=24 (Kaggle)] Fold 1/5
[FEMALE_>=24 (Kaggle)] Best iteration: 270

[FEMALE_>=24 (Kaggle)] Fold 2/5
[FEMALE_>=24 (Kaggle)] Best iteration: 224

[FEMALE_>=24 (Kaggle)] Fold 3/5
[FEMALE_>=24 (Kaggle)] Best iteration: 216

[FEMALE_>=24 (Kaggle)] Fold 4/5
[FEMALE_>=24 (Kaggle)] Best iteration: 232

[FEMALE_>=24 (Kaggle)] Fold 5/5
[FEMALE_>=24 (Kaggle)] Best iteration: 201

[FEMALE_>=24 (Kaggle)] OOF Accuracy: 0.9288 | Macro F1: 0.6814
[FEMALE_>=24 (Kaggle)] Best iterations: [270, 224, 216, 232, 201] | Median: 

In [25]:
# ==============================================
# XGB (Stratified CV) with BMI + Gender & Age-group routing (4 models)
# SMOTE (train-fold only) for Overweight I/II + Kaggle_test evaluation
# ==============================================
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.base import clone
import xgboost as xgb

# Try to import SMOTE; fallback gracefully if not available.
# IMBLEARN_OK records the outcome so the training code can skip oversampling
# instead of failing when imbalanced-learn cannot be imported.
try:
    from imblearn.over_sampling import SMOTE
    IMBLEARN_OK = True
except Exception as e:
    print("[Warn] imblearn not available; SMOTE will be disabled:", e)
    IMBLEARN_OK = False

# -------- Paths --------
# All paths are relative to the notebook's working directory.
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
KAGGLE_TEST_PATH = "Kaggle_test.csv"  # must contain WeightCategory

# -------- Globals --------
RANDOM_STATE = 42   # seed for the fold splitter, XGBoost and (offset by fold) SMOTE
N_FOLDS = 5         # number of stratified CV folds per group
N_JOBS = -1         # forwarded to XGBoost as "nthread"

# ---- Emphasize the tricky classes a bit ----
BOOST_CLASSES = ("Overweight_Level_I", "Overweight_Level_II")  # default `boost_targets`
TRAIN_WEIGHT_MULT = 1.75      # 1.5–2.5 is a sensible range; sample weight for the boosted classes on folds without SMOTE
USE_SMOTE = True              # turn on/off globally
NUM_BOOST_ROUND = 20000       # upper bound on boosting rounds; early stopping normally ends training sooner
EARLY_STOP = 200              # rounds without validation improvement before training stops

# ==============================================
# Helpers
# ==============================================
def norm_col(s: str) -> str:
    """Return a canonical form of a column label.

    The label is stringified, stripped of any BOM character (``\\ufeff``),
    trimmed and lower-cased. ``None`` is passed through unchanged.
    """
    if s is not None:
        without_bom = str(s).replace("\ufeff", "")
        return without_bom.strip().lower()
    return s

def infer_feature_types(df):
    """Split the columns of ``df`` by dtype.

    Returns ``(num_cols, cat_cols)``: the names of the numeric columns and the
    names of the object / category / bool columns, each in frame order.
    """
    categorical_kinds = ["object", "category", "bool"]
    numeric_part = df.select_dtypes(include=[np.number])
    categorical_part = df.select_dtypes(include=categorical_kinds)
    return numeric_part.columns.tolist(), categorical_part.columns.tolist()

def detect_gender_column(df):
    """Find the column of ``df`` that encodes gender.

    A column whose normalised name is ``gender`` or ``sex`` wins outright.
    Otherwise the first column with two or three distinct non-null values,
    at least one starting with "m" and one starting with "f"
    (case-insensitive), is returned. Returns ``None`` when nothing matches.
    """
    known_names = {"gender", "sex"}
    named = next((col for col in df.columns if norm_col(col) in known_names), None)
    if named is not None:
        return named

    # Fallback: look for a column whose values look like M/F codes.
    for col in df.columns:
        tokens = df[col].dropna().astype(str).str.lower().str.strip().unique()
        if len(tokens) not in (2, 3):
            continue
        has_m = any(tok.startswith("m") for tok in tokens)
        has_f = any(tok.startswith("f") for tok in tokens)
        if has_m and has_f:
            return col
    return None

def detect_age_column(df):
    """Find the column of ``df`` that holds age in years.

    A column whose normalised name is ``age``, ``years`` or ``age_years`` wins
    outright. Otherwise the first numeric (non-boolean) column whose 1st
    percentile lies in [5, 60] AND whose 99th percentile lies in [5, 100] is
    returned. Returns ``None`` when nothing matches.
    """
    for c in df.columns:
        if norm_col(c) in {"age", "years", "age_years"}:
            return c

    # Heuristic: numeric column with a plausible human-age range.
    for c in df.columns:
        col = df[c]
        # Boolean columns count as numeric for pandas but can never be an age.
        if not pd.api.types.is_numeric_dtype(col) or pd.api.types.is_bool_dtype(col):
            continue
        s = pd.to_numeric(col, errors="coerce")
        if s.notna().sum() == 0:
            continue
        q1, q99 = np.nanpercentile(s, [1, 99])
        # Both ends of the distribution must be plausible. Requiring only one
        # of them would accept e.g. a weight-in-kg column (q1 ~ 40, q99 > 100).
        if 5 <= q1 <= 60 and 5 <= q99 <= 100:
            return c
    return None

def split_by_gender(series):
    """Build boolean male / female masks from a gender-like series.

    Values are compared as trimmed, lower-cased strings: prefixes
    "m", "1", "true" mark male rows and "f", "0", "false" mark female rows.
    If no row matches either set, the most frequent value is treated as male
    and the second most frequent as female (when at least two values exist).
    Returns ``(male_mask, female_mask)``.
    """
    male_prefixes = ("m", "1", "true")
    female_prefixes = ("f", "0", "false")

    s = series.astype(str).str.lower().str.strip()
    male_mask = s.str.startswith(male_prefixes)
    female_mask = s.str.startswith(female_prefixes)
    if male_mask.any() or female_mask.any():
        return male_mask, female_mask

    # Nothing recognisable: fall back to the two most common raw values.
    ranked = s.value_counts().index.tolist()
    if len(ranked) >= 2:
        male_mask, female_mask = (s == ranked[0]), (s == ranked[1])
    return male_mask, female_mask

def add_bmi(df):
    """Add a ``BMI`` column (Weight / Height_m^2) to ``df`` in place.

    Height is taken to be in centimetres, and converted to metres, when its
    median exceeds 3. Rows whose height is missing, zero or negative get
    ``NaN``. Frames lacking a ``Weight`` or ``Height`` column are returned
    untouched. Returns the same ``df`` object.
    """
    if "Weight" not in df.columns or "Height" not in df.columns:
        return df

    height = pd.to_numeric(df["Height"], errors="coerce")
    weight = pd.to_numeric(df["Weight"], errors="coerce")
    height_m = np.where(np.nanmedian(height) > 3.0, height / 100.0, height)
    with np.errstate(divide="ignore", invalid="ignore"):
        bmi = weight / (np.power(height_m, 2) + 1e-12)
        bmi = bmi.replace([np.inf, -np.inf], np.nan)
        # The epsilon above keeps the division finite, so a zero height would
        # otherwise produce an absurd BMI (~1e13) rather than a missing value.
        df["BMI"] = np.where(height_m > 0, bmi, np.nan)
    return df

def add_age_features(df, age_col):
    """Add ``AgeRounded`` and ``AgeGroup`` columns to ``df`` in place.

    ``AgeRounded`` is the age rounded to the nearest integer, stored as
    float32. ``AgeGroup`` is "<24" for ages below 24 and ">=24" otherwise
    (unparseable / missing ages therefore land in ">=24").

    Raises ``ValueError`` when ``age_col`` is ``None`` or not a column of ``df``.
    Returns the same ``df`` object.
    """
    column_present = age_col is not None and age_col in df.columns
    if not column_present:
        raise ValueError("Age column not found; cannot create AgeGroup split.")

    years = pd.to_numeric(df[age_col], errors="coerce")
    is_young = years < 24
    df["AgeRounded"] = np.rint(years).astype("float32")
    df["AgeGroup"] = np.where(is_young, "<24", ">=24")
    return df


In [26]:

# ==============================================
# Load data
# ==============================================
# Read the three competition files from the working directory.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# Optional drops: remove MTRANS and SMOKE from whichever frame contains them.
for c in ["MTRANS", "SMOKE"]:
    if c in train.columns:
        train.drop(columns=[c], inplace=True)
    if c in test.columns:
        test.drop(columns=[c], inplace=True)

# Feature engineering: add_bmi writes a BMI column into the frame it is given
# (when Weight and Height exist) and returns that same frame.
train = add_bmi(train)
test = add_bmi(test)

# Detect ID/Target
# The id column is the first candidate name present in BOTH train and test.
id_col = None
for cand in ["id", "row_id", "index", "sample_id"]:
    if cand in train.columns and cand in test.columns:
        id_col = cand
        break

# The target is the first candidate name present in train; none found is fatal.
target_col = None
for cand in ["WeightCategory", "NObeyesdad", "label", "target", "class", "y"]:
    if cand in train.columns:
        target_col = cand
        break
if target_col is None:
    raise ValueError("Could not detect target column in train.csv")

# Build X/y
# This first X is only used for gender/age column detection below; it is
# rebuilt once the age features have been added to `train`.
y = train[target_col].copy()
X = train.drop(columns=[target_col]).copy()
if id_col and id_col in X.columns:
    X.drop(columns=[id_col], inplace=True)

# Test features without the id column; the ids are kept separately
# (a positional range is used when there is no id column).
test_features = test.copy()
if id_col and id_col in test_features.columns:
    test_ids = test_features[id_col].copy()
    test_features.drop(columns=[id_col], inplace=True)
else:
    test_ids = pd.Series(np.arange(len(test_features)), name="id")

# Label encode target
# `classes` fixes the column order of every probability matrix built later:
# predictions are decoded with le.inverse_transform(argmax).
le = LabelEncoder()
y_enc = le.fit_transform(y)
classes = list(le.classes_)
print(f"[Info] Classes: {classes}")

# Detect gender + age and create age groups
# Detection runs on train and test features stacked together.
gender_col = detect_gender_column(pd.concat([X, test_features], axis=0))
if gender_col is None:
    raise ValueError("Could not detect a gender column (e.g., 'Gender'/'SEX').")

age_col = detect_age_column(pd.concat([X, test_features], axis=0))
if age_col is None:
    raise ValueError("Could not detect an Age column.")

# Add AgeRounded & AgeGroup in BOTH train/test
train = add_age_features(train, age_col)
test_features = add_age_features(test_features, age_col)

# Rebuild X (because train got new columns)
X = train.drop(columns=[target_col]).copy()
if id_col and id_col in X.columns:
    X.drop(columns=[id_col], inplace=True)

# Split gender masks
# Boolean Series aligned with the rows of train / test_features. A row that
# matches neither mask belongs to none of the four groups.
train_male_mask, train_female_mask = split_by_gender(train[gender_col])
test_male_mask, test_female_mask = split_by_gender(test_features[gender_col])

# Split age-group masks
# "old" is simply the complement of "young", so every row has an age group.
train_young_mask = train["AgeGroup"] == "<24"
train_old_mask = ~train_young_mask
test_young_mask = test_features["AgeGroup"] == "<24"
test_old_mask = ~test_young_mask

# Report the size of each gender x age-group cell for train and test.
print(f"[Info] Train M<24={int((train_male_mask & train_young_mask).sum())}, "
      f"M>=24={int((train_male_mask & train_old_mask).sum())}, "
      f"F<24={int((train_female_mask & train_young_mask).sum())}, "
      f"F>=24={int((train_female_mask & train_old_mask).sum())}")

print(f"[Info] Test  M<24={int((test_male_mask & test_young_mask).sum())}, "
      f"M>=24={int((test_male_mask & test_old_mask).sum())}, "
      f"F<24={int((test_female_mask & test_young_mask).sum())}, "
      f"F>=24={int((test_female_mask & test_old_mask).sum())}")


[Info] Classes: ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II']
[Info] Train M<24=4649, M>=24=3134, F<24=4786, F>=24=2964
[Info] Test  M<24=6130, M>=24=4206, F<24=6420, F>=24=4002


In [29]:

# ==============================================
# Training function (group-specific)
# Stratified 5-fold + optional SMOTE for Overweight I/II
# ==============================================
def train_group_and_predict(X_grp, y_enc_grp, test_grp, group_name,
                            boost_targets=BOOST_CLASSES, base_boost=TRAIN_WEIGHT_MULT):
    """Cross-validate XGBoost on one routing group and predict its evaluation rows.

    For each of ``N_FOLDS`` stratified folds the preprocessor is fitted on the
    training split, the boosted classes are either oversampled with SMOTE or
    given a larger sample weight, and a booster is trained with early stopping
    on the validation split. Evaluation predictions are averaged over folds.

    Parameters
    ----------
    X_grp : DataFrame
        Training features of the group. The global ``gender_col`` is dropped.
    y_enc_grp : ndarray of int
        Encoded labels for ``X_grp`` (indices into the global ``classes``).
    test_grp : DataFrame
        Rows to predict; must contain the columns of ``X_grp``. May be empty.
    group_name : str
        Label used as a prefix in progress messages.
    boost_targets : sequence of str
        Class names to emphasise. SMOTE is attempted only when every name is a
        known class; names that are not in ``classes`` are ignored for weighting.
    base_boost : float
        Sample weight given to the boosted classes on folds without SMOTE.

    Returns
    -------
    (oof_group, test_group_pred)
        Out-of-fold probabilities of shape ``(len(X_grp), len(classes))`` and
        fold-averaged probabilities of shape ``(len(test_grp), len(classes))``.
    """
    # Drop gender (constant in a gender-split); keep AgeGroup/rounded as features
    cols_to_use = [c for c in X_grp.columns if c != gender_col]
    Xg = X_grp[cols_to_use].copy()
    Xtestg = test_grp[cols_to_use].copy()
    # scikit-learn's input validation rejects 0-row arrays, so an empty
    # evaluation slice skips transform/predict and returns an empty matrix.
    has_test = len(Xtestg) > 0

    num_cols, cat_cols = infer_feature_types(Xg)

    # Preprocessor -> DENSE (SMOTE needs dense)
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True))  # dense, so with_mean=True is fine
    ])
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)  # force dense
    except TypeError:
        # Older scikit-learn releases spell the argument `sparse`.
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe)
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=0.0   # <- ensure dense output
    )

    # XGB params
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": len(classes),
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "max_depth": 6,
        "min_child_weight": 2,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "lambda": 1.0,
        "alpha": 0.0,
        "eta": 0.03,
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
    }

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof_group = np.zeros((len(Xg), len(classes)), dtype=np.float32)
    test_group_pred = np.zeros((len(Xtestg), len(classes)), dtype=np.float32)
    fold_best = []

    # Encoded indices of the classes to emphasise, taken from the arguments
    # (previously the two class names and the weight were hard-coded here).
    boost_targets = tuple(boost_targets or ())
    cls_to_idx = {c: i for i, c in enumerate(classes)}
    boost_idx = [cls_to_idx[c] for c in boost_targets if c in cls_to_idx]
    smote_targets_ok = bool(boost_idx) and len(boost_idx) == len(boost_targets)
    boost_label = ", ".join(str(c) for c in boost_targets)

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, y_enc_grp), start=1):
        print(f"\n[{group_name}] Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = y_enc_grp[tr_idx], y_enc_grp[va_idx]

        # Fit the preprocessing on the training split only.
        prep = clone(preprocessor)
        Xtr = prep.fit_transform(X_tr)  # dense
        Xva = prep.transform(X_va)      # dense
        Xte = prep.transform(Xtestg) if has_test else None  # dense

        # --------- SMOTE (train split only) focusing on the boosted classes ---------
        used_smote = False
        if USE_SMOTE and IMBLEARN_OK and smote_targets_ok:
            counts = np.bincount(y_tr, minlength=len(classes))
            target_counts = counts[boost_idx]
            # need at least 2 samples per class for SMOTE kNN
            if np.all(target_counts >= 2):
                max_target = int(np.max(counts))  # upsample the targets to max class size
                # pick k_neighbors safely (must be < minority count)
                k_safe = max(1, int(min(5, target_counts.min() - 1)))
                # Only resample when every target is strictly below the largest class.
                if np.all(target_counts < max_target):
                    sampling_strategy = {i: max_target for i in boost_idx}
                    smote = SMOTE(
                        random_state=RANDOM_STATE + fold,
                        sampling_strategy=sampling_strategy,
                        k_neighbors=k_safe
                    )
                    Xtr, y_tr = smote.fit_resample(Xtr, y_tr)
                    used_smote = True
                    print(f"[{group_name}] SMOTE applied (k={k_safe}) to {boost_label} -> size {Xtr.shape}")

        # --------- Sample weights (skip extra boosts if SMOTE already balanced) ---------
        w_tr = np.ones_like(y_tr, dtype=float)
        if not used_smote:
            # light deterministic boost (kept small to avoid double counting)
            for i in boost_idx:
                w_tr[y_tr == i] = base_boost

        w_va = np.ones_like(y_va, dtype=float)

        dtrain = xgb.DMatrix(Xtr, label=y_tr, weight=w_tr)
        dval   = xgb.DMatrix(Xva, label=y_va, weight=w_va)

        # Early stopping monitors the last entry of `evals`, i.e. "valid".
        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            evals=[(dtrain, "train"), (dval, "valid")],
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False
        )
        best_round = int(bst.best_iteration + 1)
        fold_best.append(best_round)
        print(f"[{group_name}] Best iteration: {best_round}")

        # OOF
        oof_proba = bst.predict(dval, iteration_range=(0, best_round))
        oof_group[va_idx] = oof_proba

        # Test preds (fold-avg)
        if has_test:
            dtest = xgb.DMatrix(Xte)
            test_group_pred += bst.predict(dtest, iteration_range=(0, best_round)) / N_FOLDS

    # OOF summary
    oof_labels = np.argmax(oof_group, axis=1)
    acc_g = accuracy_score(y_enc_grp, oof_labels)
    f1_g = f1_score(y_enc_grp, oof_labels, average="macro")
    print(f"\n[{group_name}] OOF Accuracy: {acc_g:.4f} | Macro F1: {f1_g:.4f}")
    print(f"[{group_name}] Best iterations: {fold_best} | Median: {int(np.median(fold_best))}")

    return oof_group, test_group_pred



In [30]:

# ==============================================
# Build the four groups and train
# ==============================================
def pick_cols(df):
    """Return a copy of ``df`` limited to the training feature columns of ``X``, in ``X``'s order."""
    feature_names = X.columns.tolist()
    selected = df[feature_names]
    return selected.copy()

# Routing table: group name -> (train row mask, test row mask), one entry per
# gender x age-group cell, in the order MALE_<24, MALE_>=24, FEMALE_<24, FEMALE_>=24.
groups = {
    f"{sex}_{band}": (tr_sex & tr_age, te_sex & te_age)
    for sex, tr_sex, te_sex in (
        ("MALE", train_male_mask, test_male_mask),
        ("FEMALE", train_female_mask, test_female_mask),
    )
    for band, tr_age, te_age in (
        ("<24", train_young_mask, test_young_mask),
        (">=24", train_old_mask, test_old_mask),
    )
}

# Probability matrices for all rows; rows never claimed by a group stay zero.
oof_full = np.zeros(shape=(len(X), len(classes)), dtype=np.float32)
test_pred_proba = np.zeros(shape=(len(test_features), len(classes)), dtype=np.float32)

for gname, (tr_mask, te_mask) in groups.items():
    if not tr_mask.any():
        print(f"[Warn] No training rows for group {gname}; skipping.")
        continue
    Xg, yg = pick_cols(train.loc[tr_mask]), y_enc[tr_mask]
    Xtg = pick_cols(test_features.loc[te_mask])

    oof_g, test_g = train_group_and_predict(Xg, yg, Xtg, gname)
    # Scatter the group's rows back to their positions in the full matrices.
    oof_full[np.where(tr_mask)[0]] = oof_g
    test_pred_proba[np.where(te_mask)[0]] = test_g

# ==============================================
# OOF summary (all groups combined)
# ==============================================
oof_labels = oof_full.argmax(axis=1)
oof_acc = accuracy_score(y_enc, oof_labels)
oof_f1 = f1_score(y_enc, oof_labels, average="macro")
print("\n========== OVERALL OOF ==========")
print(f"OOF Accuracy: {oof_acc:.4f} | OOF Macro F1: {oof_f1:.4f}")
try:
    print(
        "\nOOF Classification Report:\n",
        classification_report(y_enc, oof_labels, target_names=classes, zero_division=0),
    )
except Exception as e:
    # The report is informational only; never let it abort the run.
    print(f"[Info] Could not print classification report: {e}")





[MALE_<24] Fold 1/5
[MALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (3987, 24)
[MALE_<24] Best iteration: 287

[MALE_<24] Fold 2/5
[MALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (3990, 24)
[MALE_<24] Best iteration: 268

[MALE_<24] Fold 3/5
[MALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (3989, 24)
[MALE_<24] Best iteration: 301

[MALE_<24] Fold 4/5
[MALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (3989, 24)
[MALE_<24] Best iteration: 321

[MALE_<24] Fold 5/5
[MALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (3989, 24)
[MALE_<24] Best iteration: 254

[MALE_<24] OOF Accuracy: 0.8684 | Macro F1: 0.7494
[MALE_<24] Best iterations: [287, 268, 301, 321, 254] | Median: 287

[MALE_>=24] Fold 1/5
[MALE_>=24] SMOTE applied (k=5) to Overweight I/II -> size (5051, 24)




[MALE_>=24] Best iteration: 290

[MALE_>=24] Fold 2/5
[MALE_>=24] SMOTE applied (k=5) to Overweight I/II -> size (5051, 24)
[MALE_>=24] Best iteration: 224

[MALE_>=24] Fold 3/5
[MALE_>=24] SMOTE applied (k=5) to Overweight I/II -> size (5049, 24)
[MALE_>=24] Best iteration: 228

[MALE_>=24] Fold 4/5
[MALE_>=24] SMOTE applied (k=5) to Overweight I/II -> size (5050, 24)
[MALE_>=24] Best iteration: 216

[MALE_>=24] Fold 5/5
[MALE_>=24] SMOTE applied (k=5) to Overweight I/II -> size (5051, 24)
[MALE_>=24] Best iteration: 245

[MALE_>=24] OOF Accuracy: 0.8928 | Macro F1: 0.5900
[MALE_>=24] Best iterations: [290, 224, 228, 216, 245] | Median: 228

[FEMALE_<24] Fold 1/5
[FEMALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (5007, 24)




[FEMALE_<24] Best iteration: 308

[FEMALE_<24] Fold 2/5
[FEMALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (5008, 24)
[FEMALE_<24] Best iteration: 366

[FEMALE_<24] Fold 3/5
[FEMALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (5007, 24)
[FEMALE_<24] Best iteration: 300

[FEMALE_<24] Fold 4/5
[FEMALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (5009, 24)
[FEMALE_<24] Best iteration: 315

[FEMALE_<24] Fold 5/5
[FEMALE_<24] SMOTE applied (k=5) to Overweight I/II -> size (5009, 24)
[FEMALE_<24] Best iteration: 297

[FEMALE_<24] OOF Accuracy: 0.9056 | Macro F1: 0.7536
[FEMALE_<24] Best iterations: [308, 366, 300, 315, 297] | Median: 308

[FEMALE_>=24] Fold 1/5
[FEMALE_>=24] SMOTE applied (k=5) to Overweight I/II -> size (5059, 24)
[FEMALE_>=24] Best iteration: 344

[FEMALE_>=24] Fold 2/5
[FEMALE_>=24] SMOTE applied (k=5) to Overweight I/II -> size (5058, 24)
[FEMALE_>=24] Best iteration: 223

[FEMALE_>=24] Fold 3/5
[FEMALE_>=24] SMOTE applied (k=5) to Overweight I/I

In [31]:

# ==============================================
# Build submission
# ==============================================
# Decode the most probable class of every test row back to its label.
test_pred_int = test_pred_proba.argmax(axis=1)
test_pred_labels = le.inverse_transform(test_pred_int)

# Work out which sample-submission column is the id and which is the label:
# with exactly two columns, the one that also exists in `test` is the id.
ss_cols = sample_sub.columns.tolist()
ID_HEADER = LABEL_HEADER = None
if len(ss_cols) == 2:
    c1, c2 = ss_cols
    if (c1 in test.columns) and not (c2 in test.columns):
        ID_HEADER, LABEL_HEADER = c1, c2
    elif (c2 in test.columns) and not (c1 in test.columns):
        ID_HEADER, LABEL_HEADER = c2, c1
if ID_HEADER is None:
    # Ambiguous layout: assume (id, label) column order.
    ID_HEADER = ss_cols[0]
    LABEL_HEADER = ss_cols[1]

sub = pd.DataFrame()
# Use the real ids when available, otherwise a positional range.
sub[ID_HEADER] = test[ID_HEADER].values if ID_HEADER in test.columns else np.arange(len(test_features))
sub[LABEL_HEADER] = test_pred_labels

# Any further sample-submission column is filled with that file's first value.
for c in ss_cols:
    if c in sub.columns:
        continue
    sub[c] = sample_sub[c].iloc[0] if len(sample_sub[c]) else None
sub = sub[ss_cols]
sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head(5))



Saved submission.csv
   id       WeightCategory
0   0  Overweight_Level_II
1   1        Normal_Weight
2   2  Insufficient_Weight
3   3     Obesity_Type_III
4   4  Overweight_Level_II


In [32]:
# ==============================================
# Evaluate on Kaggle_test.csv (with ground truth)
# ==============================================
# Score the pipeline on a labelled hold-out file, if it is present.
# The hold-out rows are prepared like the training data, routed through the
# same four gender x age groups, and a detailed report is printed.
if not os.path.exists(KAGGLE_TEST_PATH):
    print(f"\n[Warn] {KAGGLE_TEST_PATH} not found. Skipping Kaggle_test evaluation.")
else:
    kdf = pd.read_csv(KAGGLE_TEST_PATH)
    if "WeightCategory" not in kdf.columns:
        raise KeyError("Kaggle_test.csv must contain 'WeightCategory'.")

    # Separate ground truth from features and drop the id column if there is one.
    y_true = kdf["WeightCategory"].copy()
    X_k = kdf.drop(columns=["WeightCategory"], errors="ignore").copy()
    if id_col and id_col in X_k.columns:
        X_k.drop(columns=[id_col], inplace=True)

    # Same column drops and BMI feature as applied to train/test.
    for c in ["MTRANS", "SMOKE"]:
        if c in X_k.columns:
            X_k.drop(columns=[c], inplace=True)
    X_k = add_bmi(X_k)

    # Gender and age columns are detected again on this file, then the age
    # features (AgeRounded, AgeGroup) are added.
    gender_col_k = detect_gender_column(X_k)
    if gender_col_k is None:
        raise ValueError("Could not detect a gender column in Kaggle_test.csv")
    age_col_k = detect_age_column(X_k)
    if age_col_k is None:
        raise ValueError("Could not detect an Age column in Kaggle_test.csv")
    X_k = add_age_features(X_k, age_col_k)

    # Routing masks for the hold-out rows (male/female, young/old).
    km_k, kf_k = split_by_gender(X_k[gender_col_k])
    ky_k = X_k["AgeGroup"] == "<24"
    ko_k = ~ky_k

    # One probability row per hold-out row; rows that no group claims stay
    # all-zero and therefore decode to classes[0] in the argmax below.
    kaggle_pred_proba = np.zeros((len(X_k), len(classes)), dtype=np.float32)

    def pick_cols_k(df):
        """Return ``df`` with exactly the training columns of ``X``, in that order.

        Training columns absent from ``df`` are added as all-NaN columns.
        """
        use_cols = [c for c in X.columns.tolist() if c in df.columns]
        missing = [c for c in X.columns.tolist() if c not in df.columns]
        tmp = df[use_cols].copy()
        for m in missing:
            tmp[m] = np.nan
        return tmp[X.columns.tolist()]

    def infer_group(train_mask, kaggle_mask, name):
        """Train on one group of ``X`` and write predictions for the matching hold-out rows.

        Does nothing (beyond a message) when either side of the group is empty.
        NOTE: this calls train_group_and_predict again, so each group is
        cross-validated from scratch for this evaluation.
        """
        if train_mask.sum() == 0 or kaggle_mask.sum() == 0:
            print(f"[Kaggle] Skip group {name}: train={int(train_mask.sum())}, eval={int(kaggle_mask.sum())}")
            return
        Xg = X.loc[train_mask, :]
        yg = y_enc[train_mask]
        Xkg = pick_cols_k(X_k.loc[kaggle_mask, :])
        oof_g, pred_g = train_group_and_predict(Xg, yg, Xkg, f"{name} (Kaggle)")
        # Scatter the group's predictions back to their row positions.
        kaggle_pred_proba[np.where(kaggle_mask)[0]] = pred_g

    infer_group(train_male_mask & train_young_mask, km_k & ky_k, "MALE_<24")
    infer_group(train_male_mask & train_old_mask,   km_k & ko_k, "MALE_>=24")
    infer_group(train_female_mask & train_young_mask, kf_k & ky_k, "FEMALE_<24")
    infer_group(train_female_mask & train_old_mask,   kf_k & ko_k, "FEMALE_>=24")

    # Decode the most probable class per row back to its label.
    kaggle_pred_idx = np.argmax(kaggle_pred_proba, axis=1)
    y_pred = le.inverse_transform(kaggle_pred_idx)

    overall_acc = accuracy_score(y_true, y_pred)
    print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

    # Display order for the report (differs from the label encoder's order).
    order = [
        'Insufficient_Weight',
        'Normal_Weight',
        'Overweight_Level_I',
        'Overweight_Level_II',
        'Obesity_Type_I',
        'Obesity_Type_II',
        'Obesity_Type_III'
    ]

    # Rows are true classes, columns are predicted classes. The tiny epsilon
    # keeps the row normalisation defined for classes with no true samples.
    cm = confusion_matrix(y_true, y_pred, labels=order)
    cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-12)

    print("\n=== Confusion Matrix (counts) ===")
    print("Predicted →")
    print("True ↓")
    for i, true_class in enumerate(order):
        row = " | ".join(f"{cm[i, j]:4d}" for j in range(len(order)))
        print(f"{true_class:<22}: {row}")

    print("\n=== Confusion Matrix (row-normalized) ===")
    for i, true_class in enumerate(order):
        row = " | ".join(f"{cm_norm[i, j]:.2f}" for j in range(len(order)))
        print(f"{true_class:<22}: {row}")

    print("\n=== Per-class metrics ===")
    try:
        print(classification_report(y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0))
    except Exception as e:
        # Fall back to the report over whatever labels are present.
        print(f"[Info] classification_report fallback: {e}")
        print(classification_report(y_true, y_pred, digits=4, zero_division=0))

    print("\n=== Per-class accuracy (diagonal/row total) ===")
    for i, c in enumerate(order):
        total = cm[i].sum()
        correct = cm[i, i]
        acc = correct / total if total > 0 else 0.0
        print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

    print("\n=== Most common confusions (true → predicted) ===")
    # Collect every non-zero off-diagonal cell as (count, true, predicted, row share).
    pairs = []
    for i, t in enumerate(order):
        for j, p in enumerate(order):
            if i == j or cm[i, j] == 0:
                continue
            pairs.append((cm[i, j], t, p, cm_norm[i, j]))
    # Largest count first; ties broken by the larger row share. Show the top ten.
    pairs = sorted(pairs, key=lambda x: (-x[0], -x[3]))
    for cnt, true_label, pred_label, norm_val in pairs[:10]:
        print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")


[MALE_<24 (Kaggle)] Fold 1/5
[MALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (3987, 24)




[MALE_<24 (Kaggle)] Best iteration: 287

[MALE_<24 (Kaggle)] Fold 2/5
[MALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (3990, 24)
[MALE_<24 (Kaggle)] Best iteration: 268

[MALE_<24 (Kaggle)] Fold 3/5
[MALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (3989, 24)
[MALE_<24 (Kaggle)] Best iteration: 301

[MALE_<24 (Kaggle)] Fold 4/5
[MALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (3989, 24)
[MALE_<24 (Kaggle)] Best iteration: 321

[MALE_<24 (Kaggle)] Fold 5/5
[MALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (3989, 24)
[MALE_<24 (Kaggle)] Best iteration: 254

[MALE_<24 (Kaggle)] OOF Accuracy: 0.8684 | Macro F1: 0.7494
[MALE_<24 (Kaggle)] Best iterations: [287, 268, 301, 321, 254] | Median: 287

[MALE_>=24 (Kaggle)] Fold 1/5
[MALE_>=24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5051, 24)




[MALE_>=24 (Kaggle)] Best iteration: 290

[MALE_>=24 (Kaggle)] Fold 2/5
[MALE_>=24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5051, 24)
[MALE_>=24 (Kaggle)] Best iteration: 224

[MALE_>=24 (Kaggle)] Fold 3/5
[MALE_>=24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5049, 24)
[MALE_>=24 (Kaggle)] Best iteration: 228

[MALE_>=24 (Kaggle)] Fold 4/5
[MALE_>=24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5050, 24)
[MALE_>=24 (Kaggle)] Best iteration: 216

[MALE_>=24 (Kaggle)] Fold 5/5
[MALE_>=24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5051, 24)
[MALE_>=24 (Kaggle)] Best iteration: 245

[MALE_>=24 (Kaggle)] OOF Accuracy: 0.8928 | Macro F1: 0.5900
[MALE_>=24 (Kaggle)] Best iterations: [290, 224, 228, 216, 245] | Median: 228

[FEMALE_<24 (Kaggle)] Fold 1/5
[FEMALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5007, 24)




[FEMALE_<24 (Kaggle)] Best iteration: 308

[FEMALE_<24 (Kaggle)] Fold 2/5
[FEMALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5008, 24)
[FEMALE_<24 (Kaggle)] Best iteration: 366

[FEMALE_<24 (Kaggle)] Fold 3/5
[FEMALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5007, 24)
[FEMALE_<24 (Kaggle)] Best iteration: 300

[FEMALE_<24 (Kaggle)] Fold 4/5
[FEMALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5009, 24)
[FEMALE_<24 (Kaggle)] Best iteration: 315

[FEMALE_<24 (Kaggle)] Fold 5/5
[FEMALE_<24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5009, 24)
[FEMALE_<24 (Kaggle)] Best iteration: 297

[FEMALE_<24 (Kaggle)] OOF Accuracy: 0.9056 | Macro F1: 0.7536
[FEMALE_<24 (Kaggle)] Best iterations: [308, 366, 300, 315, 297] | Median: 308

[FEMALE_>=24 (Kaggle)] Fold 1/5
[FEMALE_>=24 (Kaggle)] SMOTE applied (k=5) to Overweight I/II -> size (5059, 24)
[FEMALE_>=24 (Kaggle)] Best iteration: 344

[FEMALE_>=24 (Kaggle)] Fold 2/5
[FEMALE_>=