In [7]:
# ==============================================
# End-to-end (Gender-Specific Models) + BMI feature:
# Load → Detect ID/Target/Gender → Drop MTRANS/SMOKE → +BMI →
# Split by Gender → Per-gender 5-Fold XGB (ES) → Predict → submission.csv
# ==============================================

# -------- Imports --------
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.base import clone

import xgboost as xgb

# -------- Paths --------
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"

RANDOM_STATE = 42
N_FOLDS = 5
N_JOBS = -1
NUM_CLASSES_EXPECTED = 7   # used only for a sanity warning

# -------- Helpers --------
def norm_col(s: str) -> str:
    if s is None: return s
    return str(s).replace("\ufeff", "").strip().lower()

def build_norm_map(cols):
    fwd = {c: norm_col(c) for c in cols}
    rev = {}
    for orig, n in fwd.items():
        if n not in rev:
            rev[n] = orig
    return fwd, rev

def find_id_and_label(sample_sub, train, test):
    ss_fwd, ss_rev = build_norm_map(sample_sub.columns)
    tr_fwd, tr_rev = build_norm_map(train.columns)
    te_fwd, te_rev = build_norm_map(test.columns)

    ss_norm_cols = [ss_fwd[c] for c in sample_sub.columns]
    tr_norm_cols = [tr_fwd[c] for c in train.columns]
    te_norm_cols = [te_fwd[c] for c in test.columns]

    id_norm, label_norm = None, None
    if len(ss_norm_cols) == 2:
        c1, c2 = ss_norm_cols
        if c1 in te_norm_cols and c2 not in te_norm_cols:
            id_norm, label_norm = c1, c2
        elif c2 in te_norm_cols and c1 not in te_norm_cols:
            id_norm, label_norm = c2, c1
        else:
            if c1 in te_norm_cols and c1 in tr_norm_cols:
                id_norm, label_norm = c1, c2
            elif c2 in te_norm_cols and c2 in tr_norm_cols:
                id_norm, label_norm = c2, c1

    if id_norm is None:
        for cand in ["id", "row_id", "index", "sample_id"]:
            if cand in te_norm_cols and cand in tr_norm_cols:
                id_norm = cand
                break

    if label_norm is None:
        candidates = [c for c in ss_norm_cols if c != id_norm]
        if len(candidates) == 1:
            label_norm = candidates[0]

    if label_norm is None:
        for cand in ["label", "target", "class", "y", "weightcategory", "nobeyesdad"]:
            if cand in tr_norm_cols and cand != id_norm:
                label_norm = cand
                break

    if label_norm is None:
        for c in reversed(tr_norm_cols):
            if c != id_norm:
                label_norm = c
                break

    return {
        "id_norm": id_norm,
        "label_norm": label_norm,
        "id_in_train": build_norm_map(train.columns)[1].get(id_norm, None),
        "id_in_test": build_norm_map(test.columns)[1].get(id_norm, None),
        "id_in_sample": build_norm_map(sample_sub.columns)[1].get(id_norm, None),
        "label_in_train": build_norm_map(train.columns)[1].get(label_norm, None),
        "label_in_sample": build_norm_map(sample_sub.columns)[1].get(label_norm, None),
    }

def infer_feature_types(df):
    cat_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    return num_cols, cat_cols

def detect_gender_column(df):
    # try common names
    candidates = [c for c in df.columns if norm_col(c) in {"gender","sex"}]
    if candidates:
        return candidates[0]
    # weak fallback: any column with two unique values that looks like M/F
    for c in df.columns:
        vals = pd.Series(df[c].dropna().astype(str).str.lower().str.strip()).unique()
        if len(vals) in (2, 3):
            if any(v.startswith("m") for v in vals) and any(v.startswith("f") for v in vals):
                return c
    return None

def split_by_gender(series):
    s = series.astype(str).str.lower().str.strip()
    male_mask = s.str.startswith(("m","1","true"))
    female_mask = s.str.startswith(("f","0","false"))
    if male_mask.sum()==0 and female_mask.sum()==0:
        top = s.value_counts().index.tolist()
        if len(top)>=2:
            male_mask = s==top[0]
            female_mask = s==top[1]
    return male_mask, female_mask

def add_bmi(df):
    """Compute BMI = Weight / (Height_m^2) with robust height-unit detection."""
    if ("Weight" in df.columns) and ("Height" in df.columns):
        h = df["Height"].astype(float)
        # If median height > 3 assume cm and convert to meters
        height_m = np.where(h.median() > 3.0, h / 100.0, h)
        with np.errstate(divide="ignore", invalid="ignore"):
            bmi = df["Weight"].astype(float) / (np.power(height_m, 2) + 1e-12)
        df["BMI"] = bmi.replace([np.inf, -np.inf], np.nan)
    return df

# -------- Load data --------
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# Drop not-used columns
for c in ["MTRANS","SMOKE"]:
    if c in train.columns: train.drop(columns=[c], inplace=True)
    if c in test.columns:  test.drop(columns=[c], inplace=True)

# >>> Add BMI feature (train & test) <<<
train = add_bmi(train)
test  = add_bmi(test)

info = find_id_and_label(sample_sub, train, test)

ID_COL_TRAIN   = info["id_in_train"]
ID_COL_TEST    = info["id_in_test"]
ID_COL_SAMPLE  = info["id_in_sample"]
TARGET_COL     = info["label_in_train"]
LABEL_COL_SAMP = info["label_in_sample"]

if TARGET_COL is None:
    raise ValueError("Could not detect the target column. Please ensure sample_submission and train headers align.")
if LABEL_COL_SAMP is None:
    ss_cols = list(sample_sub.columns)
    others = [c for c in ss_cols if c != ID_COL_SAMPLE]
    if len(others)==1:
        LABEL_COL_SAMP = others[0]
    else:
        raise ValueError("Could not detect label header in sample_submission.csv")

print(f"[Detected] Target in train: '{TARGET_COL}', Label in sample_sub: '{LABEL_COL_SAMP}'")
if ID_COL_TRAIN and ID_COL_TEST:
    print(f"[Detected] ID in train: '{ID_COL_TRAIN}', ID in test: '{ID_COL_TEST}'")

# -------- Target / Features --------
y = train[TARGET_COL].copy()
X = train.drop(columns=[TARGET_COL]).copy()
if ID_COL_TRAIN in X.columns:
    X.drop(columns=[ID_COL_TRAIN], inplace=True)

test_features = test.copy()
if ID_COL_TEST in test_features.columns:
    test_ids = test_features[ID_COL_TEST].copy()
    test_features.drop(columns=[ID_COL_TEST], inplace=True)
else:
    test_ids = pd.Series(np.arange(len(test_features)), name="id")

# -------- Label encode target --------
le = LabelEncoder()
y_enc = le.fit_transform(y)
classes = list(le.classes_)
if len(classes) != NUM_CLASSES_EXPECTED:
    print(f"[Warn] Expected {NUM_CLASSES_EXPECTED} classes but found {len(classes)}. Proceeding.")

# -------- Detect gender column and split --------
gender_col = detect_gender_column(pd.concat([X, test_features], axis=0))
if gender_col is None:
    raise ValueError("Could not detect a gender column (e.g., 'Gender' or 'SEX'). Please confirm the column name.")

male_mask, female_mask = split_by_gender(train[gender_col])
test_male_mask, test_female_mask = split_by_gender(test_features[gender_col])

print(f"[Info] Train male rows: {int(male_mask.sum())} | female rows: {int(female_mask.sum())}")
print(f"[Info] Test  male rows: {int(test_male_mask.sum())} | female rows: {int(test_female_mask.sum())}")

from sklearn.metrics import confusion_matrix

def penalized_metric(preds, dtrain):
    labels = dtrain.get_label().astype(int)
    preds = preds.reshape(-1, len(classes))
    preds_class = np.argmax(preds, axis=1)

    cm = confusion_matrix(labels, preds_class, labels=range(len(classes)))

    # find the encoded indices for Overweight_Level_I and II
    i1 = classes.index("Overweight_Level_I")
    i2 = classes.index("Overweight_Level_II")

    penalty = cm[i1, i2] + cm[i2, i1]   # confusions between I and II
    acc = (preds_class == labels).mean()

    # reduce metric slightly based on that penalty
    penalized_acc = acc - 0.5 * (penalty / len(labels))
    return "penalized_acc", penalized_acc


# We drop gender col inside each group (it's constant after split)
def train_group_and_predict(X_grp, y_enc_grp, test_grp, group_name):
    # remove gender from features
    cols_to_use = [c for c in X_grp.columns if c != gender_col]
    Xg = X_grp[cols_to_use].copy()
    Xtestg = test_grp[cols_to_use].copy()

    # feature types
    num_cols, cat_cols = infer_feature_types(Xg)

    # Preprocessor: sparse OHE (works with xgb DMatrix)
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False))
    ])
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)

    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe)
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=1.0
    )

    # XGBoost params
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": len(classes),
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "max_depth": 6,
        "min_child_weight": 2,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "lambda": 1.0,
        "alpha": 0.0,
        "eta": 0.03,
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
    }
    NUM_BOOST_ROUND = 20000
    EARLY_STOP = 200

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof_group = np.zeros((len(Xg), len(classes)), dtype=np.float32)
    test_group_pred = np.zeros((len(Xtestg), len(classes)), dtype=np.float32)
    fold_best = []
    fold_metrics = []

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, y_enc_grp), start=1):
        print(f"\n[{group_name}] Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = y_enc_grp[tr_idx], y_enc_grp[va_idx]

        prep = clone(preprocessor)
        Xtr = prep.fit_transform(X_tr)
        Xva = prep.transform(X_va)

        dtrain = xgb.DMatrix(Xtr, label=y_tr)
        dval   = xgb.DMatrix(Xva, label=y_va)

        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            evals=[(dtrain, "train"), (dval, "valid")],
            feval=penalized_metric,
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False
        )


        # bst = xgb.train(
        #     params=xgb_params,
        #     dtrain=dtrain,
        #     num_boost_round=NUM_BOOST_ROUND,
        #     evals=[(dtrain, "train"), (dval, "valid")],
        #     early_stopping_rounds=EARLY_STOP,
        #     verbose_eval=False
        # )
        best_round = int(bst.best_iteration + 1)
        fold_best.append(best_round)
        print(f"[{group_name}] Best iteration: {best_round}")

        oof_proba = bst.predict(dval, iteration_range=(0, best_round))
        oof_group[va_idx] = oof_proba
        oof_labels = np.argmax(oof_proba, axis=1)
        acc = accuracy_score(y_va, oof_labels)
        f1m = f1_score(y_va, oof_labels, average="macro")
        fold_metrics.append((acc, f1m))
        print(f"[{group_name}] Acc: {acc:.4f} | Macro F1: {f1m:.4f}")

        # test preds for this fold
        Xtest_tf = prep.transform(Xtestg)
        dtest = xgb.DMatrix(Xtest_tf)
        test_group_pred += bst.predict(dtest, iteration_range=(0, best_round)) / N_FOLDS

    # OOF summary for the group
    oof_argmax = np.argmax(oof_group, axis=1)
    acc_g = accuracy_score(y_enc_grp, oof_argmax)
    f1_g = f1_score(y_enc_grp, oof_argmax, average="macro")
    print(f"\n[{group_name}] OOF Accuracy: {acc_g:.4f} | Macro F1: {f1_g:.4f}")
    print(f"[{group_name}] Best iterations: {fold_best} | Median: {int(np.median(fold_best))}")

    return oof_group, test_group_pred

# -------- Run male model --------
X_male = X[male_mask].reset_index(drop=True)
y_male_enc = y_enc[male_mask]
test_male = test_features[test_male_mask].reset_index(drop=True)

male_oof, male_test_pred = train_group_and_predict(X_male, y_male_enc, test_male, "MALE")

# -------- Run female model --------
X_female = X[female_mask].reset_index(drop=True)
y_female_enc = y_enc[female_mask]
test_female = test_features[test_female_mask].reset_index(drop=True)

female_oof, female_test_pred = train_group_and_predict(X_female, y_female_enc, test_female, "FEMALE")

# -------- Combine OOF for overall report --------
oof_full = np.zeros((len(X), len(classes)), dtype=np.float32)
oof_full[male_mask.values] = male_oof
oof_full[female_mask.values] = female_oof

oof_labels = np.argmax(oof_full, axis=1)
oof_acc = accuracy_score(y_enc, oof_labels)
oof_f1 = f1_score(y_enc, oof_labels, average="macro")
print("\n========== OVERALL OOF ==========")
print(f"OOF Accuracy: {oof_acc:.4f} | OOF Macro F1: {oof_f1:.4f}")
try:
    print("\nOOF Classification Report:\n",
          classification_report(y_enc, oof_labels, target_names=classes))
except Exception as e:
    print(f"[Info] Could not print classification report: {e}")

# -------- Build full test predictions by placing group preds back to original order --------
test_pred_proba = np.zeros((len(test_features), len(classes)), dtype=np.float32)
test_pred_proba[test_male_mask.values] = male_test_pred
test_pred_proba[test_female_mask.values] = female_test_pred

test_pred_int = np.argmax(test_pred_proba, axis=1)
test_pred_labels = le.inverse_transform(test_pred_int)

# -------- Build submission --------
ss_cols = list(sample_sub.columns)
ID_HEADER = ID_COL_SAMPLE if ID_COL_SAMPLE in sample_sub.columns else None
LABEL_HEADER = LABEL_COL_SAMP

sub = pd.DataFrame()
if ID_HEADER is not None and ID_COL_TEST in test.columns:
    sub[ID_HEADER] = test[ID_COL_TEST].values
elif ID_HEADER is not None:
    sub[ID_HEADER] = np.arange(len(test_features))
sub[LABEL_HEADER] = test_pred_labels

# Reorder/complete to match sample_sub exactly
for c in ss_cols:
    if c not in sub.columns:
        sub[c] = sample_sub[c].iloc[0] if len(sample_sub[c]) else None
sub = sub[ss_cols]

sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head(10))


[Detected] Target in train: 'WeightCategory', Label in sample_sub: 'WeightCategory'
[Detected] ID in train: 'id', ID in test: 'id'
[Info] Train male rows: 7783 | female rows: 7750
[Info] Test  male rows: 10336 | female rows: 10422

[MALE] Fold 1/5




[MALE] Best iteration: 1
[MALE] Acc: 0.8523 | Macro F1: 0.7166

[MALE] Fold 2/5




[MALE] Best iteration: 1
[MALE] Acc: 0.8696 | Macro F1: 0.7339

[MALE] Fold 3/5




[MALE] Best iteration: 2
[MALE] Acc: 0.8542 | Macro F1: 0.7188

[MALE] Fold 4/5




[MALE] Best iteration: 2
[MALE] Acc: 0.8515 | Macro F1: 0.8338

[MALE] Fold 5/5




[MALE] Best iteration: 1
[MALE] Acc: 0.8445 | Macro F1: 0.7105

[MALE] OOF Accuracy: 0.8544 | Macro F1: 0.7189
[MALE] Best iterations: [1, 1, 2, 2, 1] | Median: 1

[FEMALE] Fold 1/5




[FEMALE] Best iteration: 1
[FEMALE] Acc: 0.8910 | Macro F1: 0.7175

[FEMALE] Fold 2/5




[FEMALE] Best iteration: 1
[FEMALE] Acc: 0.8968 | Macro F1: 0.7176

[FEMALE] Fold 3/5




[FEMALE] Best iteration: 1
[FEMALE] Acc: 0.8871 | Macro F1: 0.7053

[FEMALE] Fold 4/5




[FEMALE] Best iteration: 2
[FEMALE] Acc: 0.8948 | Macro F1: 0.7219

[FEMALE] Fold 5/5




[FEMALE] Best iteration: 1
[FEMALE] Acc: 0.8974 | Macro F1: 0.7215

[FEMALE] OOF Accuracy: 0.8934 | Macro F1: 0.7170
[FEMALE] Best iterations: [1, 1, 1, 2, 1] | Median: 1

OOF Accuracy: 0.8739 | OOF Macro F1: 0.8610

OOF Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.91      0.91      0.91      1870
      Normal_Weight       0.83      0.86      0.85      2345
     Obesity_Type_I       0.86      0.84      0.85      2207
    Obesity_Type_II       0.95      0.96      0.96      2403
   Obesity_Type_III       1.00      1.00      1.00      2983
 Overweight_Level_I       0.73      0.71      0.72      1844
Overweight_Level_II       0.75      0.74      0.75      1881

           accuracy                           0.87     15533
          macro avg       0.86      0.86      0.86     15533
       weighted avg       0.87      0.87      0.87     15533


Saved submission.csv
   id       WeightCategory
0   0  Overweight_Level_II
1   1

In [8]:
# ==============================================
# Evaluate on Kaggle_test.csv (gender-specific models) + Error analysis (text)
# ==============================================
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# ---------- 1) Load Kaggle_test and prep features ----------
kdf = pd.read_csv("Kaggle_test.csv")

# y_true and features
if "WeightCategory" not in kdf.columns:
    raise KeyError("Expected 'WeightCategory' in Kaggle_test.csv")
y_true = kdf["WeightCategory"].copy()

X_k = kdf.drop(columns=["WeightCategory"], errors="ignore").copy()
if "id" in X_k.columns:
    X_k.drop(columns=["id"], inplace=True)

# same drops as training
for c in ["MTRANS", "SMOKE"]:
    if c in X_k.columns:
        X_k.drop(columns=[c], inplace=True)

# same BMI as training (uses your robust height detection)
X_k = add_bmi(X_k)

# detect gender on Kaggle set
gender_col_k = detect_gender_column(X_k)
if gender_col_k is None:
    raise ValueError("Could not detect a gender column in Kaggle_test.csv")
km_k, kf_k = split_by_gender(X_k[gender_col_k])

# ---------- 2) Predict on Kaggle_test by reusing your training function ----------
# Train male model on male subset of TRAIN and predict on male subset of Kaggle
X_male_train = X[male_mask].reset_index(drop=True)
y_male_enc   = y_enc[male_mask]
X_male_k     = X_k[km_k].reset_index(drop=True)

# Train female model on female subset of TRAIN and predict on female subset of Kaggle
X_female_train = X[female_mask].reset_index(drop=True)
y_female_enc   = y_enc[female_mask]
X_female_k     = X_k[kf_k].reset_index(drop=True)

# Containers
kaggle_pred_proba = np.zeros((len(X_k), len(classes)), dtype=np.float32)

if len(X_male_train) > 0 and len(X_male_k) > 0:
    _, male_k_pred = train_group_and_predict(X_male_train, y_male_enc, X_male_k, "MALE (Kaggle)")
    kaggle_pred_proba[km_k.values] = male_k_pred
else:
    print("[Info] Skipping MALE (Kaggle): no rows in train or Kaggle subset.")

if len(X_female_train) > 0 and len(X_female_k) > 0:
    _, female_k_pred = train_group_and_predict(X_female_train, y_female_enc, X_female_k, "FEMALE (Kaggle)")
    kaggle_pred_proba[kf_k.values] = female_k_pred
else:
    print("[Info] Skipping FEMALE (Kaggle): no rows in train or Kaggle subset.")

# Convert probabilities → labels using the same LabelEncoder
kaggle_pred_idx = np.argmax(kaggle_pred_proba, axis=1)
y_pred = le.inverse_transform(kaggle_pred_idx)

# Overall accuracy (5 decimals)
overall_acc = accuracy_score(y_true, y_pred)
print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

# ---------- 3) Error analysis (TEXT OUTPUT; no DataFrames) ----------
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

# Use your exact order even if some classes are absent
cm = confusion_matrix(y_true, y_pred, labels=order)
cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-12)

print("\n=== Confusion Matrix (counts) ===")
print("Predicted →")
print("True ↓")
for i, true_class in enumerate(order):
    row = " | ".join(f"{cm[i, j]:4d}" for j in range(len(order)))
    print(f"{true_class:<22}: {row}")

print("\n=== Confusion Matrix (row-normalized) ===")
for i, true_class in enumerate(order):
    row = " | ".join(f"{cm_norm[i, j]:.2f}" for j in range(len(order)))
    print(f"{true_class:<22}: {row}")

print("\n=== Per-class metrics ===")
report = classification_report(
    y_true,
    y_pred,
    labels=order,
    target_names=order,
    digits=4,
    zero_division=0
)
print(report)

print("\n=== Per-class accuracy (diagonal/row total) ===")
for i, c in enumerate(order):
    total = cm[i].sum()
    correct = cm[i, i]
    acc = correct / total if total > 0 else 0.0
    print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

print("\n=== Most common confusions (true → predicted) ===")
pairs = []
for i, t in enumerate(order):
    for j, p in enumerate(order):
        if i == j or cm[i, j] == 0:
            continue
        pairs.append((cm[i, j], t, p, cm_norm[i, j]))
pairs = sorted(pairs, key=lambda x: (-x[0], -x[3]))
for cnt, true_label, pred_label, norm_val in pairs[:10]:
    print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")

print("\n=== Sample of misclassified rows (first 10) ===")
mis_idx = np.where(y_true.values != y_pred)[0]
if len(mis_idx) == 0:
    print("🎉 No misclassifications!")
else:
    for idx in mis_idx[:10]:
        true_lab = y_true.iloc[idx]
        pred_lab = y_pred[idx]
        conf = kaggle_pred_proba[idx].max()
        # 2nd best
        rank = np.argsort(-kaggle_pred_proba[idx])
        second_idx = rank[1] if len(rank) > 1 else rank[0]
        second_lab = le.inverse_transform([second_idx])[0]
        second_conf = kaggle_pred_proba[idx][second_idx]
        print(f"Row {idx:4d}: true={true_lab:<22} pred={pred_lab:<22} conf={conf:.3f} 2nd={second_lab:<22}({second_conf:.3f})")





[MALE (Kaggle)] Fold 1/5
[MALE (Kaggle)] Best iteration: 1
[MALE (Kaggle)] Acc: 0.8523 | Macro F1: 0.7166

[MALE (Kaggle)] Fold 2/5




[MALE (Kaggle)] Best iteration: 1
[MALE (Kaggle)] Acc: 0.8696 | Macro F1: 0.7339

[MALE (Kaggle)] Fold 3/5




[MALE (Kaggle)] Best iteration: 2
[MALE (Kaggle)] Acc: 0.8542 | Macro F1: 0.7188

[MALE (Kaggle)] Fold 4/5




[MALE (Kaggle)] Best iteration: 2
[MALE (Kaggle)] Acc: 0.8515 | Macro F1: 0.8338

[MALE (Kaggle)] Fold 5/5




[MALE (Kaggle)] Best iteration: 1
[MALE (Kaggle)] Acc: 0.8445 | Macro F1: 0.7105

[MALE (Kaggle)] OOF Accuracy: 0.8544 | Macro F1: 0.7189
[MALE (Kaggle)] Best iterations: [1, 1, 2, 2, 1] | Median: 1

[FEMALE (Kaggle)] Fold 1/5




[FEMALE (Kaggle)] Best iteration: 1
[FEMALE (Kaggle)] Acc: 0.8910 | Macro F1: 0.7175

[FEMALE (Kaggle)] Fold 2/5




[FEMALE (Kaggle)] Best iteration: 1
[FEMALE (Kaggle)] Acc: 0.8968 | Macro F1: 0.7176

[FEMALE (Kaggle)] Fold 3/5




[FEMALE (Kaggle)] Best iteration: 1
[FEMALE (Kaggle)] Acc: 0.8871 | Macro F1: 0.7053

[FEMALE (Kaggle)] Fold 4/5




[FEMALE (Kaggle)] Best iteration: 2
[FEMALE (Kaggle)] Acc: 0.8948 | Macro F1: 0.7219

[FEMALE (Kaggle)] Fold 5/5




[FEMALE (Kaggle)] Best iteration: 1
[FEMALE (Kaggle)] Acc: 0.8974 | Macro F1: 0.7215

[FEMALE (Kaggle)] OOF Accuracy: 0.8934 | Macro F1: 0.7170
[FEMALE (Kaggle)] Best iterations: [1, 1, 1, 2, 1] | Median: 1

✅ Overall Accuracy on Kaggle_test: 0.89321

=== Confusion Matrix (counts) ===
Predicted →
True ↓
Insufficient_Weight   :  603 |   48 |    2 |    0 |    0 |    0 |    0
Normal_Weight         :   44 |  661 |   26 |    5 |    1 |    0 |    0
Overweight_Level_I    :    3 |   77 |  416 |   79 |    8 |    0 |    0
Overweight_Level_II   :    0 |   18 |   79 |  493 |   48 |    3 |    0
Obesity_Type_I        :    1 |    1 |   12 |   49 |  617 |   21 |    2
Obesity_Type_II       :    0 |    1 |    1 |    6 |   21 |  816 |    0
Obesity_Type_III      :    0 |    0 |    1 |    0 |    0 |    1 | 1061

=== Confusion Matrix (row-normalized) ===
Insufficient_Weight   : 0.92 | 0.07 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00
Normal_Weight         : 0.06 | 0.90 | 0.04 | 0.01 | 0.00 | 0.00 | 0.00
Overweight_Le

In [9]:
# ==============================================
# Error analysis (text output, custom order) — robust
# Requires:
#   - kdf (Kaggle_test.csv loaded) with WeightCategory
#   - le (LabelEncoder fitted on train)
#   - y_pred_proba for Kaggle set in `kaggle_pred_proba`
# If you used different names, uncomment the "builder" section below.
# ==============================================
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# ---------- Ensure we have y_true, y_pred, kaggle_pred_proba ----------
# If you already have them, keep as-is; otherwise, build them here:
try:
    _ = y_true, y_pred, kaggle_pred_proba  # will raise if any missing
except NameError:
    # Build from kdf and your model outputs if needed
    # y_true = kdf["WeightCategory"].copy()
    # kaggle_pred_idx = np.argmax(kaggle_pred_proba, axis=1)
    # y_pred = le.inverse_transform(kaggle_pred_idx)
    raise NameError("y_true, y_pred, or kaggle_pred_proba is not defined in this scope. Define them first.")

# ---------- Use your exact order (but warn if labels differ) ----------
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

# Optional: sanity warning if your labels differ
present_labels = sorted(set(list(np.unique(y_true)) + list(np.unique(y_pred))))
missing_in_data = [c for c in order if c not in present_labels]
if len(missing_in_data) == len(order):
    print("[Warn] None of the ordered labels are present in y_true/y_pred. Check label names.")
elif missing_in_data:
    print(f"[Info] These ordered labels have zero support in Kaggle_test: {missing_in_data}")

# ---------- Confusion Matrix ----------
cm = confusion_matrix(y_true, y_pred, labels=order)
cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-12)

print("\n=== Confusion Matrix (counts) ===")
print("Predicted →")
print("True ↓")
for i, true_class in enumerate(order):
    row = " | ".join(f"{cm[i, j]:4d}" for j in range(len(order)))
    print(f"{true_class:<22}: {row}")

print("\n=== Confusion Matrix (row-normalized) ===")
for i, true_class in enumerate(order):
    row = " | ".join(f"{cm_norm[i, j]:.2f}" for j in range(len(order)))
    print(f"{true_class:<22}: {row}")

# ---------- Classification report ----------
print("\n=== Per-class metrics ===")
try:
    report = classification_report(
        y_true,
        y_pred,
        labels=order,
        target_names=order,
        digits=4,
        zero_division=0
    )
    print(report)
except Exception as e:
    print(f"[Info] classification_report failed with your fixed order: {e}")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

# ---------- Per-class accuracy ----------
print("\n=== Per-class accuracy (diagonal/row total) ===")
for i, c in enumerate(order):
    total = cm[i].sum()
    correct = cm[i, i]
    acc = correct / total if total > 0 else 0
    print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

# ---------- Top confusions ----------
print("\n=== Most common confusions (true → predicted) ===")
pairs = []
for i, t in enumerate(order):
    for j, p in enumerate(order):
        if i == j or cm[i, j] == 0:
            continue
        pairs.append((cm[i, j], t, p, cm_norm[i, j]))
pairs = sorted(pairs, key=lambda x: (-x[0], -x[3]))
for cnt, true_label, pred_label, norm_val in pairs[:10]:
    print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")

# ---------- Misclassified sample lines ----------
print("\n=== Sample of misclassified rows (first 10) ===")
mis_idx = np.where(np.asarray(y_true) != np.asarray(y_pred))[0]
if len(mis_idx) == 0:
    print("🎉 No misclassifications!")
else:
    for idx in mis_idx[:10]:
        # Guard in case y_true is a Series vs array:
        true_lab = y_true.iloc[idx] if hasattr(y_true, "iloc") else y_true[idx]
        pred_lab = y_pred[idx]
        conf = float(np.max(kaggle_pred_proba[idx])) if len(kaggle_pred_proba.shape) == 2 else 1.0
        # 2nd best safely
        rank = np.argsort(-kaggle_pred_proba[idx]) if len(kaggle_pred_proba.shape) == 2 else np.array([0, 0])
        second_idx = rank[1] if rank.size > 1 else rank[0]
        second_lab = le.inverse_transform([second_idx])[0] if hasattr(le, "inverse_transform") else str(second_idx)
        second_conf = float(kaggle_pred_proba[idx][second_idx]) if len(kaggle_pred_proba.shape) == 2 else 0.0
        print(f"Row {idx:4d}: true={true_lab:<22} pred={pred_lab:<22} conf={conf:.3f} 2nd={second_lab:<22}({second_conf:.3f})")



=== Confusion Matrix (counts) ===
Predicted →
True ↓
Insufficient_Weight   :  603 |   48 |    2 |    0 |    0 |    0 |    0
Normal_Weight         :   44 |  661 |   26 |    5 |    1 |    0 |    0
Overweight_Level_I    :    3 |   77 |  416 |   79 |    8 |    0 |    0
Overweight_Level_II   :    0 |   18 |   79 |  493 |   48 |    3 |    0
Obesity_Type_I        :    1 |    1 |   12 |   49 |  617 |   21 |    2
Obesity_Type_II       :    0 |    1 |    1 |    6 |   21 |  816 |    0
Obesity_Type_III      :    0 |    0 |    1 |    0 |    0 |    1 | 1061

=== Confusion Matrix (row-normalized) ===
Insufficient_Weight   : 0.92 | 0.07 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00
Normal_Weight         : 0.06 | 0.90 | 0.04 | 0.01 | 0.00 | 0.00 | 0.00
Overweight_Level_I    : 0.01 | 0.13 | 0.71 | 0.14 | 0.01 | 0.00 | 0.00
Overweight_Level_II   : 0.00 | 0.03 | 0.12 | 0.77 | 0.07 | 0.00 | 0.00
Obesity_Type_I        : 0.00 | 0.00 | 0.02 | 0.07 | 0.88 | 0.03 | 0.00
Obesity_Type_II       : 0.00 | 0.00 | 0.00 | 0.01 |

In [5]:
# ==============================================
# Gender-specific XGB + BMI + targeted class boost + Kaggle_test eval
# ==============================================
import numpy as np
import pandas as pd
from collections import Counter

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.base import clone
import xgboost as xgb

# -------- Paths --------
TRAIN_PATH = "train.csv"
TEST_PATH = "test.csv"
SAMPLE_SUB_PATH = "sample_submission.csv"
KAGGLE_TEST_PATH = "Kaggle_test.csv"  # has WeightCategory ground truth

RANDOM_STATE = 42
N_FOLDS = 5
N_JOBS = -1

# -------- Helpers --------
def norm_col(s: str) -> str:
    if s is None: return s
    return str(s).replace("\ufeff", "").strip().lower()

def infer_feature_types(df):
    cat_cols = df.select_dtypes(include=["object", "category", "bool"]).columns.tolist()
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    return num_cols, cat_cols

def detect_gender_column(df):
    # common names
    for c in df.columns:
        if norm_col(c) in {"gender", "sex"}:
            return c
    # fallback: column that looks like M/F
    for c in df.columns:
        vals = pd.Series(df[c].dropna().astype(str).str.lower().str.strip()).unique()
        if len(vals) in (2, 3):
            if any(v.startswith("m") for v in vals) and any(v.startswith("f") for v in vals):
                return c
    return None

def split_by_gender(series):
    s = series.astype(str).str.lower().str.strip()
    male_mask = s.str.startswith(("m","1","true"))
    female_mask = s.str.startswith(("f","0","false"))
    if male_mask.sum()==0 and female_mask.sum()==0:
        top = s.value_counts().index.tolist()
        if len(top)>=2:
            male_mask = s==top[0]
            female_mask = s==top[1]
    return male_mask, female_mask

def add_bmi(df):
    """Compute BMI = Weight / (Height_m^2).
       If median height > 3 assume cm → convert to meters."""
    if ("Weight" in df.columns) and ("Height" in df.columns):
        h = df["Height"].astype(float)
        height_m = np.where(h.median() > 3.0, h / 100.0, h)
        with np.errstate(divide="ignore", invalid="ignore"):
            bmi = df["Weight"].astype(float) / (np.power(height_m, 2) + 1e-12)
        df["BMI"] = pd.Series(bmi).replace([np.inf, -np.inf], np.nan)
    return df

# -------- Load data --------
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# Drop columns we don’t want in this run
for c in ["MTRANS","SMOKE"]:
    if c in train.columns: train.drop(columns=[c], inplace=True)
    if c in test.columns:  test.drop(columns=[c], inplace=True)

# Feature engineering
train = add_bmi(train)
test  = add_bmi(test)

# Detect ID/Target from files (simple logic)
id_col = None
for cand in ["id", "row_id", "index", "sample_id"]:
    if cand in train.columns and cand in test.columns:
        id_col = cand
        break

target_col = None
for cand in ["WeightCategory", "NObeyesdad", "label", "target", "class", "y"]:
    if cand in train.columns:
        target_col = cand
        break
if target_col is None:
    raise ValueError("Could not detect target column in train.csv")

# Build X/y
y = train[target_col].copy()
X = train.drop(columns=[target_col]).copy()
if id_col and id_col in X.columns:
    X.drop(columns=[id_col], inplace=True)

test_features = test.copy()
if id_col and id_col in test_features.columns:
    test_ids = test_features[id_col].copy()
    test_features.drop(columns=[id_col], inplace=True)
else:
    test_ids = pd.Series(np.arange(len(test_features)), name="id")

# Label encode target
le = LabelEncoder()
y_enc = le.fit_transform(y)
classes = list(le.classes_)
print(f"[Info] Classes: {classes}")

# Detect gender and split
gender_col = detect_gender_column(pd.concat([X, test_features], axis=0))
if gender_col is None:
    raise ValueError("Could not detect a gender column (e.g., 'Gender'/'SEX').")
male_mask, female_mask = split_by_gender(train[gender_col])
test_male_mask, test_female_mask = split_by_gender(test_features[gender_col])
print(f"[Info] Train male={int(male_mask.sum())}, female={int(female_mask.sum())}")
print(f"[Info] Test  male={int(test_male_mask.sum())}, female={int(test_female_mask.sum())}")

# -------- Training function (gender-specific) with class boosting for two classes --------
def train_group_and_predict(X_grp, y_enc_grp, test_grp, group_name,
                            boost_targets=("Overweight_Level_I","Overweight_Level_II"),
                            base_boost=1.50, jitter_amp=0.10):
    # Drop gender column inside a group (constant after split)
    cols_to_use = [c for c in X_grp.columns if c != gender_col]
    Xg = X_grp[cols_to_use].copy()
    Xtestg = test_grp[cols_to_use].copy()

    num_cols, cat_cols = infer_feature_types(Xg)

    # Preprocessor
    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=False))
    ])
    try:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=True)
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", ohe)
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ],
        remainder="drop",
        sparse_threshold=1.0
    )

    # XGB params
    xgb_params = {
        "objective": "multi:softprob",
        "num_class": len(classes),
        "eval_metric": "mlogloss",
        "tree_method": "hist",
        "max_depth": 6,
        "min_child_weight": 2,
        "subsample": 0.9,
        "colsample_bytree": 0.9,
        "lambda": 1.0,
        "alpha": 0.0,
        "eta": 0.03,
        "nthread": N_JOBS,
        "seed": RANDOM_STATE,
    }
    NUM_BOOST_ROUND = 20000
    EARLY_STOP = 200

    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)
    oof_group = np.zeros((len(Xg), len(classes)), dtype=np.float32)
    test_group_pred = np.zeros((len(Xtestg), len(classes)), dtype=np.float32)
    fold_best = []

    # map class name -> index
    cls_to_idx = {c: i for i, c in enumerate(classes)}

    for fold, (tr_idx, va_idx) in enumerate(skf.split(Xg, y_enc_grp), start=1):
        print(f"\n[{group_name}] Fold {fold}/{N_FOLDS}")
        X_tr, X_va = Xg.iloc[tr_idx], Xg.iloc[va_idx]
        y_tr, y_va = y_enc_grp[tr_idx], y_enc_grp[va_idx]

        prep = clone(preprocessor)
        Xtr = prep.fit_transform(X_tr)
        Xva = prep.transform(X_va)

        # ---- RANDOM (non-count) WEIGHTS to gently boost two classes ----
        w_tr = np.ones_like(y_tr, dtype=float)
        rng = np.random.default_rng(RANDOM_STATE + fold)  # deterministic per fold
        for t in boost_targets:
            if t in cls_to_idx:
                cls_id = cls_to_idx[t]
                idx_t = np.where(y_tr == cls_id)[0]
                if idx_t.size > 0:
                    jitter = rng.uniform(-jitter_amp, jitter_amp, size=idx_t.size)
                    w_tr[idx_t] = base_boost + jitter
        w_va = np.ones_like(y_va, dtype=float)

        dtrain = xgb.DMatrix(Xtr, label=y_tr, weight=w_tr)
        dval   = xgb.DMatrix(Xva, label=y_va, weight=w_va)

        bst = xgb.train(
            params=xgb_params,
            dtrain=dtrain,
            num_boost_round=NUM_BOOST_ROUND,
            evals=[(dtrain, "train"), (dval, "valid")],
            feval=None,
            early_stopping_rounds=EARLY_STOP,
            verbose_eval=False
        )
        best_round = int(bst.best_iteration + 1)
        fold_best.append(best_round)
        print(f"[{group_name}] Best iteration: {best_round}")

        oof_proba = bst.predict(dval, iteration_range=(0, best_round))
        oof_group[va_idx] = oof_proba

        # test preds for this fold
        Xtest_tf = prep.transform(Xtestg)
        dtest = xgb.DMatrix(Xtest_tf)
        test_group_pred += bst.predict(dtest, iteration_range=(0, best_round)) / N_FOLDS

    # OOF summary for the group
    oof_labels = np.argmax(oof_group, axis=1)
    acc_g = accuracy_score(y_enc_grp, oof_labels)
    f1_g = f1_score(y_enc_grp, oof_labels, average="macro")
    print(f"\n[{group_name}] OOF Accuracy: {acc_g:.4f} | Macro F1: {f1_g:.4f}")
    print(f"[{group_name}] Best iterations: {fold_best} | Median: {int(np.median(fold_best))}")

    return oof_group, test_group_pred

# -------- Train per-gender and predict full test --------
X_male = X[male_mask].reset_index(drop=True)
y_male_enc = y_enc[male_mask]
test_male = test_features[test_male_mask].reset_index(drop=True)

X_female = X[female_mask].reset_index(drop=True)
y_female_enc = y_enc[female_mask]
test_female = test_features[test_female_mask].reset_index(drop=True)

male_oof, male_test_pred = train_group_and_predict(X_male, y_male_enc, test_male, "MALE")
female_oof, female_test_pred = train_group_and_predict(X_female, y_female_enc, test_female, "FEMALE")

# Combine OOF
oof_full = np.zeros((len(X), len(classes)), dtype=np.float32)
oof_full[male_mask.values] = male_oof
oof_full[female_mask.values] = female_oof

oof_labels = np.argmax(oof_full, axis=1)
oof_acc = accuracy_score(y_enc, oof_labels)
oof_f1 = f1_score(y_enc, oof_labels, average="macro")
print("\n========== OVERALL OOF ==========")
print(f"OOF Accuracy: {oof_acc:.4f} | OOF Macro F1: {oof_f1:.4f}")
try:
    print("\nOOF Classification Report:\n",
          classification_report(y_enc, oof_labels, target_names=classes))
except Exception as e:
    print(f"[Info] Could not print classification report: {e}")

# Build full test predictions (for Kaggle submission use-case)
test_pred_proba = np.zeros((len(test_features), len(classes)), dtype=np.float32)
test_pred_proba[test_male_mask.values] = male_test_pred
test_pred_proba[test_female_mask.values] = female_test_pred

test_pred_int = np.argmax(test_pred_proba, axis=1)
test_pred_labels = le.inverse_transform(test_pred_int)

# Submission
ss_cols = list(sample_sub.columns)
ID_HEADER = None
LABEL_HEADER = None
if len(ss_cols) == 2:
    # detect which is ID by presence in test
    c1, c2 = ss_cols
    if c1 in test.columns and c2 not in test.columns:
        ID_HEADER, LABEL_HEADER = c1, c2
    elif c2 in test.columns and c1 not in test.columns:
        ID_HEADER, LABEL_HEADER = c2, c1
if ID_HEADER is None:
    # fallback
    ID_HEADER = ss_cols[0]
    LABEL_HEADER = ss_cols[1]

sub = pd.DataFrame()
if ID_HEADER in test.columns:
    sub[ID_HEADER] = test[ID_HEADER].values
else:
    sub[ID_HEADER] = np.arange(len(test_features))
sub[LABEL_HEADER] = test_pred_labels

# Ensure column order
for c in ss_cols:
    if c not in sub.columns:
        sub[c] = sample_sub[c].iloc[0] if len(sample_sub[c]) else None
sub = sub[ss_cols]

sub.to_csv("submission.csv", index=False)
print("\nSaved submission.csv")
print(sub.head(5))

# ==============================================
# Evaluate on Kaggle_test.csv (with ground truth)
# ==============================================
kdf = pd.read_csv(KAGGLE_TEST_PATH)
if "WeightCategory" not in kdf.columns:
    raise KeyError("Kaggle_test.csv must contain 'WeightCategory'.")

y_true = kdf["WeightCategory"].copy()
X_k = kdf.drop(columns=["WeightCategory"], errors="ignore").copy()
if id_col and id_col in X_k.columns:
    X_k.drop(columns=[id_col], inplace=True)

# same drops + BMI
for c in ["MTRANS","SMOKE"]:
    if c in X_k.columns:
        X_k.drop(columns=[c], inplace=True)
X_k = add_bmi(X_k)

# detect gender and split for Kaggle set
gender_col_k = detect_gender_column(X_k)
if gender_col_k is None:
    raise ValueError("Could not detect a gender column in Kaggle_test.csv")
km_k, kf_k = split_by_gender(X_k[gender_col_k])

# Predict on Kaggle by reusing the same training procedure (per gender)
kaggle_pred_proba = np.zeros((len(X_k), len(classes)), dtype=np.float32)

if X_male.shape[0] > 0 and km_k.sum() > 0:
    _, male_k_pred = train_group_and_predict(X_male, y_male_enc, X_k[km_k].reset_index(drop=True), "MALE (Kaggle)")
    kaggle_pred_proba[km_k.values] = male_k_pred
if X_female.shape[0] > 0 and kf_k.sum() > 0:
    _, female_k_pred = train_group_and_predict(X_female, y_female_enc, X_k[kf_k].reset_index(drop=True), "FEMALE (Kaggle)")
    kaggle_pred_proba[kf_k.values] = female_k_pred

kaggle_pred_idx = np.argmax(kaggle_pred_proba, axis=1)
y_pred = le.inverse_transform(kaggle_pred_idx)

# -------- Overall accuracy to 5 decimals --------
overall_acc = accuracy_score(y_true, y_pred)
print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

# -------- Text-only error analysis (custom order) --------
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred, labels=order)
cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-12)

print("\n=== Confusion Matrix (counts) ===")
print("Predicted →")
print("True ↓")
for i, true_class in enumerate(order):
    row = " | ".join(f"{cm[i, j]:4d}" for j in range(len(order)))
    print(f"{true_class:<22}: {row}")

print("\n=== Confusion Matrix (row-normalized) ===")
for i, true_class in enumerate(order):
    row = " | ".join(f"{cm_norm[i, j]:.2f}" for j in range(len(order)))
    print(f"{true_class:<22}: {row}")

print("\n=== Per-class metrics ===")
try:
    print(classification_report(y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0))
except Exception as e:
    print(f"[Info] classification_report fallback: {e}")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

print("\n=== Per-class accuracy (diagonal/row total) ===")
for i, c in enumerate(order):
    total = cm[i].sum()
    correct = cm[i, i]
    acc = correct / total if total > 0 else 0.0
    print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

print("\n=== Most common confusions (true → predicted) ===")
pairs = []
for i, t in enumerate(order):
    for j, p in enumerate(order):
        if i == j or cm[i, j] == 0:
            continue
        pairs.append((cm[i, j], t, p, cm_norm[i, j]))
pairs = sorted(pairs, key=lambda x: (-x[0], -x[3]))
for cnt, true_label, pred_label, norm_val in pairs[:10]:
    print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")

print("\n=== Sample of misclassified rows (first 10) ===")
mis_idx = np.where(np.asarray(y_true) != np.asarray(y_pred))[0]
if len(mis_idx) == 0:
    print("🎉 No misclassifications!")
else:
    for idx in mis_idx[:10]:
        true_lab = y_true.iloc[idx] if hasattr(y_true, "iloc") else y_true[idx]
        pred_lab = y_pred[idx]
        conf = float(np.max(kaggle_pred_proba[idx]))
        rank = np.argsort(-kaggle_pred_proba[idx])
        second_idx = rank[1] if rank.size > 1 else rank[0]
        second_lab = le.inverse_transform([second_idx])[0]
        second_conf = float(kaggle_pred_proba[idx][second_idx])
        print(f"Row {idx:4d}: true={true_lab:<22} pred={pred_lab:<22} conf={conf:.3f} 2nd={second_lab:<22}({second_conf:.3f})")


[Info] Classes: ['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I', 'Overweight_Level_II']
[Info] Train male=7783, female=7750
[Info] Test  male=10336, female=10422

[MALE] Fold 1/5




[MALE] Best iteration: 336

[MALE] Fold 2/5
[MALE] Best iteration: 328

[MALE] Fold 3/5
[MALE] Best iteration: 398

[MALE] Fold 4/5
[MALE] Best iteration: 354

[MALE] Fold 5/5
[MALE] Best iteration: 332

[MALE] OOF Accuracy: 0.8876 | Macro F1: 0.7513
[MALE] Best iterations: [336, 328, 398, 354, 332] | Median: 336

[FEMALE] Fold 1/5
[FEMALE] Best iteration: 360

[FEMALE] Fold 2/5
[FEMALE] Best iteration: 334

[FEMALE] Fold 3/5
[FEMALE] Best iteration: 242

[FEMALE] Fold 4/5
[FEMALE] Best iteration: 368

[FEMALE] Fold 5/5
[FEMALE] Best iteration: 363

[FEMALE] OOF Accuracy: 0.9163 | Macro F1: 0.7487
[FEMALE] Best iterations: [360, 334, 242, 368, 363] | Median: 360

OOF Accuracy: 0.9019 | OOF Macro F1: 0.8925

OOF Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.93      0.94      0.93      1870
      Normal_Weight       0.89      0.88      0.88      2345
     Obesity_Type_I       0.89      0.87      0.88      2207
    Obesit



[MALE (Kaggle)] Best iteration: 336

[MALE (Kaggle)] Fold 2/5
[MALE (Kaggle)] Best iteration: 328

[MALE (Kaggle)] Fold 3/5
[MALE (Kaggle)] Best iteration: 398

[MALE (Kaggle)] Fold 4/5
[MALE (Kaggle)] Best iteration: 354

[MALE (Kaggle)] Fold 5/5
[MALE (Kaggle)] Best iteration: 332

[MALE (Kaggle)] OOF Accuracy: 0.8876 | Macro F1: 0.7513
[MALE (Kaggle)] Best iterations: [336, 328, 398, 354, 332] | Median: 336

[FEMALE (Kaggle)] Fold 1/5
[FEMALE (Kaggle)] Best iteration: 360

[FEMALE (Kaggle)] Fold 2/5
[FEMALE (Kaggle)] Best iteration: 334

[FEMALE (Kaggle)] Fold 3/5
[FEMALE (Kaggle)] Best iteration: 242

[FEMALE (Kaggle)] Fold 4/5
[FEMALE (Kaggle)] Best iteration: 368

[FEMALE (Kaggle)] Fold 5/5
[FEMALE (Kaggle)] Best iteration: 363

[FEMALE (Kaggle)] OOF Accuracy: 0.9163 | Macro F1: 0.7487
[FEMALE (Kaggle)] Best iterations: [360, 334, 242, 368, 363] | Median: 360

✅ Overall Accuracy on Kaggle_test: 0.90947

=== Confusion Matrix (counts) ===
Predicted →
True ↓
Insufficient_Weight   : 

In [6]:
# -------- Overall accuracy to 5 decimals --------
overall_acc = accuracy_score(y_true, y_pred)
print(f"\n✅ Overall Accuracy on Kaggle_test: {overall_acc:.5f}")

# -------- Text-only error analysis (custom order) --------
order = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III'
]

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true, y_pred, labels=order)
cm_norm = cm.astype(float) / (cm.sum(axis=1, keepdims=True) + 1e-12)

print("\n=== Confusion Matrix (counts) ===")
print("Predicted →")
print("True ↓")
for i, true_class in enumerate(order):
    row = " | ".join(f"{cm[i, j]:4d}" for j in range(len(order)))
    print(f"{true_class:<22}: {row}")

print("\n=== Confusion Matrix (row-normalized) ===")
for i, true_class in enumerate(order):
    row = " | ".join(f"{cm_norm[i, j]:.2f}" for j in range(len(order)))
    print(f"{true_class:<22}: {row}")

print("\n=== Per-class metrics ===")
try:
    print(classification_report(y_true, y_pred, labels=order, target_names=order, digits=4, zero_division=0))
except Exception as e:
    print(f"[Info] classification_report fallback: {e}")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0))

print("\n=== Per-class accuracy (diagonal/row total) ===")
for i, c in enumerate(order):
    total = cm[i].sum()
    correct = cm[i, i]
    acc = correct / total if total > 0 else 0.0
    print(f"{c:<22} | Correct: {correct:3d} / {total:3d} | {acc*100:6.2f}%")

print("\n=== Most common confusions (true → predicted) ===")
pairs = []
for i, t in enumerate(order):
    for j, p in enumerate(order):
        if i == j or cm[i, j] == 0:
            continue
        pairs.append((cm[i, j], t, p, cm_norm[i, j]))
pairs = sorted(pairs, key=lambda x: (-x[0], -x[3]))
for cnt, true_label, pred_label, norm_val in pairs[:10]:
    print(f"{true_label:25} → {pred_label:25} | Count: {cnt:3d} | Row%: {norm_val*100:5.1f}")

print("\n=== Sample of misclassified rows (first 10) ===")
mis_idx = np.where(np.asarray(y_true) != np.asarray(y_pred))[0]
if len(mis_idx) == 0:
    print("🎉 No misclassifications!")
else:
    for idx in mis_idx[:10]:
        true_lab = y_true.iloc[idx] if hasattr(y_true, "iloc") else y_true[idx]
        pred_lab = y_pred[idx]
        conf = float(np.max(kaggle_pred_proba[idx]))
        rank = np.argsort(-kaggle_pred_proba[idx])
        second_idx = rank[1] if rank.size > 1 else rank[0]
        second_lab = le.inverse_transform([second_idx])[0]
        second_conf = float(kaggle_pred_proba[idx][second_idx])
        print(f"Row {idx:4d}: true={true_lab:<22} pred={pred_lab:<22} conf={conf:.3f} 2nd={second_lab:<22}({second_conf:.3f})")


✅ Overall Accuracy on Kaggle_test: 0.90947

=== Confusion Matrix (counts) ===
Predicted →
True ↓
Insufficient_Weight   :  623 |   27 |    3 |    0 |    0 |    0 |    0
Normal_Weight         :   42 |  648 |   38 |    8 |    1 |    0 |    0
Overweight_Level_I    :    4 |   49 |  453 |   68 |    9 |    0 |    0
Overweight_Level_II   :    0 |   16 |   51 |  525 |   45 |    4 |    0
Obesity_Type_I        :    1 |    1 |   11 |   48 |  623 |   17 |    2
Obesity_Type_II       :    0 |    0 |    2 |    5 |   19 |  819 |    0
Obesity_Type_III      :    0 |    0 |    1 |    0 |    1 |    0 | 1061

=== Confusion Matrix (row-normalized) ===
Insufficient_Weight   : 0.95 | 0.04 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00
Normal_Weight         : 0.06 | 0.88 | 0.05 | 0.01 | 0.00 | 0.00 | 0.00
Overweight_Level_I    : 0.01 | 0.08 | 0.78 | 0.12 | 0.02 | 0.00 | 0.00
Overweight_Level_II   : 0.00 | 0.02 | 0.08 | 0.82 | 0.07 | 0.01 | 0.00
Obesity_Type_I        : 0.00 | 0.00 | 0.02 | 0.07 | 0.89 | 0.02 | 0.00
Obesity