<a href="https://colab.research.google.com/github/YuvrajDesh/Xgboost_obesity/blob/main/Xgboost_obesity_varaints.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


**Subject:** Machine Learning  
**Team Name:** MT2025135_MT2025040  
**Member 1:** Yuvraj Deshmukh **Roll No:** MT2025040  
**Member 2:** Yash Parande  **Roll No:** MT2025135


In [None]:

#  run 16 XGBoost variants


import warnings
warnings.filterwarnings("ignore")

import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import os
import json

# Reproducibility seed and the fraction of training rows held out for validation.
SEED = 42
VALID_SIZE = 0.20

# Output artefacts: one submission CSV per variant and a single results log.
OUT_PREFIX = "submission_model_"   # "<model number>.csv" is appended per variant
LOG_CSV = "model_results.csv"

#  Load data
# Fail fast with an actionable message if either input file is absent.
if not all(os.path.exists(csv_name) for csv_name in ("train.csv", "test.csv")):
    raise FileNotFoundError("Please put train.csv and test.csv in the working directory before running this script.")

train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

#  Preserve test ids
# Keep the ids for the submission files; synthesise 0..n-1 when the column is absent.
test_ids = (
    test_df["id"].copy()
    if "id" in test_df.columns
    else pd.Series(np.arange(len(test_df)), name="id")
)

# Drop id from train if present (it is an identifier, not a feature)
train_df = train_df.drop(columns=["id"]) if "id" in train_df.columns else train_df

#  Separate target
TARGET = "WeightCategory"
if TARGET not in train_df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in train.csv")

# Labels are kept as strings; features get a fresh 0-based index.
y_raw = train_df[TARGET].astype(str).copy()
X_train_raw = train_df.drop(columns=[TARGET]).reset_index(drop=True)

# Work on a copy of the test frame so test_df itself stays untouched.
X_test_raw = test_df.copy()
X_test_raw = (
    X_test_raw.drop(columns=["id"]) if "id" in X_test_raw.columns else X_test_raw
).reset_index(drop=True)

#  Basic cleaning
# Numeric columns: missing values are imputed with the median pooled over
# train + test. All other columns: missing values become the placeholder
# "NA", then everything is cast to str and stripped of surrounding whitespace.
for df in (X_train_raw, X_test_raw):
    for c in df.columns:
        # numeric -> median
        if df[c].dtype.kind in "biufc":
            if df[c].isnull().any():
                med = pd.concat([X_train_raw[c], X_test_raw[c]]).median()
                # Assign back instead of `df[c].fillna(..., inplace=True)`:
                # the chained in-place form operates on a temporary and is a
                # silent no-op under pandas Copy-on-Write.
                df[c] = df[c].fillna(med)
        else:
            # categorical/text -> string and placeholder for missing.
            # Fill BEFORE casting: astype(str) turns NaN into the literal
            # "nan", after which fillna has nothing left to replace.
            # astype(object) first so the placeholder can also be inserted
            # into categorical columns that do not list "NA" as a category.
            df[c] = df[c].astype(object).fillna("NA").astype(str).str.strip()

#  Frequency encoding for categorical columns
# Counts are taken over train and test together so both frames share one mapping.
combined_for_freq = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
cat_cols = combined_for_freq.select_dtypes(include=["object", "category"]).columns.tolist()

for col in cat_cols:
    mapping = combined_for_freq[col].value_counts(dropna=False).to_dict()
    freq_col = col + "_freq"
    # Train first, then test; values missing from the mapping fall back to 0.
    for frame in (X_train_raw, X_test_raw):
        frame[freq_col] = frame[col].map(mapping).fillna(0).astype(int)

# drop original categorical columns now that their *_freq counterparts exist
X_train_raw, X_test_raw = (
    X_train_raw.drop(columns=cat_cols),
    X_test_raw.drop(columns=cat_cols),
)

#  Align features (one-hot fallback then align)
# Stack train on top of test so any one-hot expansion yields identical columns.
full = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
remaining_cat = full.select_dtypes(include=["object", "category"]).columns.tolist()
if remaining_cat:
    # Only reached if some non-numeric column survived the frequency encoding.
    full = pd.get_dummies(full, columns=remaining_cat, drop_first=False)

# Split the stacked frame back at the train/test boundary.
n_train_rows = len(X_train_raw)
X_all = full.iloc[:n_train_rows].copy()
X_test = full.iloc[n_train_rows:].copy()

# Force the test columns to follow the training columns, zero-filling any gaps.
X_all, X_test = X_all.align(X_test, join="left", axis=1, fill_value=0)

#  Encode target
# Map the string class labels to integer codes; `le` is reused later to
# translate predicted codes back to label strings.
le = LabelEncoder().fit(y_raw)
y = le.transform(y_raw)
num_classes = len(le.classes_)

#  Train/validation split (stratified)
X_tr, X_val, y_tr, y_val = train_test_split(
    X_all,
    y,
    test_size=VALID_SIZE,
    random_state=SEED,
    stratify=y,
)


def _as_dmatrix(frame, label=None):
    """Wrap a feature frame (and optional labels) in a DMatrix with named columns."""
    return xgb.DMatrix(frame, label=label, feature_names=list(frame.columns))


#  Build DMatrix once (reuse for every variant)
dtrain = _as_dmatrix(X_tr, y_tr)
dval = _as_dmatrix(X_val, y_val)
dall = _as_dmatrix(X_all, y)
dtest = _as_dmatrix(X_test)

# Evaluation sets reported during training, in (matrix, name) form.
watchlist = [(dtrain, "train"), (dval, "valid")]

#  Base  params for reference
# Settings shared by every variant; each variant's own keys are layered on top.
base = dict(
    objective="multi:softprob",
    num_class=num_classes,
    eval_metric="mlogloss",
    verbosity=0,
    seed=SEED,
    tree_method="hist",
)

#  16 variant parameter sets (small, guided perturbations)
# Every variant is the reference configuration below with a few keys changed,
# so only the differences are listed per variant.
_variant_defaults = {
    "eta": 0.01,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    "min_child_weight": 3,
    "alpha": 0.01,
    "lambda": 1.0,
}

_variant_overrides = [
    # Group 1: learning rate tweaks
    {"eta": 0.015},
    {"eta": 0.008},
    {"eta": 0.012, "subsample": 0.85, "colsample_bytree": 0.75},

    # Group 2: tree structure
    {"max_depth": 4},
    {"max_depth": 6},
    {"subsample": 0.75, "colsample_bytree": 0.8},
    {"subsample": 0.85, "colsample_bytree": 0.6},

    # Group 3: regularization
    {"alpha": 0.05},
    {"lambda": 1.5},
    {"min_child_weight": 4},

    # Group 4: sampling
    {"subsample": 0.7},
    {"subsample": 0.9, "colsample_bytree": 0.8},

    # Group 5: eta-depth combos
    {"eta": 0.007, "max_depth": 6},
    {"eta": 0.02, "max_depth": 4},

    # Group 6: grow policy / booster style
    {"grow_policy": "lossguide"},
    {"grow_policy": "depthwise"},
]

# Merging keeps the key order of the defaults and appends any extra keys
# (e.g. grow_policy) at the end; each variant is its own dict object.
variants = [{**_variant_defaults, **delta} for delta in _variant_overrides]

# sanity check
if len(variants) != 16:
    raise RuntimeError("Expected 16 variants")

#  Training loop for all variants
# For every variant:
#   1. fit on the training split with early stopping on the validation split,
#   2. score validation accuracy using only the trees up to the best iteration,
#   3. refit on the full training data for that same number of rounds,
#   4. write a submission CSV and record the variant's metrics.
NUM_BOOST_ROUND = 5000          # upper bound on boosting rounds per variant
EARLY_STOPPING_ROUNDS = 100     # patience on the last watchlist entry ("valid")

results = []
model_files = []
n_variants = len(variants)

for idx, v in enumerate(variants, start=1):
    # build params for xgb.train
    p = base.copy()
    p.update(v)  # variant overrides base fields

    print("\n" + "="*60)
    print(f"Running model {idx}/{n_variants} with params:")
    print(json.dumps(p, indent=2))
    print("="*60)

    # train on train/val with early stopping
    t0 = time.time()
    bst = xgb.train(
        p,
        dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        evals=watchlist,
        early_stopping_rounds=EARLY_STOPPING_ROUNDS,
        verbose_eval=50
    )
    t_elapsed = time.time() - t0

    # best_iteration is a 0-based round index. Some xgboost versions raise
    # AttributeError when it was never recorded, hence getattr; in that case
    # fall back to the last round that could have been trained.
    best_iter = getattr(bst, "best_iteration", None)
    if best_iter is None:
        best_iter = NUM_BOOST_ROUND - 1
    best_iter = int(best_iter)
    n_best_rounds = best_iter + 1  # number of trees-per-class rounds to keep
    print(f"Model {idx} training done in {t_elapsed:.1f}s, best_iteration={best_iter}")

    # validation accuracy, restricted to the best iteration so the score does
    # not include the extra rounds trained while waiting for early stopping
    pred_val = bst.predict(dval, iteration_range=(0, n_best_rounds))
    pred_val_idx = np.argmax(pred_val, axis=1)
    val_acc = accuracy_score(y_val, pred_val_idx)
    print(f"Validation accuracy (model {idx}): {val_acc:.5f}")

    # retrain on full data using exactly the number of rounds that was best
    retrain_rounds = n_best_rounds
    print(f"Retraining model {idx} on full data for {retrain_rounds} rounds...")
    bst_full = xgb.train(p, dall, num_boost_round=retrain_rounds, verbose_eval=50)

    # predict on test and save submission
    pred_test = bst_full.predict(dtest)
    pred_idx = np.argmax(pred_test, axis=1)
    pred_labels = le.inverse_transform(pred_idx)

    out_name = f"{OUT_PREFIX}{idx}.csv"
    submission = pd.DataFrame({"id": test_ids, "WeightCategory": pred_labels})
    submission.to_csv(out_name, index=False)
    print(f"Saved submission: {out_name}")

    # record results
    results.append({
        "model_idx": idx,
        "val_accuracy": float(val_acc),
        "best_iter": best_iter,
        "params": v
    })
    model_files.append(out_name)

# ----------------- Save results CSV -----------------
df_results = pd.DataFrame(results)

# Swap the dict-valued params column for a JSON string so each row stays on
# one readable CSV line.
df_results["params_json"] = df_results["params"].apply(json.dumps)
df_results = df_results.drop(columns=["params"])

# Fix the column order of the log file.
log_columns = ["model_idx", "val_accuracy", "best_iter", "params_json"]
df_results = df_results[log_columns]
df_results.to_csv(LOG_CSV, index=False)

print(f"\nSaved model results to {LOG_CSV}")
print("Generated submission files:", model_files)



Running model 1/16 with params:
{
  "objective": "multi:softprob",
  "num_class": 7,
  "eval_metric": "mlogloss",
  "verbosity": 0,
  "seed": 42,
  "tree_method": "hist",
  "eta": 0.015,
  "max_depth": 5,
  "subsample": 0.8,
  "colsample_bytree": 0.7,
  "min_child_weight": 3,
  "alpha": 0.01,
  "lambda": 1.0
}
[0]	train-mlogloss:1.91438	valid-mlogloss:1.91491
[50]	train-mlogloss:1.04989	valid-mlogloss:1.06869
[100]	train-mlogloss:0.69987	valid-mlogloss:0.72584
[150]	train-mlogloss:0.52146	valid-mlogloss:0.55216
[200]	train-mlogloss:0.42068	valid-mlogloss:0.45547
[250]	train-mlogloss:0.35840	valid-mlogloss:0.39731
[300]	train-mlogloss:0.31651	valid-mlogloss:0.35967
[350]	train-mlogloss:0.28759	valid-mlogloss:0.33534
[400]	train-mlogloss:0.26699	valid-mlogloss:0.31921
[450]	train-mlogloss:0.25112	valid-mlogloss:0.30773
[500]	train-mlogloss:0.23785	valid-mlogloss:0.29873
[550]	train-mlogloss:0.22690	valid-mlogloss:0.29202
[600]	train-mlogloss:0.21747	valid-mlogloss:0.28682
[650]	train-ml