<a href="https://colab.research.google.com/github/YuvrajDesh/data/blob/main/Xgboost_obesity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
#  XGBoost script
import warnings
warnings.filterwarnings("ignore")

import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import xgboost as xgb
import os

# Random seed reused for the train/validation split and for XGBoost.
SEED: int = 42
# Fraction of the training rows held out as the validation set.
VALID_SIZE: float = 0.20
# File name the test-set predictions are written to.
OUT_FILE: str = "submission_xgb_earlystop.csv"

# Load data: both CSVs must be present in the working directory.
required_inputs = ("train.csv", "test.csv")
if not all(os.path.exists(path) for path in required_inputs):
    raise FileNotFoundError("Please put train.csv and test.csv in the working directory before running this script.")

train_df, test_df = (pd.read_csv(path) for path in required_inputs)

#  Preserve test ids (fall back to a 0..n-1 counter when the file has no "id")
test_has_id = "id" in test_df.columns
test_ids = (
    test_df["id"].copy()
    if test_has_id
    else pd.Series(np.arange(len(test_df)), name="id")
)

# The id column is not a feature: remove it from train when it exists.
train_has_id = "id" in train_df.columns
if train_has_id:
    train_df = train_df.drop(columns=["id"])

#  Separate target
TARGET = "WeightCategory"
if TARGET not in train_df.columns:
    raise ValueError(f"Target column '{TARGET}' not found in train.csv")

# Labels are kept as strings; features are everything except the target.
target_column = train_df[TARGET]
y_raw = target_column.astype(str).copy()
X_train_raw = train_df.drop(columns=[TARGET]).reset_index(drop=True)

# Test features: an independent copy of the test frame without its id column.
if "id" in test_df.columns:
    test_features = test_df.drop(columns=["id"])
else:
    test_features = test_df.copy()
X_test_raw = test_features.reset_index(drop=True)

#  Basic cleaning
# Numeric columns: missing values are replaced by the median of the combined
# train+test column. Every median is computed in a first pass, before either
# frame is modified, so both frames are filled with the same value.
numeric_fill = {}
for df in (X_train_raw, X_test_raw):
    for c in df.columns:
        if c in numeric_fill:
            continue
        if df[c].dtype.kind in "biufc" and df[c].isnull().any():
            numeric_fill[c] = pd.concat([X_train_raw[c], X_test_raw[c]]).median()

for df in (X_train_raw, X_test_raw):
    for c in df.columns:
        if df[c].dtype.kind in "biufc":
            # numeric -> median
            if df[c].isnull().any():
                # Assign the result back instead of calling
                # fillna(inplace=True) on df[c]: the in-place form acts on a
                # temporary selection and is not guaranteed to update the
                # frame (it does not under pandas Copy-on-Write).
                df[c] = df[c].fillna(numeric_fill[c])
        else:
            # categorical/text -> stripped string, "NA" placeholder for missing.
            # The missing mask is taken before astype(str), because the
            # conversion turns NaN into the literal text "nan", which a later
            # fillna would no longer recognise as missing.
            missing = df[c].isnull()
            df[c] = df[c].astype(str).str.strip().mask(missing, "NA")

#  Frequency encoding for categorical columns
# Counts are taken over train and test together, so a level gets the same
# encoded value in both frames.
combined_for_freq = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
cat_cols = combined_for_freq.select_dtypes(include=["object", "category"]).columns.tolist()

for cat_name in cat_cols:
    level_counts = combined_for_freq[cat_name].value_counts(dropna=False).to_dict()
    encoded_name = cat_name + "_freq"
    # Levels missing from the count table map to NaN and are encoded as 0.
    for frame in (X_train_raw, X_test_raw):
        frame[encoded_name] = frame[cat_name].map(level_counts).fillna(0).astype(int)

# The raw categorical columns are replaced by their "_freq" counterparts.
X_train_raw, X_test_raw = (
    frame.drop(columns=cat_cols) for frame in (X_train_raw, X_test_raw)
)

#  Align features (one-hot fallback then align)
n_train_rows = len(X_train_raw)
full = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)

# Any column that is still object/category typed is one-hot encoded.
remaining_cat = list(full.select_dtypes(include=["object", "category"]).columns)
if len(remaining_cat) > 0:
    full = pd.get_dummies(full, columns=remaining_cat, drop_first=False)

# Split the stacked frame back into its train and test parts.
X_all = full.iloc[:n_train_rows].copy()
X_test = full.iloc[n_train_rows:].copy()

# Give the test frame exactly the training columns, filling gaps with 0.
X_all, X_test = X_all.align(X_test, join="left", axis=1, fill_value=0)

#  Encode target: string labels -> integer class ids
le = LabelEncoder()
y = le.fit_transform(y_raw)
num_classes = len(le.classes_)

#  Train/validation split (stratified on the encoded target)
split_options = {
    "test_size": VALID_SIZE,
    "stratify": y,
    "random_state": SEED,
}
X_tr, X_val, y_tr, y_val = train_test_split(X_all, y, **split_options)

#  Set XGBoost params (many trees + small eta)
params = {
    "objective": "multi:softprob",   # one probability per class for each row
    "num_class": num_classes,
    "eta": 0.01,                 # small learning rate
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.7,
    "min_child_weight": 3,
    "alpha": 0.01,              # L1 reg
    "lambda": 1.0,              # L2 reg
    "eval_metric": "mlogloss",
    "verbosity": 0,
    "seed": SEED,
    # 'hist' tree method, chosen for speed. Set directly in the literal: a
    # plain dict assignment cannot raise, so no try/except is needed.
    "tree_method": "hist",
}

# Upper bound on boosting rounds, and the early-stopping patience handed to
# xgb.train together with the validation set.
num_boost_round = 5000
early_stopping_rounds = 100

#  Build DMatrix with feature names preserved
def _as_dmatrix(frame, label=None):
    """Wrap *frame* in a DMatrix, passing its column names as feature names."""
    return xgb.DMatrix(frame, label=label, feature_names=list(frame.columns))


dtrain = _as_dmatrix(X_tr, y_tr)
dval = _as_dmatrix(X_val, y_val)
dall = _as_dmatrix(X_all, y)
dtest = _as_dmatrix(X_test)  # no labels for the test set

# Evaluation sets reported during training.
watchlist = [(dtrain, "train"), (dval, "valid")]

#  Train with early stopping
t0 = time.time()
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=50
)
t_elapsed = time.time() - t0
print(f"\nTraining completed in {t_elapsed:.1f}s. Best iteration: {bst.best_iteration}")

#  Validate
# The booster returned by xgb.train contains every round that was trained,
# including the ones run after the best iteration, and Booster.predict uses
# all of them by default. Restrict prediction to the rounds up to and
# including best_iteration so the reported accuracy matches the model size
# selected by early stopping.
best_rounds = (0, bst.best_iteration + 1)
pred_val = bst.predict(dval, iteration_range=best_rounds)
pred_val_idx = np.argmax(pred_val, axis=1)
val_acc = accuracy_score(y_val, pred_val_idx)
print("Validation accuracy (after early stopping):", val_acc)

#  Retrain on full data using best_iteration
# best_iteration is a 0-based round index, so the number of rounds is one more;
# without it, fall back to the configured maximum.
if bst.best_iteration is None:
    best_iter = num_boost_round
else:
    best_iter = bst.best_iteration + 1
print("Retraining on whole training set for", best_iter, "rounds...")
bst_full = xgb.train(
    params,
    dall,
    num_boost_round=best_iter,
    verbose_eval=50,
)

#  Predict on test and save submission
pred_test = bst_full.predict(dtest)
# Most probable class per row, mapped back to the original string labels.
pred_idx = np.argmax(pred_test, axis=1)
pred_labels = le.inverse_transform(pred_idx)

submission_columns = {"id": test_ids, "WeightCategory": pred_labels}
submission = pd.DataFrame(submission_columns)
submission.to_csv(OUT_FILE, index=False)

print("\nSaved submission:", OUT_FILE)
print(submission.head())

#  Feature importance (top 20)
# Best-effort report: any failure here is printed rather than raised.
try:
    importance = bst_full.get_score(importance_type="gain")  # keys are feature names
    if importance:
        fi = (
            pd.Series(importance)
            .sort_values(ascending=False)
            .rename_axis("feature")
            .reset_index(name="gain")
        )
        # Show top 20
        print("\nTop 20 features by gain:")
        print(fi.head(20).to_string(index=False))
    else:
        print("No feature importance returned.")
except Exception as err:
    print("Could not extract feature importance:", err)

[0]	train-mlogloss:1.92357	valid-mlogloss:1.92392
[50]	train-mlogloss:1.24781	valid-mlogloss:1.26244
[100]	train-mlogloss:0.90665	valid-mlogloss:0.92810
[150]	train-mlogloss:0.70210	valid-mlogloss:0.72772
[200]	train-mlogloss:0.57196	valid-mlogloss:0.60114
[250]	train-mlogloss:0.48357	valid-mlogloss:0.51588
[300]	train-mlogloss:0.42076	valid-mlogloss:0.45561
[350]	train-mlogloss:0.37610	valid-mlogloss:0.41348
[400]	train-mlogloss:0.34362	valid-mlogloss:0.38365
[450]	train-mlogloss:0.31830	valid-mlogloss:0.36125
[500]	train-mlogloss:0.29762	valid-mlogloss:0.34342
[550]	train-mlogloss:0.28127	valid-mlogloss:0.33003
[600]	train-mlogloss:0.26765	valid-mlogloss:0.31932
[650]	train-mlogloss:0.25620	valid-mlogloss:0.31099
[700]	train-mlogloss:0.24625	valid-mlogloss:0.30421
[750]	train-mlogloss:0.23768	valid-mlogloss:0.29846
[800]	train-mlogloss:0.23025	valid-mlogloss:0.29384
[850]	train-mlogloss:0.22353	valid-mlogloss:0.28968
[900]	train-mlogloss:0.21751	valid-mlogloss:0.28633
[950]	train-mlo