In [None]:
# ===============================================
# 🚀 Obesity Classification - Gradient Boosting + K-Fold (Accuracy: 90.9%)
# ===============================================

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
from lightgbm import LGBMClassifier
from lightgbm.callback import early_stopping
from google.colab import drive
import warnings
warnings.filterwarnings("ignore")

# === 1️⃣ Mount Google Drive & Load Data ===
# The Drive mount has to happen first: every CSV below lives under the mount point.
drive.mount('/mnt/drive')

# Read the train, test and sample-submission tables in a single pass.
train_df, test_df, sample_df = map(
    pd.read_csv,
    (
        '/mnt/drive/MyDrive/Obesity Dataset/train.csv',
        '/mnt/drive/MyDrive/Obesity Dataset/test.csv',
        '/mnt/drive/MyDrive/Obesity Dataset/sample_submission.csv',
    ),
)

# Quick sanity check of the (rows, columns) shape of each table.
print("Loaded:", *(frame.shape for frame in (train_df, test_df, sample_df)))

# === 2️⃣ Preserve Test IDs ===
# Independent copy of the id column, kept aside for the submission file.
test_ids = test_df["id"].copy()

# === 3️⃣ Prepare Train/Test Data ===
# Expected name of the label column; if the CSV uses another name, detect it below.
TARGET = "NObeyesdad"
if TARGET not in train_df.columns:
    # The label is the column that exists only in train: skip "id" and every
    # column that is also present in the test set.
    target_candidates = [
        col for col in train_df.columns
        if col not in test_df.columns and col != "id"
    ]
    if not target_candidates:
        # Fail with an actionable message instead of an opaque IndexError.
        raise ValueError(
            f"Target column '{TARGET}' is missing from train_df and no "
            "train-only column (other than 'id') is available as a fallback."
        )
    # Same choice as before when several candidates exist: take the first one.
    TARGET = target_candidates[0]
    print(f"Target column '{TARGET}' identified.")

# Features are every train column except the row id and the label.
X = train_df.drop(columns=["id", TARGET])
y = train_df[TARGET]

# Encode target labels as integer codes; `le` keeps the mapping back to the
# original label values.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# One-hot encode categorical features. The column list is derived from train
# and reused for test so both frames are encoded from the same source columns.
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
X_enc = pd.get_dummies(X, columns=cat_cols)
test_enc = pd.get_dummies(test_df.drop(columns=["id"]), columns=cat_cols)

# Align train/test features: join="left" keeps exactly the train columns, so
# dummy columns that exist only in test are dropped and train columns missing
# from test are created and filled with 0.
X_aligned, test_aligned = X_enc.align(test_enc, join="left", axis=1, fill_value=0)

# Scale features: statistics are fitted on train only and then applied to test.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_aligned)
test_scaled = scaler.transform(test_aligned)

# === 4️⃣ Gradient Boosting with K-Fold Cross-Validation ===
# Trains one LightGBM model per stratified fold. Each model predicts its own
# held-out fold (out-of-fold labels) and contributes 1/N_SPLITS of its class
# probabilities to the averaged test prediction.
N_SPLITS = 10
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# Loop-invariant: number of distinct target classes.
n_classes = len(le.classes_)

# Out-of-fold buffer stores encoded class labels, so give it y_enc's integer
# dtype rather than the float default of np.zeros; integer labels can later be
# passed to le.inverse_transform or used as indices without a cast.
oof_preds = np.zeros(len(X_scaled), dtype=y_enc.dtype)
# Running average of the per-fold class probabilities for every test row.
test_preds_agg = np.zeros((len(test_scaled), n_classes))
models = []

print(f"\n🚀 Starting training with {N_SPLITS}-Fold Stratified CV...")

for fold, (train_idx, val_idx) in enumerate(skf.split(X_scaled, y_enc)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    # Split data for this fold
    X_tr, y_tr = X_scaled[train_idx], y_enc[train_idx]
    X_val, y_val = X_scaled[val_idx], y_enc[val_idx]

    # Initialize and train the model. n_estimators is an upper bound: the
    # early-stopping callback halts training once the validation
    # multi_logloss has not improved for 100 rounds.
    model = LGBMClassifier(
        objective='multiclass',
        num_class=n_classes,
        random_state=42,
        n_estimators=1000,
        n_jobs=-1
    )
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        eval_metric='multi_logloss',
        callbacks=[
            early_stopping(100, verbose=False)
        ]
    )

    # Store model and make predictions
    models.append(model)
    # Hard labels for the held-out rows of this fold.
    oof_preds[val_idx] = model.predict(X_val)
    # Each fold contributes an equal share to the averaged test probabilities.
    test_preds_agg += model.predict_proba(test_scaled) / N_SPLITS

print("\n✅ Training complete.")

# === 5️⃣ Evaluate Overall OOF Predictions ===
# Score the out-of-fold labels against the encoded ground truth.
oof_accuracy = accuracy_score(y_true=y_enc, y_pred=oof_preds)
print(f"\nOverall Out-of-Fold Accuracy: {oof_accuracy:.5f}")

# Per-class precision / recall / F1, reported under the original label names.
print("\nOverall Classification Report:")
oof_report = classification_report(
    y_enc,
    oof_preds,
    target_names=le.classes_,
)
print(oof_report)


# === 6️⃣ Final Predictions on Test Set ===
# For each test row, take the class index with the highest aggregated probability...
final_test_preds = test_preds_agg.argmax(axis=1)
# ...and translate the encoded indices back into the original label values.
pred_labels = le.inverse_transform(final_test_preds)


# === 7️⃣ Build Submission ===
# Two-column frame: the preserved test ids next to their predicted labels.
submission = pd.DataFrame({"id": test_ids, TARGET: pred_labels})

# Reorder if needed to match sample submission: only when both files have the
# same number of rows and exactly the same set of ids.
same_length = len(sample_df) == len(submission)
if same_length and set(sample_df["id"]) == set(submission["id"]):
    # A left merge on the sample's id column imposes the sample's row order.
    submission = pd.merge(sample_df[["id"]], submission, on="id", how="left")
    print("\nSubmission reordered to match sample_submission format.")

# === 8️⃣ Save Final File ===
# Write the submission next to the input data on Drive, without the index column.
submission_path = '/mnt/drive/MyDrive/Obesity Dataset/submission_lgbm_kfold.csv'
submission.to_csv(submission_path, index=False)
print("\n✅ submission_lgbm_kfold.csv saved successfully!")

# Show the first rows as a quick visual check of the written content.
print(submission.head())

