In [1]:
# Obesity Risk Prediction using Ensemble Learning
# Author: ChatGPT (based on Mahasarabesh's request)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import xgboost as xgb
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

# Load dataset: the file's own header row (header=0) is discarded and replaced
# by the explicit column names below.
col_names = [
    "Gender", "Age", "Height", "Weight", "family_history_with_overweight",
    "FAVC", "FCVC", "NCP", "CAEC", "SMOKE", "CH2O", "SCC", "FAF", "TUE",
    "CALC", "MTRANS", "NObeyesdad",
]
data = pd.read_csv(
    "/kaggle/input/obesity-data-set/ObesityDataSet_raw_and_data_sinthetic.csv",
    names=col_names,
    header=0,
)

# Label Encoding: every categorical column (the target included) is cast to
# str and mapped to integer codes. A single LabelEncoder is refitted for each
# column, so after the loop it only holds the classes of the last column it saw.
encoder = LabelEncoder()
categorical_cols = [
    'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
    'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad',
]
for col in categorical_cols:
    if col not in data.columns:
        continue  # tolerate datasets that lack one of the expected columns
    data[col] = encoder.fit_transform(data[col].astype(str))

# Outlier handling (Age and NCP)
def remove_outliers_iqr(df, col, whisker=1.5):
    """Replace IQR outliers in one column with that column's median.

    Despite the name, no rows are dropped: values lying outside
    ``[Q1 - whisker * IQR, Q3 + whisker * IQR]`` are overwritten with the
    median of the column (computed before any replacement).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is NOT modified; a copy is returned instead.
    col : str
        Name of the numeric column to clean.
    whisker : float, default 1.5
        Multiplier applied to the IQR to obtain the outlier fences.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``col`` has its outliers replaced by the median.

    Raises
    ------
    KeyError
        If ``col`` is not a column of ``df``.
    """
    values = df[col]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - whisker * IQR
    upper_bound = Q3 + whisker * IQR
    median_val = values.median()

    # NaN compares False on both sides, so missing values are left untouched.
    is_outlier = (values < lower_bound) | (values > upper_bound)

    # Work on a copy so the caller's frame is never mutated as a side effect.
    cleaned = df.copy()
    cleaned[col] = np.where(is_outlier, median_val, values)
    return cleaned

# Clean each of the two numeric columns in turn, skipping any that is absent.
for col in ('Age', 'NCP'):
    if col not in data.columns:
        continue
    data = remove_outliers_iqr(data, col)

# Feature / target selection
target = 'NObeyesdad'
features = data.columns.tolist()
features.remove(target)

# Split data first (stratified on the target, 80/20) so that the scaler below
# never sees the held-out rows.
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Feature Scaling: fit the min/max on the training split only, then apply the
# same transform to the test split. Fitting on the full dataset would leak
# test-set statistics into training. `data` and `X` are left unscaled.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=features, index=X_test.index)

# Hyperparameter grids, keyed by the same short model names used in `base_models`.
# Each inner mapping is a GridSearchCV-style grid: parameter name -> candidate values.
param_grids = dict(
    XGB=dict(
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 5, 7],
        n_estimators=[500, 1000],
        min_child_weight=[1, 5, 10],
        gamma=[0, 0.5, 1],
        subsample=[0.6, 0.8, 1.0],
        colsample_bytree=[0.6, 0.8, 1.0],
    ),
    GB=dict(
        learning_rate=[0.01, 0.05, 0.1],
        n_estimators=[500, 1000],
        max_depth=[3, 4, 5],
    ),
    CB=dict(
        learning_rate=[0.01, 0.05, 0.1],
        n_estimators=[500, 1000],
        depth=[2, 4, 6],
        min_data_in_leaf=[50, 100, 200],
    ),
    BDT=dict(
        n_estimators=[100, 300, 500],
        bootstrap_features=[True, False],
    ),
    RF=dict(
        n_estimators=[500, 1000],
        max_features=['sqrt', 'log2', None],
        max_depth=[None, 10, 20],
    ),
    ET=dict(
        n_estimators=[500, 1000],
        max_depth=[None, 50, 100],
        min_samples_split=[2, 5, 10],
    ),
    # No parameters to tune directly in VotingClassifier unless tuning base models
    VC=dict(),
)

# Base models: untuned estimators keyed by the short names used in `param_grids`.
base_models = {
    'XGB': xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    'GB': GradientBoostingClassifier(random_state=45),
    'CB': CatBoostClassifier(verbose=0, thread_count=-1, random_state=45),
    # The wrapped tree is passed positionally on purpose: scikit-learn renamed
    # this keyword from `base_estimator` to `estimator` in 1.2 and removed the
    # old name in 1.4, but it has always been the first positional argument.
    'BDT': BaggingClassifier(DecisionTreeClassifier(), random_state=42, n_jobs=-1),
    'RF': RandomForestClassifier(random_state=42),
    'ET': ExtraTreesClassifier(random_state=42),
    # Soft-voting ensemble; SVC needs probability=True to expose predict_proba.
    'VC': VotingClassifier(estimators=[
        ('lr', LogisticRegression(max_iter=1000, random_state=42)),
        ('dt', DecisionTreeClassifier(random_state=42)),
        ('svm', SVC(probability=True, random_state=42))
    ], voting='soft')
}

# Tune models: shared state for the tuning cell.
# 5-fold stratified CV, shuffled with a fixed seed so the folds are reproducible.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Filled with one estimator per model name by the tuning loop.
tuned_models = dict()

In [2]:
# Grid-search every base model on the training split; the voting ensemble has
# no grid of its own and is carried over as-is.
for name, estimator in base_models.items():
    print(f"\nTuning {name}...")
    if name == 'VC':
        tuned_models[name] = estimator
        continue

    grid = GridSearchCV(
        estimator,
        param_grids[name],
        scoring='accuracy',
        cv=kfold,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    print(f"Best Params for {name}:", grid.best_params_)
    tuned_models[name] = grid.best_estimator_

# Evaluate tuned models on the held-out test split.
# Precision, recall and F1 are support-weighted averages over the classes.
weighted = dict(average='weighted', zero_division=0)
for name, model in tuned_models.items():
    # Every model is (re)fitted on the training split before being scored.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **weighted)
    rec = recall_score(y_test, y_pred, **weighted)
    f1 = f1_score(y_test, y_pred, **weighted)

    print(f"\n{name} Evaluation:")
    for label, score in (("Accuracy", acc), ("Precision", prec), ("Recall", rec), ("F1-Score", f1)):
        print(f"{label}: {score*100:.2f}%")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))


Tuning XGB...
Best Params for XGB: {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 5, 'min_child_weight': 1, 'n_estimators': 1000, 'subsample': 0.8}

Tuning GB...
Best Params for GB: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}

Tuning CB...
Best Params for CB: {'depth': 6, 'learning_rate': 0.1, 'min_data_in_leaf': 50, 'n_estimators': 1000}

Tuning BDT...




Best Params for BDT: {'bootstrap_features': True, 'n_estimators': 500}

Tuning RF...
Best Params for RF: {'max_depth': None, 'max_features': None, 'n_estimators': 1000}

Tuning ET...
Best Params for ET: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 500}

Tuning VC...

XGB Evaluation:
Accuracy: 96.22%
Precision: 96.52%
Recall: 96.22%
F1-Score: 96.26%
Confusion Matrix:
 [[49  5  0  0  0  0  0]
 [ 1 57  0  0  0  0  0]
 [ 0  0 68  0  0  0  2]
 [ 0  0  1 59  0  0  0]
 [ 0  0  0  1 64  0  0]
 [ 0  5  0  0  0 53  0]
 [ 0  0  1  0  0  0 57]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.91      0.94        54
           1       0.85      0.98      0.91        58
           2       0.97      0.97      0.97        70
           3       0.98      0.98      0.98        60
           4       1.00      0.98      0.99        65
           5       1.00      0.91      0.95        58
           6       0.97      0.98      0.97