In [1]:
# Obesity Risk Prediction using Ensemble Learning
# Author: ChatGPT (based on Mahasarabesh's request)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import xgboost as xgb
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings("ignore")

# Load dataset
# header=0 makes pandas treat the first CSV row as a header and replace it with col_names.
col_names = ["Gender", "Age", "Height", "Weight", "family_history_with_overweight", "FAVC", "FCVC", "NCP", "CAEC", "SMOKE", "CH2O", "SCC", "FAF", "TUE", "CALC", "MTRANS", "NObeyesdad"]
data = pd.read_csv("ObesityDataSet.csv", names=col_names, header=0)

# Label Encoding
# A separate LabelEncoder is fitted for each categorical column and stored in
# label_encoders, so any encoded column (including the target) can be mapped
# back to its original labels with label_encoders[col].inverse_transform(...).
# Refitting one shared encoder would discard every mapping except the last.
categorical_cols = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']
label_encoders = {}
encoder = LabelEncoder()  # stays defined even if none of the columns is present
for col in categorical_cols:
    if col in data.columns:
        encoder = LabelEncoder()
        # Values are cast to str before encoding, so missing values become the label 'nan'.
        data[col] = encoder.fit_transform(data[col].astype(str))
        label_encoders[col] = encoder
# After the loop, `encoder` is the encoder of the last column processed
# (the target 'NObeyesdad' when it is present).

# Outlier handling (Age and NCP)
def remove_outliers_iqr(df, col):
    """Replace IQR outliers in one column with that column's median.

    Despite the name, no rows are dropped: values below Q1 - 1.5*IQR or above
    Q3 + 1.5*IQR are overwritten with the median of the column (computed
    before replacement). NaN values are left as they are because both
    comparisons evaluate to False for them.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the work is done on a copy.
    col : str
        Name of the numeric column to clean.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which the outliers of ``col`` have been replaced.
    """
    # Work on a copy so the caller's frame is never altered as a side effect.
    result = df.copy()
    values = result[col]

    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    median_val = values.median()

    is_outlier = (values < lower_bound) | (values > upper_bound)
    result[col] = np.where(is_outlier, median_val, values)
    return result

# Replace IQR outliers in Age and NCP with the column median
for col in ['Age', 'NCP']:
    if col in data.columns:
        data = remove_outliers_iqr(data, col)

# Split data
# The split happens BEFORE scaling so that the held-out rows play no part in
# fitting the scaler (fitting on the full dataset leaks test-set min/max
# into preprocessing and makes the test metrics optimistic).
target = 'NObeyesdad'
features = data.columns.tolist()
features.remove(target)
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Feature Scaling
# Fit min/max on the training rows only, then apply the same transform to the
# test rows. Test values may fall slightly outside [0, 1]; that is expected.
# `data` and `X` are left unscaled; only X_train / X_test carry scaled values.
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=features, index=X_test.index)

# Hyperparameter grids
# One search space per model key; each inner mapping goes from an estimator
# keyword argument to the list of candidate values for it.
param_grids = dict(
    XGB=dict(
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 5, 7],
        n_estimators=[500, 1000],
        min_child_weight=[1, 5, 10],
        gamma=[0, 0.5, 1],
        subsample=[0.6, 0.8, 1.0],
        colsample_bytree=[0.6, 0.8, 1.0],
    ),
    GB=dict(
        learning_rate=[0.01, 0.05, 0.1],
        n_estimators=[500, 1000],
        max_depth=[3, 4, 5],
    ),
    CB=dict(
        learning_rate=[0.01, 0.05, 0.1],
        n_estimators=[500, 1000],
        depth=[2, 4, 6],
        min_data_in_leaf=[50, 100, 200],
    ),
    BDT=dict(
        n_estimators=[100, 300, 500],
        bootstrap_features=[True, False],
    ),
    RF=dict(
        n_estimators=[500, 1000],
        max_features=['sqrt', 'log2', None],
        max_depth=[None, 10, 20],
    ),
    ET=dict(
        n_estimators=[500, 1000],
        max_depth=[None, 50, 100],
        min_samples_split=[2, 5, 10],
    ),
    # The voting ensemble has no grid of its own: there is nothing to tune
    # on VotingClassifier directly unless its base models are tuned.
    VC=dict(),
)

# Base models
# One ready-to-fit estimator per model key, each built with fixed
# hyperparameters (they look like picks from an earlier grid search — TODO confirm).
# Note the seeds are not uniform: GB and CB use 45, the others use 42.
base_models = {
    'XGB': xgb.XGBClassifier(
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42,
    ),
    'GB': GradientBoostingClassifier(
        n_estimators=500,
        learning_rate=0.1,
        max_depth=5,
        random_state=45,
    ),
    'CB': CatBoostClassifier(
        n_estimators=1000,
        learning_rate=0.1,
        depth=6,
        min_data_in_leaf=50,
        thread_count=-1,
        verbose=0,
        random_state=45,
    ),
    # Bagging over default (unconstrained) decision trees
    'BDT': BaggingClassifier(
        estimator=DecisionTreeClassifier(),
        n_estimators=500,
        bootstrap_features=True,
        n_jobs=-1,
        random_state=42,
    ),
    'RF': RandomForestClassifier(
        n_estimators=1000,
        max_depth=None,
        max_features=None,
        random_state=42,
    ),
    'ET': ExtraTreesClassifier(
        n_estimators=500,
        max_depth=None,
        min_samples_split=2,
        random_state=42,
    ),
    # Soft voting averages predicted class probabilities, which is why the
    # SVC is created with probability=True.
    'VC': VotingClassifier(
        estimators=[
            ('lr', LogisticRegression(max_iter=1000, random_state=42)),
            ('dt', DecisionTreeClassifier(random_state=42)),
            ('svm', SVC(probability=True, random_state=42)),
        ],
        voting='soft',
    ),
}

# Tuning setup
# kfold: 5 stratified, shuffled folds with a fixed seed.
# tuned_models: starts empty; nothing in this cell populates it.
kfold = StratifiedKFold(random_state=42, shuffle=True, n_splits=5)
tuned_models = dict()

In [24]:
# Fit each estimator in base_models on the training split, then report
# accuracy, weighted precision/recall/F1, the confusion matrix and the
# per-class report on the test split.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

for name, model in base_models.items():
    model.fit(X_train, y_train)  # fits in place: base_models holds fitted estimators afterwards
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, **weighted_kwargs)
    rec = recall_score(y_test, y_pred, **weighted_kwargs)
    f1 = f1_score(y_test, y_pred, **weighted_kwargs)

    summary_lines = [
        f"\n{name} Evaluation:",
        f"Accuracy: {acc*100:.2f}%",
        f"Precision: {prec*100:.2f}%",
        f"Recall: {rec*100:.2f}%",
        f"F1-Score: {f1*100:.2f}%",
    ]
    print("\n".join(summary_lines))

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)
    report = classification_report(y_test, y_pred, zero_division=0)
    print("Classification Report:\n", report)


XGB Evaluation:
Accuracy: 96.22%
Precision: 96.53%
Recall: 96.22%
F1-Score: 96.25%
Confusion Matrix:
 [[49  5  0  0  0  0  0]
 [ 1 57  0  0  0  0  0]
 [ 0  0 69  0  0  0  1]
 [ 0  0  1 59  0  0  0]
 [ 0  0  0  1 64  0  0]
 [ 0  5  0  0  0 52  1]
 [ 0  0  0  1  0  0 57]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.91      0.94        54
           1       0.85      0.98      0.91        58
           2       0.99      0.99      0.99        70
           3       0.97      0.98      0.98        60
           4       1.00      0.98      0.99        65
           5       1.00      0.90      0.95        58
           6       0.97      0.98      0.97        58

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.97      0.96      0.96       423


GB Evaluation:
Accuracy: 95.74%
Precision: 95.96%
Recall: 95.74%
F1-Score: 95.76%
Confusion Matrix:
 [[48 

In [19]:
import os
import pickle

# Directory path where you want to save the models.
# Written as a raw string: '\m' is not a valid escape sequence, so the plain
# literal only worked by accident and triggers a warning on newer Python
# versions. The resulting path value is unchanged.
save_dir = r'H:\mini_project6'

# Ensure the directory exists (no error if it is already there, and no
# race between checking and creating).
os.makedirs(save_dir, exist_ok=True)

# Save each model to a separate pickle file named <model key>.pkl
for model_name, model in base_models.items():
    file_path = os.path.join(save_dir, f"{model_name}.pkl")
    with open(file_path, 'wb') as file:
        pickle.dump(model, file)
    print(f"Model '{model_name}' saved at {file_path}")


Model 'XGB' saved at H:\mini_project6\XGB.pkl
Model 'GB' saved at H:\mini_project6\GB.pkl
Model 'CB' saved at H:\mini_project6\CB.pkl
Model 'BDT' saved at H:\mini_project6\BDT.pkl
Model 'RF' saved at H:\mini_project6\RF.pkl
Model 'ET' saved at H:\mini_project6\ET.pkl
Model 'VC' saved at H:\mini_project6\VC.pkl


In [20]:
import os
import pickle

# Directory to load from. This must be the directory the save cell writes to
# ('H:\mini_project6'); the previous value 'H:\mini_project' pointed at a
# different folder, so the models loaded were not the ones just saved.
load_dir = r'H:\mini_project6'

# Dictionary to store loaded models, keyed by model name
loaded_models = {}
models = ['XGB', 'GB', 'CB', 'BDT', 'RF', 'ET', 'VC']

# Load each model
for model_name in models:
    file_path = os.path.join(load_dir, f"{model_name}.pkl")
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only load pickles that were produced by this notebook / a trusted source.
    with open(file_path, 'rb') as file:
        loaded_models[model_name] = pickle.load(file)
    print(f"Model '{model_name}' loaded successfully.")

# Now, you can access each model through the 'loaded_models' dictionary

# Now, you can access each model through the 'loaded_models' dictionary


Model 'XGB' loaded successfully.
Model 'GB' loaded successfully.
Model 'CB' loaded successfully.
Model 'BDT' loaded successfully.
Model 'RF' loaded successfully.
Model 'ET' loaded successfully.
Model 'VC' loaded successfully.


In [21]:
from sklearn.metrics import classification_report, accuracy_score

# Example: make predictions with one of the reloaded models.
# The key selected here is 'BDT' (the key used for the BaggingClassifier in
# base_models), not the XGBoost model; change model_key to evaluate another one.
model_key = 'BDT'
selected_model = loaded_models[model_key]

# Legacy alias: this variable used to be the only name for the selected model.
# It is kept so any later cell that still refers to it keeps working, but note
# that it holds the 'BDT' model, not XGBoost.
xgb_model = selected_model

# X_test / y_test must be the preprocessed test split produced earlier in the notebook
y_pred = selected_model.predict(X_test)

# Print the predictions
print("Predictions:", y_pred)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Predictions: [0 1 6 4 3 2 3 1 2 1 0 1 4 5 1 5 2 2 6 1 1 3 3 5 6 5 2 4 6 2 3 2 5 5 3 1 3
 0 4 0 5 0 4 4 6 4 0 6 3 1 4 0 5 2 6 1 4 4 4 3 6 6 0 6 5 2 2 0 3 2 4 5 0 0
 2 0 0 0 2 2 1 6 4 2 1 1 3 5 6 6 3 6 2 3 5 6 0 1 4 3 2 3 3 5 6 4 2 2 6 3 6
 1 4 3 4 4 0 4 1 1 4 6 4 0 2 3 6 6 5 3 4 6 0 0 0 5 2 2 3 0 4 2 3 2 2 3 6 2
 0 5 5 5 4 0 3 3 4 2 5 6 1 3 2 2 4 6 5 6 6 3 5 1 1 3 3 5 2 5 1 6 2 3 2 3 0
 0 1 4 2 5 5 5 4 5 1 3 6 1 3 2 3 4 6 6 1 0 2 5 1 3 0 4 1 6 6 6 5 2 2 6 1 4
 2 2 1 5 3 4 3 5 6 0 2 4 6 6 1 0 3 5 4 1 4 2 1 5 1 0 2 3 6 1 4 6 3 4 2 4 6
 3 4 2 1 2 4 2 6 4 1 2 2 0 4 5 6 4 3 0 2 4 2 4 6 1 1 1 2 6 2 4 5 1 0 1 2 0
 5 5 6 1 6 5 1 0 1 6 2 5 4 4 4 1 4 2 6 4 1 4 2 5 0 4 1 2 6 3 5 3 0 4 6 4 2
 1 2 0 1 5 6 3 3 5 1 3 0 1 1 0 3 3 4 3 1 3 1 1 4 0 4 2 1 3 0 6 2 0 4 5 5 1
 0 6 3 4 3 2 2 0 1 2 3 1 0 2 0 5 5 1 5 2 4 5 4 6 1 0 6 3 6 1 0 3 4 0 4 2 5
 1 5 2 2 6 1 1 1 3 2 1 3 6 3 1 2]
Accuracy: 96.93%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.93