In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split, GridSearchCV
import xgboost as xgb
from catboost import CatBoostClassifier
import plotly.graph_objects as go
from matplotlib.patches import Circle, RegularPolygon
from matplotlib.path import Path
from matplotlib.projections import register_projection
from matplotlib.projections.polar import PolarAxes
from matplotlib.spines import Spine
from matplotlib.transforms import Affine2D
import matplotlib.cm as cm
import pickle

In [2]:
# Load the dataset
df = pd.read_csv("C:/Users/Yassine Lahniche/Downloads/processed_obesity_dataset.csv")

# Prepare features and target
X = df.drop("NObeyesdad", axis=1)
y = df["NObeyesdad"]

# Bin the target variable into 6 categories
Y = pd.cut(y, bins=6, labels=[0, 1, 2, 3, 4, 5])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [3]:
# Random Forest Classifier
print("Training Random Forest Classifier...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# CatBoost Classifier
print("Training CatBoost Classifier...")
catboost_model = CatBoostClassifier(iterations=100, depth=5, learning_rate=0.1, random_state=42, verbose=False)
catboost_model.fit(X_train, y_train)

# Baseline XGBoost Classifier
print("Training Baseline XGBoost Classifier...")
xgb_baseline = xgb.XGBClassifier(random_state=42)
xgb_baseline.fit(X_train, y_train)

Training Random Forest Classifier...
Training CatBoost Classifier...
Training Baseline XGBoost Classifier...


In [4]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve, auc
import xgboost as xgb
from scipy.stats import randint, uniform
import time

# Define the parameter distribution for tuning
def get_parameter_distribution():
    param_dist = {
        'n_estimators': randint(50, 500),
        'max_depth': randint(3, 10),
        'learning_rate': uniform(0.01, 0.3),
        'subsample': uniform(0.6, 0.4),
        'colsample_bytree': uniform(0.6, 0.4),
        'gamma': uniform(0, 0.5),
        'min_child_weight': randint(1, 10),
        'reg_alpha': uniform(0, 1),
        'reg_lambda': uniform(0, 1),
        'scale_pos_weight': uniform(0.5, 2.5),
    }
    return param_dist

# Function to tune XGBoost
def tune_xgboost(X_train, y_train, scoring='f1_weighted', n_iter=50, cv=5, n_jobs=-1, verbose=2):
    print(f"Starting XGBoost tuning with RandomizedSearchCV for {scoring}...")
    start_time = time.time()

    # Define the base model
    xgb_model = xgb.XGBClassifier(
        objective='binary:logistic' if len(np.unique(y_train)) == 2 else 'multi:softprob',
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    # Define the parameter space
    param_dist = get_parameter_distribution()

    # Set up the cross-validation strategy
    cv_strategy = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Set up RandomizedSearchCV
    search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring=scoring,
        cv=cv_strategy,
        verbose=verbose,
        random_state=42,
        n_jobs=n_jobs,
        return_train_score=True
    )

    # Fit RandomizedSearchCV
    search.fit(X_train, y_train)

    # Get the best model
    best_model = search.best_estimator_

    # Calculate time taken
    time_taken = time.time() - start_time
    print(f"XGBoost tuning completed in {time_taken:.2f} seconds")
    print(f"Best {scoring} score: {search.best_score_:.4f}")
    print("Best parameters:")
    for param, value in search.best_params_.items():
        print(f"  {param}: {value}")

    return best_model, search

# Tune XGBoost
xgb_tuned = tune_xgboost(X_train, y_train)[0]

Starting XGBoost tuning with RandomizedSearchCV for f1_weighted...
Fitting 5 folds for each of 50 candidates, totalling 250 fits


Parameters: { "scale_pos_weight", "use_label_encoder" } are not used.



XGBoost tuning completed in 42.57 seconds
Best f1_weighted score: 0.9749
Best parameters:
  colsample_bytree: 0.8605939087431151
  gamma: 0.10334217993837269
  learning_rate: 0.09218833819634012
  max_depth: 6
  min_child_weight: 2
  n_estimators: 302
  reg_alpha: 0.0944429607559284
  reg_lambda: 0.6830067734163568
  scale_pos_weight: 0.6779716211505724
  subsample: 0.7275902521175045


In [5]:
# Make predictions
y1_pred = rf_model.predict(X_test)
y2_pred = catboost_model.predict(X_test)
y3_pred = xgb_baseline.predict(X_test)
y4_pred = xgb_tuned.predict(X_test)

# Function to evaluate models
def evaluate_model(y_true, preds, model_name):
    accuracy = accuracy_score(y_true, preds)
    precision = precision_score(y_true, preds, average="weighted")
    recall = recall_score(y_true, preds, average="weighted")
    f1 = f1_score(y_true, preds, average="weighted")

    print(f"Metrics for {model_name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    print("-" * 30)

# Evaluate each model
evaluate_model(y_test, y1_pred, "Random Forest")
evaluate_model(y_test, y2_pred, "CatBoost")
evaluate_model(y_test, y3_pred, "XGBoost Baseline")
evaluate_model(y_test, y4_pred, "XGBoost Tuned")

Metrics for Random Forest:
Accuracy: 0.9624
Precision: 0.9623
Recall: 0.9624
F1-Score: 0.9619
------------------------------
Metrics for CatBoost:
Accuracy: 0.9549
Precision: 0.9588
Recall: 0.9549
F1-Score: 0.9555
------------------------------
Metrics for XGBoost Baseline:
Accuracy: 0.9875
Precision: 0.9875
Recall: 0.9875
F1-Score: 0.9875
------------------------------
Metrics for XGBoost Tuned:
Accuracy: 0.9850
Precision: 0.9850
Recall: 0.9850
F1-Score: 0.9849
------------------------------


In [6]:
# Save models to disk
with open("xgb_baseline.pkl", "wb") as file:
    pickle.dump(xgb_baseline, file)
with open("xgb_tuned.pkl", "wb") as file:
    pickle.dump(xgb_tuned, file)
with open("rf_model.pkl", "wb") as file:
    pickle.dump(rf_model, file)
with open("catboost_model.pkl", "wb") as file:
    pickle.dump(catboost_model, file)

print("Models saved successfully!")

Models saved successfully!
