In [1]:
# Import necessary Modules
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Load the dataset
data = pd.read_csv('ObesityDataSet.csv')  
# Separate features and target variable
X = data.drop(labels=['NObeyesdad'], axis=1)  # Features
y = data['NObeyesdad'] #Target

In [3]:
# Separating numerical and categorical features
categorical_cols = X.select_dtypes(include='O').columns
numerical_cols = X.select_dtypes(exclude='O').columns

In [4]:
#creating pipeline
# Numerical pipeline
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scalar', StandardScaler())
])

# Categorical pipeline
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder())
])

# Combining categorical and numerical pipelines
preprocessor = ColumnTransformer(transformers=[
    ('num_pipeline', num_pipeline, numerical_cols),
    ('cat_pipeline', cat_pipeline, categorical_cols)
])


In [5]:
# Preprocess the data
X_preprocessed = pd.DataFrame(preprocessor.fit_transform(X))

In [6]:

# Splitting the Data 
X_train, X_test, y_train, y_test = train_test_split(X_preprocessed, y, test_size=0.3, random_state=42)

In [7]:
# Model Training and Evaluation
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'Decission Tree': DecisionTreeClassifier()
}

In [8]:
results = {}
for model_name, model in models.items():
    # Model training
    model.fit(X_train, y_train)

    # Model evaluation
    y_pred = model.predict(X_test)
    accuracy = round(accuracy_score(y_test, y_pred)*100,2)
    report = classification_report(y_test, y_pred)

    results[model_name] = {'model': model, 'accuracy': accuracy, 'classification_report': report}

In [9]:
# Display accuracy score for all models
print("="*50)    
for model_name, result in results.items():
    print(f'{model_name} Accuracy: {result["accuracy"]:.2f}%')
print("="*50)

# Model Selection
best_model_name = max(results, key=lambda k: results[k]['accuracy'])
best_model = results[best_model_name]['model']
print(f'Best Model: {best_model_name}')
print(f'Accuracy: {results[best_model_name]["accuracy"]:.2f} %')
print(f'Classification Report:\n{"="*65}\n{results[best_model_name]["classification_report"]}\n{"="*65}\n')

Logistic Regression Accuracy: 87.22%
Random Forest Accuracy: 92.74%
SVM Accuracy: 91.80%
Decission Tree Accuracy: 91.01%
Best Model: Random Forest
Accuracy: 92.74 %
Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       0.95      0.94      0.95        86
      Normal_Weight       0.77      0.92      0.84        93
     Obesity_Type_I       0.97      0.92      0.94       102
    Obesity_Type_II       0.97      0.99      0.98        88
   Obesity_Type_III       1.00      0.99      0.99        98
 Overweight_Level_I       0.93      0.81      0.87        88
Overweight_Level_II       0.92      0.91      0.92        79

           accuracy                           0.93       634
          macro avg       0.93      0.93      0.93       634
       weighted avg       0.93      0.93      0.93       634


