In [1]:
import warnings
import numpy as np
import pandas as pd
import json
import seaborn as sns
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
data=pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

In [4]:
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [5]:
data.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [7]:
data['NObeyesdad'].value_counts()

Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: NObeyesdad, dtype: int64

In [8]:
data['MTRANS'].value_counts()

Public_Transportation    1580
Automobile                457
Walking                    56
Motorbike                  11
Bike                        7
Name: MTRANS, dtype: int64

In [9]:
data['family_history_with_overweight'].value_counts()

yes    1726
no      385
Name: family_history_with_overweight, dtype: int64

In [49]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [40]:
le=LabelEncoder()
data['NObeyesdad']=le.fit_transform(data['NObeyesdad'])

In [41]:
X=data.iloc[:,:-1]
y=data.iloc[:,-1]

In [42]:
data_num=[feature for feature in X.columns if X[feature].dtype!='O']
data_cat=[feature for feature in X.columns if X[feature].dtype=='O']
data_target=data['NObeyesdad']
print(f'numerical values are: {data_num}')
print('='*32)
print(f'categorical values are: {data_cat}')
print("="*32)
print("target column is: 'NObeyesdad'")


numerical values are: ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
categorical values are: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
target column is: 'NObeyesdad'


In [43]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.20,random_state=42)

In [44]:
num_pipeline=Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                            ('scale', StandardScaler())
                            ])
cat_pipeline=Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                            ('encoder', OneHotEncoder())
                            ])


In [45]:
preprocessor=ColumnTransformer([('numerical_pipeline',num_pipeline,data_num),
                                ('categorical_pipeline',cat_pipeline,data_cat)])

In [46]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [63]:
models={'LogisticRegression': LogisticRegression(),
       'RandomForest': RandomForestClassifier(),
       'GradientBoost': GradientBoostingClassifier(),
       'Xgboost': XGBClassifier()}

params={"LogisticRegression":{'max_iter':[1000] },
    "RandomForest":{'n_estimators': [8,16,32,64]},
    "GradientBoost":{'n_estimators': [8,16,32,64], 'learning_rate':[0.01,0.01],'max_depth':[3]},
    "Xgboost":{'n_estimators': [8,16,32,64], 'learning_rate':[0.01,0.01],'max_depth':[3]}
}

In [103]:
def evaluate_models(X_train,X_test,y_test,y_train,models,params):
    report={}
    report_class={}
    
    for i in range(len(list(models))):
        model=list(models.values())[i]
        param=params[list(models.keys())[i]]
        
        gs = GridSearchCV(model,param,cv=3)
        gs.fit(X_train,y_train)
        
        model.set_params(**gs.best_params_)
        model.fit(X_train,y_train)
        
        y_test_pred=model.predict(X_test)
        acc_score=accuracy_score(y_test,y_test_pred)
        
        print(f'{list(models.keys())[i]} :')
        print(classification_report(y_test,y_test_pred))
        
        report[list(models.keys())[i]]=acc_score
        
        best_score=max(list(report.values()))
        best_model=list(filter(lambda x:report[x]==best_score,report))[0]
    print(f'The Best score of models:{best_model} : {best_score}')
    return report
    
        

In [104]:
evaluate_models(X_train,X_test,y_test,y_train,models,params)

LogisticRegression :
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        56
           1       0.91      0.65      0.75        62
           2       0.93      0.90      0.92        78
           3       0.90      0.97      0.93        58
           4       1.00      1.00      1.00        63
           5       0.75      0.75      0.75        56
           6       0.74      0.84      0.79        50

    accuracy                           0.87       423
   macro avg       0.87      0.87      0.87       423
weighted avg       0.88      0.87      0.87       423

RandomForest :
              precision    recall  f1-score   support

           0       0.95      0.96      0.96        56
           1       0.83      0.85      0.84        62
           2       0.99      0.94      0.96        78
           3       0.97      0.98      0.97        58
           4       1.00      1.00      1.00        63
           5       0.87      0.84      0.8

{'LogisticRegression': 0.8723404255319149,
 'RandomForest': 0.9314420803782506,
 'GradientBoost': 0.8156028368794326,
 'Xgboost': 0.7541371158392435}