In [2]:
import warnings
import numpy as np
import pandas as pd
import json
import seaborn as sns
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
data=pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

In [3]:
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [26]:
data.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [4]:
data.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [4]:
data['NObeyesdad'].value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

In [5]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data['NObeyesdad']=le.fit_transform(data['NObeyesdad'])
data['NObeyesdad'].value_counts()

NObeyesdad
2    351
4    324
3    297
5    290
6    290
1    287
0    272
Name: count, dtype: int64

In [None]:
Obesity_Type_I         2
Obesity_Type_III       4
Obesity_Type_II        3
Overweight_Level_I     5
Overweight_Level_II    6
Normal_Weight          1
Insufficient_Weight    0

In [7]:
data['CH2O'].unique

<bound method Series.unique of 0       2.000000
1       3.000000
2       2.000000
3       2.000000
4       2.000000
          ...   
2106    1.728139
2107    2.005130
2108    2.054193
2109    2.852339
2110    2.863513
Name: CH2O, Length: 2111, dtype: float64>

In [8]:
data['family_history_with_overweight'].value_counts()

family_history_with_overweight
yes    1726
no      385
Name: count, dtype: int64

In [11]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
#from xgboost import XGBClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

In [12]:
le=LabelEncoder()
data['NObeyesdad']=le.fit_transform(data['NObeyesdad'])

In [13]:
X=data.iloc[:,:-1]
y=data.iloc[:,-1]

In [14]:
data_num=[feature for feature in X.columns if X[feature].dtype!='O']
data_cat=[feature for feature in X.columns if X[feature].dtype=='O']
data_target=data['NObeyesdad']
print(f'numerical values are: {data_num}')
print('='*32)
print(f'categorical values are: {data_cat}')
print("="*32)
print("target column is: 'NObeyesdad'")


numerical values are: ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
categorical values are: ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
target column is: 'NObeyesdad'


In [15]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.20,random_state=42)

In [16]:
num_pipeline=Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),
                            ('scale', StandardScaler())
                            ])
cat_pipeline=Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                            ('encoder', OneHotEncoder())
                            ])


In [17]:
preprocessor=ColumnTransformer([('numerical_pipeline',num_pipeline,data_num),
                                ('categorical_pipeline',cat_pipeline,data_cat)])

In [18]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [20]:
models={'LogisticRegression': LogisticRegression(),
       'RandomForest': RandomForestClassifier(),
       'GradientBoost': GradientBoostingClassifier()
       #'Xgboost': XGBClassifier()
       }

params={"LogisticRegression":{'max_iter':[1000] },
    "RandomForest":{'n_estimators': [8,16,32,64]},
    "GradientBoost":{'n_estimators': [8,16,32,64], 'learning_rate':[0.01,0.01],'max_depth':[3]}
    #"Xgboost":{'n_estimators': [8,16,32,64], 'learning_rate':[0.01,0.01],'max_depth':[3]}
}

In [34]:
def evaluate_models(X_train,x1,y_train,models,params):
    report={}
    report_class={}
    
    for i in range(len(list(models))):
        model=list(models.values())[i]
        param=params[list(models.keys())[i]]
        
        gs = GridSearchCV(model,param,cv=3)
        gs.fit(X_train,y_train)
        
        model.set_params(**gs.best_params_)
        model.fit(X_train,y_train)
        
        y_test_pred=model.predict(x1)
        #acc_score=accuracy_score(y_test,y_test_pred)
        
        #print(f'{list(models.keys())[i]} :')
        #print(classification_report(y_test,y_test_pred))
        
        #report[list(models.keys())[i]]=acc_score
        
        #best_score=max(list(report.values()))
        #best_model=list(filter(lambda x:report[x]==best_score,report))[0]
    #print(f'The Best score of models:{best_model} : {best_score}')
    return y_test_pred
    
        

In [36]:
evaluate_models(X_train,x1,y_train,models,params)

array([1])

In [22]:
evaluate_models(X_train,X_test,y_test,y_train,models,params)

LogisticRegression :
              precision    recall  f1-score   support

           0       0.85      1.00      0.92        56
           1       0.91      0.65      0.75        62
           2       0.93      0.90      0.92        78
           3       0.90      0.97      0.93        58
           4       1.00      1.00      1.00        63
           5       0.75      0.75      0.75        56
           6       0.74      0.84      0.79        50

    accuracy                           0.87       423
   macro avg       0.87      0.87      0.87       423
weighted avg       0.88      0.87      0.87       423

RandomForest :
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        56
           1       0.85      0.89      0.87        62
           2       0.97      0.94      0.95        78
           3       0.97      0.98      0.97        58
           4       1.00      1.00      1.00        63
           5       0.86      0.86      0.8

{'LogisticRegression': 0.8723404255319149,
 'RandomForest': 0.9385342789598109,
 'GradientBoost': 0.8156028368794326}

In [28]:
custom_data_input_dict=[{
                      "Gender":'Male',
                      "Age":21.0,
                      "Height":1.67,
                      "Weight":64.0,
                      "family_history_with_overweight":'yes',
                      "FAVC":'no',
                      "FCVC":2.0,
                      "NCP":3.0,
                      "CAEC":'Sometimes',
                      "SMOKE":'no',
                      "CH2O":2.0,
                      "SCC":'no',
                      "FAF":0.0,
                      "TUE":1.0,
                      "CALC":'no',
                      "MTRANS":'Public_Transportation'        
                }]

In [31]:

XX=pd.DataFrame(custom_data_input_dict)
XX

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS
0,Male,21.0,1.67,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation


In [33]:
x1=preprocessor.transform(XX)

In [37]:
x1

array([[-0.53264595, -0.3390416 , -0.86610746, -0.7857288 ,  0.40151284,
        -0.01014187, -1.19623756,  0.59027817,  0.        ,  1.        ,
         0.        ,  1.        ,  1.        ,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  1.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ]])

In [None]:
Female	2data=['Male',21.0,1.67,64.0,'yes','no',2.0,3.0,'Sometimes','no',2.0,'no',0.0,1.0,'no','Public_Transportation']
columns=['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS']