### 1. Importing Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv('../Datasets/ObesityDataSet_raw_and_data_sinthetic.csv')

In [3]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


### 2. Data Preprocessing

In [4]:
df.shape

(2111, 17)

In [5]:
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.3126,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.345968,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,19.947192,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,22.77789,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


#### **a). Handling missing value**

In [6]:
df.isnull().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

#### **b). Handling categorical columns**

##### → Checking unique values in each categorical column

In [7]:
df['Gender'].unique()

array(['Female', 'Male'], dtype=object)

In [8]:
df['family_history_with_overweight'].unique()

array(['yes', 'no'], dtype=object)

In [9]:
df['FAVC'].unique()

array(['no', 'yes'], dtype=object)

In [10]:
df['CAEC'].unique()

array(['Sometimes', 'Frequently', 'Always', 'no'], dtype=object)

In [11]:
df['SMOKE'].unique()

array(['no', 'yes'], dtype=object)

In [12]:
df['SCC'].unique()

array(['no', 'yes'], dtype=object)

In [13]:
df['CALC'].unique()

array(['no', 'Sometimes', 'Frequently', 'Always'], dtype=object)

In [14]:
df['MTRANS'].unique()

array(['Public_Transportation', 'Walking', 'Automobile', 'Motorbike',
       'Bike'], dtype=object)

In [15]:
df['NObeyesdad'].unique()

array(['Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II',
       'Obesity_Type_I', 'Insufficient_Weight', 'Obesity_Type_II',
       'Obesity_Type_III'], dtype=object)

##### → Converting

In [16]:
pd.factorize(df.Gender)

(array([0, 0, 1, ..., 0, 0, 0]), Index(['Female', 'Male'], dtype='object'))

In [17]:
df['Gender']=pd.factorize(df.Gender)[0]
df['family_history_with_overweight']=pd.factorize(df.family_history_with_overweight)[0]
df['FAVC']=pd.factorize(df.FAVC)[0]
df['CAEC']=pd.factorize(df.CAEC)[0]
df['SMOKE']=pd.factorize(df.SMOKE)[0]
df['SCC']=pd.factorize(df.SCC)[0]
df['CALC']=pd.factorize(df.CALC)[0]
df['MTRANS']=pd.factorize(df.MTRANS)[0]
df['NObeyesdad']=pd.factorize(df.NObeyesdad)[0]

##### → Data Check

In [18]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,21.0,1.62,64.0,0,0,2.0,3.0,0,0,2.0,0,0.0,1.0,0,0,0
1,0,21.0,1.52,56.0,0,0,3.0,3.0,0,1,3.0,1,3.0,0.0,1,0,0
2,1,23.0,1.8,77.0,0,0,2.0,3.0,0,0,2.0,0,2.0,1.0,2,0,0
3,1,27.0,1.8,87.0,1,0,3.0,3.0,0,0,2.0,0,2.0,0.0,2,1,1
4,1,22.0,1.78,89.8,1,0,2.0,1.0,0,0,2.0,0,0.0,0.0,1,0,2


In [19]:
df.describe()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,0.505921,24.3126,1.701677,86.586058,0.182378,0.883941,2.419043,2.685628,0.237328,0.020843,2.008011,0.045476,1.010298,0.657866,0.731407,0.488394,3.050687
std,0.500083,6.345968,0.093305,26.191172,0.386247,0.320371,0.533927,0.778039,0.613474,0.142893,0.612953,0.208395,0.850592,0.608927,0.515498,0.868475,1.99568
min,0.0,14.0,1.45,39.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,19.947192,1.63,65.473343,0.0,1.0,2.0,2.658738,0.0,0.0,1.584812,0.0,0.124505,0.0,0.0,0.0,1.0
50%,1.0,22.77789,1.700499,83.0,0.0,1.0,2.385502,3.0,0.0,0.0,2.0,0.0,1.0,0.62535,1.0,0.0,3.0
75%,1.0,26.0,1.768464,107.430682,0.0,1.0,3.0,3.0,0.0,0.0,2.47742,0.0,1.666678,1.0,1.0,1.0,5.0
max,1.0,61.0,1.98,173.0,1.0,1.0,3.0,4.0,3.0,1.0,3.0,1.0,3.0,2.0,3.0,4.0,6.0


In [20]:
df['NObeyesdad'].value_counts()

NObeyesdad
3    351
6    324
5    297
1    290
2    290
0    287
4    272
Name: count, dtype: int64

### 3. Split into feature and target

In [21]:
feature=df.drop(columns='NObeyesdad',axis=1)
target=df['NObeyesdad']

In [22]:
feature.shape

(2111, 16)

In [23]:
target.shape

(2111,)

#### **a). Data Standardization**

In [24]:
scaler=StandardScaler()
scaler.fit(feature)
standardized_data=scaler.transform(feature)

In [25]:
standardized_data

array([[-1.01191369, -0.52212439, -0.87558934, ...,  0.56199675,
        -1.4191716 , -0.56249143],
       [-1.01191369, -0.52212439, -1.94759928, ..., -1.08062463,
         0.52115952, -0.56249143],
       [ 0.98822657, -0.20688898,  1.05402854, ...,  0.56199675,
         2.46149063, -0.56249143],
       ...,
       [-1.01191369, -0.28190933,  0.54167211, ..., -0.01901815,
         0.52115952, -0.56249143],
       [-1.01191369,  0.00777624,  0.40492652, ..., -0.11799101,
         0.52115952, -0.56249143],
       [-1.01191369, -0.10211908,  0.39834438, ...,  0.09243207,
         0.52115952, -0.56249143]])

In [26]:
feature=standardized_data

### 4. Train test split

In [27]:
X_train,X_test,Y_train,Y_test = train_test_split(feature,target,random_state=2,test_size=0.2,stratify=target)

In [28]:
print(X_train.shape,Y_train.shape)

(1688, 16) (1688,)


In [29]:
print(X_test.shape,Y_test.shape)

(423, 16) (423,)


### 5. Training Model

In [30]:
model=LogisticRegression()

In [31]:
model.fit(X_train,Y_train)

### 6. Model Evaluation

In [32]:
X_train_prediction=model.predict(X_train)
accuracy_score(X_train_prediction,Y_train)

0.9004739336492891

In [33]:
X_test_prediction=model.predict(X_test)
accuracy_score(X_test_prediction,Y_test)

0.8723404255319149

### 7. Predictive system

In [34]:
# row index no : 2
input=(1. , 23. ,  1.8, 77. ,  0. ,  0. ,  2. ,  3. ,  0. ,  0. ,  2. ,
        0. ,  2. ,  1. ,  2. ,  0. )
input_as_np_array=np.asarray(input)
reshaped_input=input_as_np_array.reshape(1,-1)
standard_input=scaler.transform(reshaped_input)
prediction=model.predict(standard_input)
prediction

array([0])

In [35]:
# row index no : 3
input=( 1. , 27. ,  1.8, 87. ,  1. ,  0. ,  3. ,  3. ,  0. ,  0. ,  2. ,
        0. ,  2. ,  0. ,  2. ,  1. )
input_as_np_array=np.asarray(input)
reshaped_input=input_as_np_array.reshape(1,-1)
standard_input=scaler.transform(reshaped_input)
prediction=model.predict(standard_input)
prediction

array([1])

In [36]:
# row index no : 4
input=( 1.  , 22.  ,  1.78, 89.8 ,  1.  ,  0.  ,  2.  ,  1.  ,  0.  ,
        0.  ,  2.  ,  0.  ,  0.  ,  0.  ,  1.  ,  0.   )
input_as_np_array=np.asarray(input)
reshaped_input=input_as_np_array.reshape(1,-1)
standard_input=scaler.transform(reshaped_input)
prediction=model.predict(standard_input)
prediction

array([2])

In [37]:
# row index no : 500
input=( 0.      ,  26.      ,   1.622397, 110.79263 ,   0.      ,
         1.      ,   3.      ,   3.      ,   0.      ,   0.      ,
         2.704507,   0.      ,   0.      ,   0.29499 ,   1.      ,
         0.      )
input_as_np_array=np.asarray(input)
reshaped_input=input_as_np_array.reshape(1,-1)
standard_input=scaler.transform(reshaped_input)
prediction=model.predict(standard_input)
prediction

array([6])

**Target Values :**
* Insufficient Weight : 4
* Normal Weight : 0
* Overweight Level I : 1 
* Overweight Level II : 2
* Obesity Type I : 3
* Obesity Type II : 5
* Obesity Type III : 6

In [38]:
df.iloc[500].values

array([  0.      ,  26.      ,   1.622397, 110.79263 ,   0.      ,
         1.      ,   3.      ,   3.      ,   0.      ,   0.      ,
         2.704507,   0.      ,   0.      ,   0.29499 ,   1.      ,
         0.      ,   6.      ])