<a href="https://colab.research.google.com/github/agoritma/obesity-prediction/blob/main/obesity_prediction_by_user_behaviour.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installing Depedencies

In [None]:
!pip install kagglehub

In [None]:
# %pip install pandas
# %pip install numpy
# %pip install matplotlib
# %pip install kagglehub
# %pip install xgboost

## Importing Depedencies

In [99]:
import pandas as pd
import numpy as np
import kagglehub
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

RANDOM_STATE = 55

## Load Dataset

In [100]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("suleymansulak/obesity-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/suleymansulak/obesity-dataset/versions/1


In [101]:
df = pd.read_excel(path + "/Obesity_Dataset.xlsx")

In [102]:
df.head()

Unnamed: 0,Sex,Age,Height,Overweight_Obese_Family,Consumption_of_Fast_Food,Frequency_of_Consuming_Vegetables,Number_of_Main_Meals_Daily,Food_Intake_Between_Meals,Smoking,Liquid_Intake_Daily,Calculation_of_Calorie_Intake,Physical_Excercise,Schedule_Dedicated_to_Technology,Type_of_Transportation_Used,Class
0,2,18,155,2,2,3,1,3,2,1,2,3,3,4,2
1,2,18,158,2,2,3,1,1,2,1,2,1,3,3,2
2,2,18,159,2,2,2,1,3,2,3,2,2,3,4,2
3,2,18,162,2,2,2,2,2,2,2,2,1,3,4,2
4,2,18,165,2,1,2,1,3,2,1,2,3,3,2,2


## Normalize dataset

### One Hot

In [103]:
# Sex column
df['Sex'] = df['Sex'].apply(lambda x: x - 1)

In [104]:
df["Sex"].unique() # 0:Male 1:Female

array([1, 0])

In [105]:
df['Overweight_Obese_Family'] = df['Overweight_Obese_Family'].apply(lambda x: 0 if x == 2 else 1)

In [106]:
df["Overweight_Obese_Family"].unique()

array([0, 1])

In [107]:
df['Smoking'] = df['Smoking'].apply(lambda x: 0 if x == 2 else 1)

In [108]:
df["Smoking"].unique()

array([0, 1])

In [109]:
df['Calculation_of_Calorie_Intake'] = df['Calculation_of_Calorie_Intake'].apply(lambda x: 0 if x == 2 else 1)

In [110]:
df["Calculation_of_Calorie_Intake"].unique()

array([0, 1])

In [111]:
oneHotCols = ["Frequency_of_Consuming_Vegetables", "Number_of_Main_Meals_Daily",
                 "Food_Intake_Between_Meals", "Liquid_Intake_Daily", "Physical_Excercise", "Schedule_Dedicated_to_Technology", "Type_of_Transportation_Used"]

In [112]:
df_onehot = pd.get_dummies(data=df, prefix=oneHotCols, columns=oneHotCols)

In [113]:
df_onehot.head()

Unnamed: 0,Sex,Age,Height,Overweight_Obese_Family,Consumption_of_Fast_Food,Smoking,Calculation_of_Calorie_Intake,Class,Frequency_of_Consuming_Vegetables_1,Frequency_of_Consuming_Vegetables_2,...,Physical_Excercise_4,Physical_Excercise_5,Schedule_Dedicated_to_Technology_1,Schedule_Dedicated_to_Technology_2,Schedule_Dedicated_to_Technology_3,Type_of_Transportation_Used_1,Type_of_Transportation_Used_2,Type_of_Transportation_Used_3,Type_of_Transportation_Used_4,Type_of_Transportation_Used_5
0,1,18,155,0,2,0,0,2,False,False,...,False,False,False,False,True,False,False,False,True,False
1,1,18,158,0,2,0,0,2,False,False,...,False,False,False,False,True,False,False,True,False,False
2,1,18,159,0,2,0,0,2,False,True,...,False,False,False,False,True,False,False,False,True,False
3,1,18,162,0,2,0,0,2,False,True,...,False,False,False,False,True,False,False,False,True,False
4,1,18,165,0,1,0,0,2,False,True,...,False,False,False,False,True,False,True,False,False,False


### Augmented Data

In [114]:
df_onehot["Growth_Rate"] = df_onehot["Height"] / df_onehot["Age"]
df_onehot

Unnamed: 0,Sex,Age,Height,Overweight_Obese_Family,Consumption_of_Fast_Food,Smoking,Calculation_of_Calorie_Intake,Class,Frequency_of_Consuming_Vegetables_1,Frequency_of_Consuming_Vegetables_2,...,Physical_Excercise_5,Schedule_Dedicated_to_Technology_1,Schedule_Dedicated_to_Technology_2,Schedule_Dedicated_to_Technology_3,Type_of_Transportation_Used_1,Type_of_Transportation_Used_2,Type_of_Transportation_Used_3,Type_of_Transportation_Used_4,Type_of_Transportation_Used_5,Growth_Rate
0,1,18,155,0,2,0,0,2,False,False,...,False,False,False,True,False,False,False,True,False,8.611111
1,1,18,158,0,2,0,0,2,False,False,...,False,False,False,True,False,False,True,False,False,8.777778
2,1,18,159,0,2,0,0,2,False,True,...,False,False,False,True,False,False,False,True,False,8.833333
3,1,18,162,0,2,0,0,2,False,True,...,False,False,False,True,False,False,False,True,False,9.000000
4,1,18,165,0,1,0,0,2,False,True,...,False,False,False,True,False,True,False,False,False,9.166667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1605,1,51,165,0,1,1,1,4,True,False,...,False,True,False,False,True,False,False,False,False,3.235294
1606,1,51,163,0,1,0,0,4,False,True,...,True,True,False,False,True,False,False,False,False,3.196078
1607,1,52,162,1,2,1,1,4,True,False,...,False,True,False,False,True,False,False,False,False,3.115385
1608,1,53,168,0,1,1,0,4,True,False,...,False,True,False,False,True,False,False,False,False,3.169811


In [115]:
df_onehot.columns

Index(['Sex', 'Age', 'Height', 'Overweight_Obese_Family',
       'Consumption_of_Fast_Food', 'Smoking', 'Calculation_of_Calorie_Intake',
       'Class', 'Frequency_of_Consuming_Vegetables_1',
       'Frequency_of_Consuming_Vegetables_2',
       'Frequency_of_Consuming_Vegetables_3', 'Number_of_Main_Meals_Daily_1',
       'Number_of_Main_Meals_Daily_2', 'Number_of_Main_Meals_Daily_3',
       'Food_Intake_Between_Meals_1', 'Food_Intake_Between_Meals_2',
       'Food_Intake_Between_Meals_3', 'Food_Intake_Between_Meals_4',
       'Liquid_Intake_Daily_1', 'Liquid_Intake_Daily_2',
       'Liquid_Intake_Daily_3', 'Physical_Excercise_1', 'Physical_Excercise_2',
       'Physical_Excercise_3', 'Physical_Excercise_4', 'Physical_Excercise_5',
       'Schedule_Dedicated_to_Technology_1',
       'Schedule_Dedicated_to_Technology_2',
       'Schedule_Dedicated_to_Technology_3', 'Type_of_Transportation_Used_1',
       'Type_of_Transportation_Used_2', 'Type_of_Transportation_Used_3',
       'Type_of_Tr

## Preparing data for training

In [116]:
X = df_onehot.drop(["Age", "Height"], axis=1)
y = df["Class"]

In [117]:
y = y.apply(lambda x: x -1)
y.unique()

array([1, 2, 3, 0])

In [118]:
X = np.array(X)
X

array([[1, 0, 2, ..., True, False, 8.61111111111111],
       [1, 0, 2, ..., False, False, 8.777777777777779],
       [1, 0, 2, ..., True, False, 8.833333333333334],
       ...,
       [1, 1, 2, ..., False, False, 3.1153846153846154],
       [1, 0, 1, ..., False, False, 3.169811320754717],
       [1, 1, 1, ..., False, False, 3.1481481481481484]], dtype=object)

In [119]:
y = np.array(y)
y

array([1, 1, 1, ..., 3, 3, 3])

In [120]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

## Training using XGBoost

In [121]:
xgb_model = XGBClassifier(n_estimators = 200, learning_rate = 0.1,verbosity = 1, random_state = RANDOM_STATE, early_stopping_rounds = 10)
xgb_model.fit(X_train, y_train, eval_set = [(X_val,y_val)])

[0]	validation_0-mlogloss:1.19464
[1]	validation_0-mlogloss:1.04086
[2]	validation_0-mlogloss:0.91373
[3]	validation_0-mlogloss:0.80652
[4]	validation_0-mlogloss:0.71484
[5]	validation_0-mlogloss:0.63564
[6]	validation_0-mlogloss:0.56668
[7]	validation_0-mlogloss:0.50628
[8]	validation_0-mlogloss:0.45312
[9]	validation_0-mlogloss:0.40615
[10]	validation_0-mlogloss:0.36451
[11]	validation_0-mlogloss:0.32750
[12]	validation_0-mlogloss:0.29453
[13]	validation_0-mlogloss:0.26510
[14]	validation_0-mlogloss:0.23880
[15]	validation_0-mlogloss:0.21525
[16]	validation_0-mlogloss:0.19414
[17]	validation_0-mlogloss:0.17520
[18]	validation_0-mlogloss:0.15818
[19]	validation_0-mlogloss:0.14289
[20]	validation_0-mlogloss:0.12913
[21]	validation_0-mlogloss:0.11675
[22]	validation_0-mlogloss:0.10560
[23]	validation_0-mlogloss:0.09555
[24]	validation_0-mlogloss:0.08649
[25]	validation_0-mlogloss:0.07832
[26]	validation_0-mlogloss:0.07095
[27]	validation_0-mlogloss:0.06429
[28]	validation_0-mlogloss:0.0

In [122]:
xgb_model.best_iteration

199

In [123]:
print(f"Metrics train:\n\tAccuracy score: {accuracy_score(xgb_model.predict(X_train),y_train):.4f}\nMetrics test:\n\tAccuracy score: {accuracy_score(xgb_model.predict(X_val),y_val):.4f}")

Metrics train:
	Accuracy score: 1.0000
Metrics test:
	Accuracy score: 1.0000


## Testing the model

In [124]:
classMap = {
    0: "Underweigth",
    1: "Normal",
    2: "Overweight",
    3: "Obesity",
}

In [125]:
import random

In [126]:
index = random.randint(0, len(X_val)-1)

data = np.array(X_val)
data = data[index]
ground = np.array(y_val)
ground = ground[index]

predict = xgb_model.predict(data.reshape(1, -1))

print("Ground Thruth:" , classMap[ground])
print("Prediction: ", classMap[predict[0]])

Ground Thruth: Normal
Prediction:  Normal
