In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [6]:
df = pd.read_csv(r"D:\ObesityDataSet_raw_and_data_sinthetic.csv")

In [7]:
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [8]:
df.shape

(2111, 17)

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [11]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,2111.0,24.3126,6.345968,14.0,19.947192,22.77789,26.0,61.0
Height,2111.0,1.701677,0.093305,1.45,1.63,1.700499,1.768464,1.98
Weight,2111.0,86.586058,26.191172,39.0,65.473343,83.0,107.430682,173.0
FCVC,2111.0,2.419043,0.533927,1.0,2.0,2.385502,3.0,3.0
NCP,2111.0,2.685628,0.778039,1.0,2.658738,3.0,3.0,4.0
CH2O,2111.0,2.008011,0.612953,1.0,1.584812,2.0,2.47742,3.0
FAF,2111.0,1.010298,0.850592,0.0,0.124505,1.0,1.666678,3.0
TUE,2111.0,0.657866,0.608927,0.0,0.0,0.62535,1.0,2.0


In [12]:
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [15]:
df.isna().sum()

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64

#### There is no single null value.

### Split the dataset

In [20]:
features = ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
            'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']
target = 'NObeyesdad'

In [22]:
X = df[features]
y = df[target]

### Label encoding for categorical variables.

In [23]:
encoder = LabelEncoder()
X_encoded = X.copy()
for feature in ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']:
    X_encoded[feature] = encoder.fit_transform(X_encoded[feature])

y_encoded = encoder.fit_transform(y)

### Split the data into train test split

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.2,random_state=42)

### Initialize the four different classifiers

In [25]:
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
svm = SVC()
knn = KNeighborsClassifier()

### Train the Models

In [27]:
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier()

In [28]:
random_forest.fit(X_train, y_train)

RandomForestClassifier()

In [29]:
svm.fit(X_train, y_train)

SVC()

In [30]:
knn.fit(X_train, y_train)

KNeighborsClassifier()

### Evaluate the Models

####  Decision Tree

In [31]:
dt_predictions = decision_tree.predict(X_test)
print("Decision Tree:")
print(confusion_matrix(y_test, dt_predictions))
print(classification_report(y_test, dt_predictions))

Decision Tree:
[[55  1  0  0  0  0  0]
 [ 4 54  0  0  0  4  0]
 [ 0  1 71  4  0  0  2]
 [ 0  0  3 55  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  5  0  0  0 50  1]
 [ 0  0  0  0  0  4 46]]
              precision    recall  f1-score   support

           0       0.93      0.98      0.96        56
           1       0.89      0.87      0.88        62
           2       0.96      0.91      0.93        78
           3       0.93      0.95      0.94        58
           4       1.00      1.00      1.00        63
           5       0.86      0.89      0.88        56
           6       0.94      0.92      0.93        50

    accuracy                           0.93       423
   macro avg       0.93      0.93      0.93       423
weighted avg       0.93      0.93      0.93       423



#### Random Forest

In [32]:
rf_predictions = random_forest.predict(X_test)
print("Random Forest:")
print(confusion_matrix(y_test, rf_predictions))
print(classification_report(y_test, rf_predictions))

Random Forest:
[[54  2  0  0  0  0  0]
 [ 0 58  0  0  0  4  0]
 [ 0  0 75  2  0  0  1]
 [ 0  0  1 57  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  6  0  0  0 49  1]
 [ 0  0  0  0  0  3 47]]
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       0.88      0.94      0.91        62
           2       0.99      0.96      0.97        78
           3       0.97      0.98      0.97        58
           4       1.00      1.00      1.00        63
           5       0.88      0.88      0.88        56
           6       0.96      0.94      0.95        50

    accuracy                           0.95       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.95      0.95      0.95       423



#### Support Vector Machines (SVM)

In [33]:
svm_predictions = svm.predict(X_test)
print("Support Vector Machines:")
print(confusion_matrix(y_test, svm_predictions))
print(classification_report(y_test, svm_predictions))

Support Vector Machines:
[[49  7  0  0  0  0  0]
 [18 21  0  0  0 20  3]
 [ 0  0 26  7 21  1 23]
 [ 0  0  5 24 29  0  0]
 [ 0  0  0  0 63  0  0]
 [ 2 14  0  0  0 27 13]
 [ 0  2  9  0  0 10 29]]
              precision    recall  f1-score   support

           0       0.71      0.88      0.78        56
           1       0.48      0.34      0.40        62
           2       0.65      0.33      0.44        78
           3       0.77      0.41      0.54        58
           4       0.56      1.00      0.72        63
           5       0.47      0.48      0.47        56
           6       0.43      0.58      0.49        50

    accuracy                           0.57       423
   macro avg       0.58      0.57      0.55       423
weighted avg       0.59      0.57      0.54       423



#### K-Nearest Neighbors (KNN)

In [35]:
knn_predictions = knn.predict(X_test)
print("K-Nearest Neighbors:")
print(confusion_matrix(y_test, knn_predictions))
print(classification_report(y_test, knn_predictions))

K-Nearest Neighbors:
[[54  2  0  0  0  0  0]
 [13 29  2  0  0 15  3]
 [ 0  0 76  1  1  0  0]
 [ 0  0  1 56  1  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  4  1  0  0 50  1]
 [ 0  0  5  0  0  0 45]]
              precision    recall  f1-score   support

           0       0.81      0.96      0.88        56
           1       0.83      0.47      0.60        62
           2       0.89      0.97      0.93        78
           3       0.98      0.97      0.97        58
           4       0.97      1.00      0.98        63
           5       0.77      0.89      0.83        56
           6       0.92      0.90      0.91        50

    accuracy                           0.88       423
   macro avg       0.88      0.88      0.87       423
weighted avg       0.88      0.88      0.87       423



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### The Random Forest is the better model among all of the above.