# Machine Learning Project SS24

Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer

Read data from the file

In [None]:
# Load the raw CSV; the last column is treated as the target and every
# column before it as a feature.
dataset = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

Check for missing values in each column

In [None]:
# Count the missing entries per column (a column of zeros means nothing to impute).
missing_values = dataset.isna().sum()
print(missing_values)

Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64


Encoding categorical data with LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the target with its own encoder and never refit it: `le.classes_` and
# `le.inverse_transform(...)` can then map predicted integers back to the
# original class names.
le = LabelEncoder()
y = le.fit_transform(y)

# Positions of the categorical feature columns to integer-encode. Column 0 is
# deliberately left out here; it is one-hot encoded in a later step.
columns_to_encode = [4, 5, 8, 9, 11, 14, 15]

# One fitted encoder per feature column, keyed by column position, so each
# mapping stays available instead of being overwritten by the next column.
feature_encoders = {}
for col in columns_to_encode:
    encoder = LabelEncoder()
    X[:, col] = encoder.fit_transform(X[:, col])
    feature_encoders[col] = encoder

print(X)
print(y)

[['Female' 21.0 1.62 ... 1.0 3 3]
 ['Female' 21.0 1.52 ... 0.0 2 3]
 ['Male' 23.0 1.8 ... 1.0 1 3]
 ...
 ['Female' 22.524036 1.752206 ... 0.646288 2 3]
 ['Female' 24.361936 1.73945 ... 0.586035 2 3]
 ['Female' 23.664709 1.738836 ... 0.714137 2 3]]
[1 1 1 ... 4 4 4]


Displaying dataset

In [None]:
# Wrap the arrays in pandas objects to get a tabular view of the encoded data.
print(type(X))
df_X = pd.DataFrame(X)
df_y = pd.Series(y)

# Preview only the first rows instead of dumping the whole dataset, and widen
# the column limit just for this cell so the global pandas display settings
# are left untouched for the rest of the notebook.
with pd.option_context('display.max_columns', None):
    print(df_X.head(10))
    print(df_y.head(10))

<class 'numpy.ndarray'>
          0          1         2           3  4  5         6         7  8  9   \
0     Female       21.0      1.62        64.0  1  0       2.0       3.0  2  0   
1     Female       21.0      1.52        56.0  1  0       3.0       3.0  2  1   
2       Male       23.0       1.8        77.0  1  0       2.0       3.0  2  0   
3       Male       27.0       1.8        87.0  0  0       3.0       3.0  2  0   
4       Male       22.0      1.78        89.8  0  0       2.0       1.0  2  0   
5       Male       29.0      1.62        53.0  0  1       2.0       3.0  2  0   
6     Female       23.0       1.5        55.0  1  1       3.0       3.0  2  0   
7       Male       22.0      1.64        53.0  0  0       2.0       3.0  2  0   
8       Male       24.0      1.78        64.0  1  1       3.0       3.0  2  0   
9       Male       22.0      1.72        68.0  1  1       2.0       3.0  2  0   
10      Male       26.0      1.85       105.0  1  1       3.0       3.0  1  0   
11  

Encoding categorical data with OneHotEncoder

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 0 and pass every other column through unchanged.
# The encoded columns are placed first in the output, so the positions of the
# remaining columns shift accordingly.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
)
# Force a plain NumPy array for the downstream steps.
X = np.array(ct.fit_transform(X))

print(X[0, :])

[1.0 0.0 21.0 1.62 64.0 1 0 2.0 3.0 2 0 2.0 0 0.0 1.0 3 3]


Splitting the dataset into training set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The fixed random_state makes the
# split (and therefore every metric reported below) reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Positions of the columns to standardise in the one-hot-encoded matrix.
columns_to_scale = [2, 4]

# Fit on the training columns only, then apply the same statistics to the test
# set so no test information leaks into the scaling. StandardScaler treats each
# column independently, so fitting both at once gives the same values as
# fitting them one by one, while leaving `sc` usable for all scaled columns.
X_train[:, columns_to_scale] = sc.fit_transform(X_train[:, columns_to_scale])
X_test[:, columns_to_scale] = sc.transform(X_test[:, columns_to_scale])

In [None]:
# Sanity check: first training sample after scaling.
print(X_train[0, :])

[1.0 0.0 1.129181413357841 1.666023 -0.5161786829404091 1 1 2.0 3.0 2 0
 1.0 0 0.0 1.301385 2 0]


Training the K-NN model on the training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Nearest-neighbour classifier using a single neighbour; fit() returns the
# estimator itself, so it can be chained onto the constructor.
classifier = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
classifier

Predicting the Test set results

In [None]:
# Predict the test set and print predictions (left column) next to the true
# labels (right column).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[5 5]
 [4 4]
 [2 5]
 [4 4]
 [0 1]
 [4 4]
 [4 4]
 [2 6]
 [5 2]
 [5 6]
 [2 2]
 [6 2]
 [4 4]
 [3 3]
 [4 4]
 [0 0]
 [2 2]
 [3 3]
 [3 3]
 [6 0]
 [5 5]
 [5 1]
 [2 2]
 [0 0]
 [6 2]
 [3 3]
 [2 2]
 [4 4]
 [2 2]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [4 4]
 [2 2]
 [6 6]
 [4 4]
 [3 3]
 [6 5]
 [3 3]
 [3 3]
 [6 6]
 [4 4]
 [3 3]
 [1 1]
 [2 2]
 [6 6]
 [2 2]
 [2 2]
 [2 2]
 [1 0]
 [5 5]
 [6 6]
 [0 0]
 [4 4]
 [2 2]
 [5 5]
 [2 2]
 [6 6]
 [5 5]
 [1 1]
 [4 4]
 [4 4]
 [4 4]
 [6 6]
 [4 4]
 [6 6]
 [2 2]
 [3 3]
 [4 4]
 [2 6]
 [2 2]
 [3 3]
 [4 4]
 [4 4]
 [2 2]
 [4 4]
 [0 1]
 [1 5]
 [6 6]
 [5 1]
 [2 2]
 [3 3]
 [2 2]
 [5 5]
 [4 4]
 [5 5]
 [2 2]
 [6 6]
 [5 5]
 [5 5]
 [3 3]
 [0 1]
 [1 1]
 [6 6]
 [6 6]
 [1 1]
 [4 4]
 [0 0]
 [4 4]
 [2 5]
 [5 5]
 [2 2]
 [4 4]
 [1 5]
 [6 6]
 [1 1]
 [6 2]
 [2 2]
 [0 0]
 [0 1]
 [2 2]
 [4 4]
 [5 5]
 [6 1]
 [0 0]
 [2 2]
 [5 5]
 [5 5]
 [6 6]
 [2 2]
 [5 5]
 [0 0]
 [1 5]
 [4 4]
 [0 0]
 [5 1]
 [6 6]
 [5 5]
 [4 4]
 [3 3]
 [0 0]
 [3 3]
 [5 5]
 [2 2]
 [2 6]
 [0 0]
 [4 4]
 [6 1]
 [0 0]
 [3 3]

Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression of the cell: fraction of test samples classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[40  4  0  0  0  1  1]
 [ 9 30  0  0  0 13  8]
 [ 0  0 65  0  0  3  5]
 [ 0  0  0 50  0  1  0]
 [ 0  0  0  0 69  0  0]
 [ 0  5  3  0  0 50  6]
 [ 1  6  6  0  0  2 45]]


0.8250591016548463

Training the Kernel SVM model on the training set

In [None]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel; fit() returns the estimator,
# so the trailing expression shows the same repr as calling fit() last.
classifier = SVC(kernel='rbf').fit(X_train, y_train)
classifier

Predicting the Test set results

In [None]:
# Predict the test set and print predictions (left column) next to the true
# labels (right column).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[5 5]
 [4 4]
 [5 5]
 [4 4]
 [0 1]
 [4 4]
 [4 4]
 [5 6]
 [1 2]
 [6 6]
 [2 2]
 [3 2]
 [4 4]
 [3 3]
 [4 4]
 [0 0]
 [2 2]
 [3 3]
 [3 3]
 [1 0]
 [5 5]
 [1 1]
 [2 2]
 [0 0]
 [6 2]
 [3 3]
 [2 2]
 [4 4]
 [6 2]
 [0 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [4 4]
 [2 2]
 [6 6]
 [4 4]
 [3 3]
 [5 5]
 [3 3]
 [3 3]
 [6 6]
 [4 4]
 [3 3]
 [1 1]
 [3 2]
 [6 6]
 [2 2]
 [2 2]
 [2 2]
 [1 0]
 [5 5]
 [6 6]
 [0 0]
 [4 4]
 [2 2]
 [5 5]
 [2 2]
 [6 6]
 [5 5]
 [1 1]
 [4 4]
 [4 4]
 [4 4]
 [6 6]
 [4 4]
 [6 6]
 [6 2]
 [3 3]
 [4 4]
 [2 6]
 [2 2]
 [3 3]
 [4 4]
 [4 4]
 [2 2]
 [4 4]
 [0 1]
 [5 5]
 [6 6]
 [1 1]
 [2 2]
 [3 3]
 [2 2]
 [5 5]
 [4 4]
 [5 5]
 [2 2]
 [6 6]
 [6 5]
 [5 5]
 [3 3]
 [0 1]
 [1 1]
 [6 6]
 [5 6]
 [1 1]
 [4 4]
 [0 0]
 [4 4]
 [5 5]
 [5 5]
 [2 2]
 [4 4]
 [1 5]
 [6 6]
 [1 1]
 [6 2]
 [2 2]
 [0 0]
 [1 1]
 [2 2]
 [4 4]
 [5 5]
 [1 1]
 [0 0]
 [2 2]
 [5 5]
 [5 5]
 [6 6]
 [6 2]
 [5 5]
 [0 0]
 [2 5]
 [4 4]
 [0 0]
 [5 1]
 [6 6]
 [5 5]
 [4 4]
 [3 3]
 [0 0]
 [3 3]
 [2 5]
 [2 2]
 [5 6]
 [0 0]
 [4 4]
 [1 1]
 [0 0]
 [3 3]

Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression of the cell: fraction of test samples classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[41  5  0  0  0  0  0]
 [10 40  0  0  0  8  2]
 [ 0  2 61  3  0  0  7]
 [ 0  0  0 51  0  0  0]
 [ 0  0  0  0 69  0  0]
 [ 0  2  2  0  0 55  5]
 [ 0  2  9  0  0 11 38]]


0.8392434988179669

Training the Naive Bayes model on the training set

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings; fit() returns the estimator,
# so the trailing expression shows the same repr as calling fit() last.
classifier = GaussianNB().fit(X_train, y_train)
classifier

Predicting the Test set results

In [None]:
# Predict the test set and print predictions (left column) next to the true
# labels (right column).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[2 5]
 [4 4]
 [3 5]
 [4 4]
 [0 1]
 [4 4]
 [4 4]
 [2 6]
 [1 2]
 [3 6]
 [3 2]
 [3 2]
 [4 4]
 [3 3]
 [4 4]
 [0 0]
 [3 2]
 [3 3]
 [3 3]
 [0 0]
 [3 5]
 [1 1]
 [3 2]
 [0 0]
 [3 2]
 [3 3]
 [2 2]
 [4 4]
 [2 2]
 [4 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [0 1]
 [4 4]
 [3 2]
 [3 6]
 [4 4]
 [3 3]
 [3 5]
 [3 3]
 [3 3]
 [3 6]
 [4 4]
 [3 3]
 [1 1]
 [3 2]
 [2 6]
 [2 2]
 [3 2]
 [3 2]
 [0 0]
 [2 5]
 [3 6]
 [0 0]
 [4 4]
 [2 2]
 [3 5]
 [3 2]
 [2 6]
 [5 5]
 [5 1]
 [4 4]
 [4 4]
 [4 4]
 [3 6]
 [4 4]
 [6 6]
 [3 2]
 [3 3]
 [4 4]
 [2 6]
 [3 2]
 [3 3]
 [4 4]
 [4 4]
 [2 2]
 [4 4]
 [0 1]
 [5 5]
 [2 6]
 [4 1]
 [3 2]
 [3 3]
 [3 2]
 [3 5]
 [4 4]
 [5 5]
 [2 2]
 [3 6]
 [3 5]
 [5 5]
 [3 3]
 [5 1]
 [1 1]
 [3 6]
 [2 6]
 [1 1]
 [4 4]
 [0 0]
 [4 4]
 [2 5]
 [3 5]
 [2 2]
 [4 4]
 [1 5]
 [6 6]
 [1 1]
 [3 2]
 [3 2]
 [0 0]
 [5 1]
 [2 2]
 [4 4]
 [5 5]
 [1 1]
 [0 0]
 [3 2]
 [2 5]
 [3 5]
 [3 6]
 [3 2]
 [5 5]
 [0 0]
 [2 5]
 [4 4]
 [0 0]
 [5 1]
 [2 6]
 [5 5]
 [4 4]
 [3 3]
 [0 0]
 [3 3]
 [2 5]
 [2 2]
 [6 6]
 [0 0]
 [4 4]
 [5 1]
 [0 0]
 [3 3]

Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression of the cell: fraction of test samples classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[42  1  0  0  0  2  1]
 [18 19  3  0  2 11  7]
 [ 0  2 30 40  0  0  1]
 [ 0  1  0 50  0  0  0]
 [ 0  0  0  0 69  0  0]
 [ 1  4 19 20  0 20  0]
 [ 1  3 19 22  0  4 11]]


0.5697399527186762

Training the Decision Tree Classification model on the training set

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Entropy-based decision tree. The fixed random_state pins the tie-breaking
# between equally good splits, so the fitted tree (and its accuracy) is the
# same on every run.
classifier = DecisionTreeClassifier(criterion='entropy', random_state=42)
classifier.fit(X_train, y_train)

Predicting the Test set results

In [None]:
# Predict the test set and print predictions (left column) next to the true
# labels (right column).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[5 5]
 [4 4]
 [5 5]
 [4 4]
 [1 1]
 [4 4]
 [4 4]
 [6 6]
 [2 2]
 [6 6]
 [2 2]
 [2 2]
 [4 4]
 [3 3]
 [4 4]
 [0 0]
 [2 2]
 [3 3]
 [3 3]
 [1 0]
 [5 5]
 [1 1]
 [2 2]
 [0 0]
 [2 2]
 [3 3]
 [2 2]
 [4 4]
 [2 2]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [4 4]
 [2 2]
 [6 6]
 [4 4]
 [3 3]
 [5 5]
 [3 3]
 [3 3]
 [6 6]
 [4 4]
 [3 3]
 [1 1]
 [2 2]
 [6 6]
 [2 2]
 [2 2]
 [2 2]
 [0 0]
 [5 5]
 [6 6]
 [0 0]
 [4 4]
 [2 2]
 [5 5]
 [2 2]
 [6 6]
 [5 5]
 [1 1]
 [4 4]
 [4 4]
 [4 4]
 [6 6]
 [4 4]
 [6 6]
 [2 2]
 [3 3]
 [4 4]
 [6 6]
 [2 2]
 [3 3]
 [4 4]
 [4 4]
 [2 2]
 [4 4]
 [1 1]
 [5 5]
 [6 6]
 [1 1]
 [2 2]
 [3 3]
 [2 2]
 [5 5]
 [4 4]
 [5 5]
 [2 2]
 [5 6]
 [5 5]
 [5 5]
 [3 3]
 [0 1]
 [1 1]
 [6 6]
 [6 6]
 [1 1]
 [4 4]
 [0 0]
 [4 4]
 [5 5]
 [5 5]
 [2 2]
 [4 4]
 [5 5]
 [6 6]
 [1 1]
 [2 2]
 [2 2]
 [0 0]
 [1 1]
 [2 2]
 [4 4]
 [5 5]
 [1 1]
 [0 0]
 [2 2]
 [5 5]
 [5 5]
 [6 6]
 [2 2]
 [5 5]
 [0 0]
 [5 5]
 [4 4]
 [0 0]
 [1 1]
 [6 6]
 [1 5]
 [4 4]
 [3 3]
 [0 0]
 [3 3]
 [5 5]
 [2 2]
 [6 6]
 [0 0]
 [4 4]
 [1 1]
 [0 0]
 [3 3]

Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression of the cell: fraction of test samples classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[44  2  0  0  0  0  0]
 [ 2 56  0  0  0  2  0]
 [ 0  0 72  0  0  0  1]
 [ 0  0  0 51  0  0  0]
 [ 0  0  0  0 69  0  0]
 [ 0  4  0  0  0 59  1]
 [ 0  0  1  0  0  2 57]]


0.9645390070921985

Training the Random Forest Classification model on the training set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 entropy-based trees. Bootstrapping and per-split feature
# sampling are random, so random_state is fixed to make the fitted model and
# the reported accuracy reproducible.
classifier = RandomForestClassifier(
    n_estimators=100, criterion='entropy', random_state=42
)
classifier.fit(X_train, y_train)

Predicting the Test set results

In [None]:
# Predict the test set and print predictions (left column) next to the true
# labels (right column).
y_pred = classifier.predict(X_test)
print(np.column_stack((y_pred, y_test)))

[[5 5]
 [4 4]
 [5 5]
 [4 4]
 [1 1]
 [4 4]
 [4 4]
 [6 6]
 [2 2]
 [6 6]
 [2 2]
 [2 2]
 [4 4]
 [3 3]
 [4 4]
 [0 0]
 [2 2]
 [3 3]
 [3 3]
 [1 0]
 [5 5]
 [1 1]
 [2 2]
 [0 0]
 [2 2]
 [3 3]
 [2 2]
 [4 4]
 [2 2]
 [1 1]
 [0 0]
 [0 0]
 [0 0]
 [0 0]
 [1 1]
 [4 4]
 [2 2]
 [6 6]
 [4 4]
 [3 3]
 [5 5]
 [3 3]
 [3 3]
 [6 6]
 [4 4]
 [3 3]
 [1 1]
 [2 2]
 [6 6]
 [2 2]
 [2 2]
 [2 2]
 [1 0]
 [5 5]
 [6 6]
 [0 0]
 [4 4]
 [2 2]
 [5 5]
 [2 2]
 [6 6]
 [5 5]
 [1 1]
 [4 4]
 [4 4]
 [4 4]
 [6 6]
 [4 4]
 [6 6]
 [2 2]
 [3 3]
 [4 4]
 [6 6]
 [2 2]
 [3 3]
 [4 4]
 [4 4]
 [2 2]
 [4 4]
 [1 1]
 [5 5]
 [6 6]
 [1 1]
 [2 2]
 [3 3]
 [2 2]
 [5 5]
 [4 4]
 [1 5]
 [2 2]
 [6 6]
 [5 5]
 [5 5]
 [3 3]
 [1 1]
 [1 1]
 [6 6]
 [6 6]
 [1 1]
 [4 4]
 [0 0]
 [4 4]
 [5 5]
 [5 5]
 [2 2]
 [4 4]
 [1 5]
 [6 6]
 [1 1]
 [2 2]
 [2 2]
 [0 0]
 [0 1]
 [2 2]
 [4 4]
 [5 5]
 [1 1]
 [0 0]
 [2 2]
 [5 5]
 [5 5]
 [6 6]
 [2 2]
 [5 5]
 [0 0]
 [5 5]
 [4 4]
 [0 0]
 [1 1]
 [6 6]
 [5 5]
 [4 4]
 [3 3]
 [0 0]
 [3 3]
 [5 5]
 [2 2]
 [6 6]
 [0 0]
 [4 4]
 [1 1]
 [0 0]
 [3 3]

Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Rows of the matrix are the true classes, columns the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression of the cell: fraction of test samples classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[41  5  0  0  0  0  0]
 [ 2 56  0  0  0  1  1]
 [ 0  0 73  0  0  0  0]
 [ 0  0  0 51  0  0  0]
 [ 0  0  0  0 69  0  0]
 [ 0  5  0  0  0 59  0]
 [ 0  3  1  0  0  0 56]]


0.9574468085106383