# Imports

In [1]:
import warnings

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Loading Dataset

In [2]:
# Read the CSV file into a pandas DataFrame
df = pd.read_csv('dataset.csv')
# NOTE(review): dropping the 'BMI' column is currently disabled; the line is kept for reference
# df = df.drop('BMI', axis=1)
# Display the first few rows and the overall shape of the DataFrame
print(df.head())
print('Data set shape:', df.shape)

# Split the data into training and test sets: 'Body_Level' is the target column,
# every other column is a feature, 20% of the rows are held out for testing and
# random_state=42 fixes the shuffle so the split is the same on every run
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop('Body_Level', axis=1), df['Body_Level'], test_size=0.2, random_state=42)

# Print the shapes of the training and test sets
print('Training set shape:', X_train.shape)
print('Test set shape:', X_test.shape)

   Body_Level  Gender       Age    Height    Weight  H_Cal_Consump  \
0           3       1  0.646666  0.690578  1.016135              1   
1           3       1 -0.649582 -0.003364  0.686578              1   
2           3       0 -0.537725  0.493657  1.790354              1   
3           3       0 -1.291128 -0.545353 -0.036499              1   
4           3       0  0.270463 -0.818715  0.951256              1   

   Veg_Consump  Water_Consump  Smoking  Meal_Count  ...  \
0    -0.199318       0.236558        0    0.402155  ...   
1     0.428856      -1.026715        0   -1.792121  ...   
2     1.069487       1.401741        0    0.402155  ...   
3     0.986227      -1.620907        0    0.402155  ...   
4     1.069487       0.974150        0    0.402155  ...   

   Food_Between_Meals_Always  Food_Between_Meals_Frequently  \
0                          0                              0   
1                          0                              0   
2                          0       

In [3]:
# Scaling is currently disabled: every line of this cell is commented out.
# NOTE(review): with_mean=False and with_std=False turn off both centering and
# scaling, so re-enabling the cell as written would not change the feature values.
# # Apply standard scaling to the data
# scaler = StandardScaler(with_mean=False, with_std=False)
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

# Common Functions

In [4]:
def apply_crossvalidation(model, X_train, Y_train, k=10):
    """Run shuffled k-fold cross-validation and print accuracy and weighted F1.

    Both metrics are computed in a single cross-validation pass, so each fold's
    model is fitted once and the two reported scores describe the same fitted
    models (this matters for estimators with internal randomness).

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn style estimator; it is passed to
        ``cross_validate`` as-is.
    X_train : array-like
        Feature matrix used for cross-validation.
    Y_train : array-like
        Target labels aligned with ``X_train``.
    k : int, default 10
        Number of folds.

    Returns
    -------
    None
        The mean and standard deviation of each metric are printed.
    """
    # create a k-fold cross-validation iterator (shuffled with a fixed seed so
    # the folds are identical on every call)
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # fit once per fold and score both metrics on the same fitted model
    results = cross_validate(model, X_train, Y_train, cv=kf,
                             scoring=('accuracy', 'f1_weighted'))

    # print the average accuracy score and its standard deviation
    accuracy = results['test_accuracy']
    print('Accuracy: {} +/- {}'.format(accuracy.mean(), accuracy.std()))

    # print the average weighted F1-score and its standard deviation
    f1 = results['test_f1_weighted']
    print('F1-score: {} +/- {}'.format(f1.mean(), f1.std()))


In [5]:
def Evaluate(model, X_test, Y_test):
    """Print test-set metrics for an already fitted classifier.

    The model's predictions on ``X_test`` are compared with ``Y_test`` and the
    accuracy, weighted F1, weighted precision, weighted recall and confusion
    matrix are printed. Nothing is returned.
    """
    predicted = model.predict(X_test)

    # summary metrics; precision/recall/F1 are averaged with class-support weights
    acc = accuracy_score(Y_test, predicted)
    prec = precision_score(Y_test, predicted, average='weighted')
    rec = recall_score(Y_test, predicted, average='weighted')
    f1_val = f1_score(Y_test, predicted, average='weighted')

    # per-class breakdown of true vs. predicted labels
    matrix = confusion_matrix(Y_test, predicted)

    # report lines, in the order they are printed
    report = (
        ('Accuracy: {}', acc),
        ('weighted F1 score: {}', f1_val),
        ('weighted Precision: {}', prec),
        ('weighted Recall: {}', rec),
    )
    for template, value in report:
        print(template.format(value))
    print('Confusion matrix:\n', matrix)

# Logistic Regression

Cross Validation

In [6]:
# Create a logistic regression object
logistic_regression = LogisticRegression()

# Silence warnings only while cross-validating. catch_warnings() restores the
# previous warning filters when the block exits, instead of replacing them with
# a global 'default' filter for the rest of the notebook.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # Apply cross-validation to the logistic regression object
    apply_crossvalidation(logistic_regression, X_train, Y_train)

Accuracy: 0.9576556046147273 +/- 0.018962656222099406
F1-score: 0.9563573954681557 +/- 0.020300126700354002


Fit and Evaluate

In [7]:
# Create a logistic regression object
logistic_regression = LogisticRegression()

# Silence warnings only while fitting. catch_warnings() restores the previous
# warning filters when the block exits, instead of replacing them with a global
# 'default' filter for the rest of the notebook.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    # Fit the model to the training data
    logistic_regression.fit(X_train, Y_train)

In [8]:
# Predict the labels of the test set
# NOTE(review): y_pred is not used in this cell; Evaluate() calls model.predict(X_test) itself
y_pred = logistic_regression.predict(X_test)

# Evaluate the model on the held-out test set (prints accuracy, weighted
# F1/precision/recall and the confusion matrix)
Evaluate(logistic_regression, X_test, Y_test)

Accuracy: 0.972972972972973
weighted F1 score: 0.9723522941546198
weighted Precision: 0.9738593206440364
weighted Recall: 0.972972972972973
Confusion matrix:
 [[ 41   0   0   0]
 [  4  36   3   0]
 [  0   1  84   0]
 [  0   0   0 127]]


# Random Forest

Cross Validation

In [9]:
# Create a random forest object; the fixed seed makes the cross-validation
# scores reproducible across runs (same seed as the split and the K-fold iterator)
random_forest = RandomForestClassifier(random_state=42)

# Apply cross-validation to the random forest object
apply_crossvalidation(random_forest, X_train, Y_train)

Accuracy: 0.9923728813559322 +/- 0.0070395117482356375
F1-score: 0.9941191564266825 +/- 0.0065565542203537925


Fit and Evaluate

In [10]:
# Create a random forest object; the fixed seed makes the fitted forest (and
# therefore the test-set metrics) reproducible across runs
random_forest = RandomForestClassifier(random_state=42)

# Fit the model to the training data
random_forest.fit(X_train, Y_train)

In [11]:
# Predict the labels of the test set
# NOTE(review): y_pred is not used in this cell; Evaluate() calls model.predict(X_test) itself
y_pred = random_forest.predict(X_test)

# Evaluate the model on the held-out test set (prints accuracy, weighted
# F1/precision/recall and the confusion matrix)
Evaluate(random_forest, X_test, Y_test)

Accuracy: 0.9932432432432432
weighted F1 score: 0.9932290646920463
weighted Precision: 0.9933464181511056
weighted Recall: 0.9932432432432432
Confusion matrix:
 [[ 41   0   0   0]
 [  0  43   0   0]
 [  0   1  83   1]
 [  0   0   0 127]]


# Decision Tree

Cross Validation

In [12]:
# Create a decision tree object; the fixed seed makes the cross-validation
# scores reproducible across runs (same seed as the split and the K-fold iterator)
decision_tree = DecisionTreeClassifier(random_state=42)

# Apply cross-validation to the decision tree object
apply_crossvalidation(decision_tree, X_train, Y_train)

Accuracy: 0.9898305084745761 +/- 0.009127397978194068
F1-score: 0.9932354755384036 +/- 0.007375387423291595


Fit and Evaluate

In [13]:
# Create a decision tree object; the fixed seed makes the fitted tree (and
# therefore the test-set metrics) reproducible across runs
decision_tree = DecisionTreeClassifier(random_state=42)

# Fit the model to the training data
decision_tree.fit(X_train, Y_train)

In [14]:
# Predict the labels of the test set
# NOTE(review): y_pred is not used in this cell; Evaluate() calls model.predict(X_test) itself
y_pred = decision_tree.predict(X_test)

# Evaluate the model on the held-out test set (prints accuracy, weighted
# F1/precision/recall and the confusion matrix)
Evaluate(decision_tree, X_test, Y_test)

Accuracy: 0.9864864864864865
weighted F1 score: 0.9865520393066755
weighted Precision: 0.9871738671415982
weighted Recall: 0.9864864864864865
Confusion matrix:
 [[ 40   1   0   0]
 [  0  43   0   0]
 [  0   2  82   1]
 [  0   0   0 127]]


# Naive Bayes

Cross Validation

In [15]:
# Create a Gaussian naive Bayes classifier with default parameters
naive_bayes = GaussianNB()

# Apply cross-validation to the naive bayes object (default k=10 folds on the
# training set; prints accuracy and weighted F1)
apply_crossvalidation(naive_bayes, X_train, Y_train)

Accuracy: 0.7265061956986185 +/- 0.055427938465524025
F1-score: 0.692502362303756 +/- 0.06261481159591056


Fit and Evaluate

In [16]:
# Create a fresh Gaussian naive Bayes classifier with default parameters
naive_bayes = GaussianNB()

# Fit the model to the training data
naive_bayes.fit(X_train, Y_train)

In [17]:
# Predict the labels of the test set
# NOTE(review): y_pred is not used in this cell; Evaluate() calls model.predict(X_test) itself
y_pred = naive_bayes.predict(X_test)

# Evaluate the model on the held-out test set (prints accuracy, weighted
# F1/precision/recall and the confusion matrix)
Evaluate(naive_bayes, X_test, Y_test)

Accuracy: 0.6993243243243243
weighted F1 score: 0.6626931100160571
weighted Precision: 0.6949017452248851
weighted Recall: 0.6993243243243243
Confusion matrix:
 [[ 41   0   0   0]
 [ 20  12   8   3]
 [  0   7  33  45]
 [  0   3   3 121]]


# KNN

Cross Validation

In [18]:
# Create a k-nearest neighbors classifier with default parameters
knn = KNeighborsClassifier()

# Apply cross-validation to the k-nearest neighbors object (default k=10 folds
# on the training set; prints accuracy and weighted F1)
apply_crossvalidation(knn, X_train, Y_train)

Accuracy: 0.8653753026634382 +/- 0.02314724385866101
F1-score: 0.8483020065424782 +/- 0.028318628852564524


Fit and Evaluate

In [19]:
# Create a fresh k-nearest neighbors classifier with default parameters
knn = KNeighborsClassifier()

# Fit the model to the training data
knn.fit(X_train, Y_train)

In [20]:
# Predict the labels of the test set
# NOTE(review): y_pred is not used in this cell; Evaluate() calls model.predict(X_test) itself
y_pred = knn.predict(X_test)

# Evaluate the model on the held-out test set (prints accuracy, weighted
# F1/precision/recall and the confusion matrix)
Evaluate(knn, X_test, Y_test)

Accuracy: 0.8581081081081081
weighted F1 score: 0.8382772172030531
weighted Precision: 0.858173172564193
weighted Recall: 0.8581081081081081
Confusion matrix:
 [[ 40   1   0   0]
 [ 13  13  16   1]
 [  1   2  75   7]
 [  0   0   1 126]]


# Neural Network

Cross Validation

In [21]:
# Create a multi-layer perceptron object with hidden_layer_sizes=(100, 50),
# max_iter=1000 and a fixed random_state so results are repeatable
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

# Apply cross-validation to the multi-layer perceptron object (default k=10
# folds on the training set; prints accuracy and weighted F1)
apply_crossvalidation(mlp, X_train, Y_train)

Accuracy: 0.9703674690215068 +/- 0.018657045450929943
F1-score: 0.9702265869227148 +/- 0.0187212650749201


Fit and Evaluate

In [22]:
# Create a fresh multi-layer perceptron object with the same settings used for
# cross-validation (hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)

# Fit the model to the training data
mlp.fit(X_train, Y_train)

In [23]:
# Predict the labels of the test set
# NOTE(review): y_pred is not used in this cell; Evaluate() calls model.predict(X_test) itself
y_pred = mlp.predict(X_test)

# Evaluate the model on the held-out test set (prints accuracy, weighted
# F1/precision/recall and the confusion matrix)
Evaluate(mlp, X_test, Y_test)

Accuracy: 0.9695945945945946
weighted F1 score: 0.9694739382239382
weighted Precision: 0.9698010627980509
weighted Recall: 0.9695945945945946
Confusion matrix:
 [[ 41   0   0   0]
 [  3  38   2   0]
 [  0   4  81   0]
 [  0   0   0 127]]


In [24]:
"""
The first thing we did to kick-start the project was to try every classification model we know and see how it performs on the data.
We used the default parameters for all models. 
We also used cross-validation to get a better estimate of the performance of the models.

The results are shown in the table below.

Model	Accuracy    F1-score
Logistic Regression	95.76%	95.63%

Random Forest	99.23%	99.41%

Decision Tree	98.98%	99.32%

Naive Bayes	72.65%	69.25%

K-Nearest Neighbors	86.53%	84.83%

Multi-Layer Perceptron	97.03%	97.02%

Although we have not done any hyperparameter tuning yet, this helps us get a better idea of which models are worth tuning and which are not.

We can see that the Random Forest, Decision Tree and Multi-Layer Perceptron models perform very well on the data.

So we decided to tune Random forest in addition to the three models we have chosen from the course (logistic regression, support vector machine and perceptron).

"""

'\n\n'