In [1]:
import sys; sys.path.append('../'); sys.path.append('../Preprocess')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

# use sklearn metrics, single function
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import Preprocess.preprocessor as preprocessor
from imblearn.over_sampling import SMOTE

import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier

In [2]:
df_h = preprocessor.ReadData(pth='../Dataset/body_level_classification_train.csv', label='Body_Level')
df_h.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Body_Level
0,Female,22.547298,1.722461,51.881263,yes,2.663421,1.04111,no,no,3.0,Frequently,yes,no,0.794402,1.391948,Public_Transportation,Body Level 1
1,Male,19.799054,1.743702,54.927529,yes,2.0,2.847264,Sometimes,no,3.28926,Sometimes,yes,no,1.680844,2.0,Public_Transportation,Body Level 1
2,Female,17.823438,1.708406,50.0,yes,1.642241,1.099231,Sometimes,no,3.45259,Sometimes,no,no,0.418875,1.0,Public_Transportation,Body Level 1
3,Female,19.007177,1.690727,49.895716,yes,1.212908,1.029703,Sometimes,no,3.207071,Sometimes,no,no,2.0,1.0,Public_Transportation,Body Level 1
4,Male,19.72925,1.793315,58.19515,yes,2.508835,2.076933,no,no,3.435905,Sometimes,yes,no,2.026668,1.443328,Automobile,Body Level 1


In [3]:
# add augmented columns like BMI ...
AGGREGATE = True
DISCRETIZE = False
# one hot encoding for categorical columns
ONE_HOT = False
# resample data
RESAMPLE = False
# resmapling using SMOTE
APPLY_SMOTE = False
# Preprocess
df_h = preprocessor.LabelOrdinalEncode(df_h)
if AGGREGATE:
    df_h = preprocessor.Aggregate(df_h, discretize=DISCRETIZE)
if ONE_HOT:
    df_h = preprocessor.OneHotEncode(df_h, label='Body_Level')
if APPLY_SMOTE:
    df_h = preprocessor.SMOTE(df_h)
elif RESAMPLE:
    df_h = preprocessor.Resample(df_h)
#
df_h.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,BMI,Body_Level
0,0,22.547298,1.722461,51.881263,1,2.663421,1.04111,0,0,3.0,2,1,0,0.794402,1.391948,2,17.486856,0
1,1,19.799054,1.743702,54.927529,1,2.0,2.847264,1,0,3.28926,1,1,0,1.680844,2.0,2,18.065315,0
2,0,17.823438,1.708406,50.0,1,1.642241,1.099231,1,0,3.45259,1,0,0,0.418875,1.0,2,17.131202,0
3,0,19.007177,1.690727,49.895716,1,1.212908,1.029703,1,0,3.207071,1,0,0,2.0,1.0,2,17.454857,0
4,1,19.72925,1.793315,58.19515,1,2.508835,2.076933,0,0,3.435905,1,1,0,2.026668,1.443328,4,18.095627,0


In [4]:
X_train, X_test, y_train, y_test = preprocessor.Split(df_h, test_size=0.2, random_state=42)

print('X_train.shape: ', X_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

print(f'{X_train.head()=}')
print(f'{y_train.head()=}')

X_train.shape:  (1181, 17)
X_test.shape:  (296, 17)
y_train.shape:  (1181,)
y_test.shape:  (296,)
X_train.head()=      Gender        Age    Height      Weight  H_Cal_Consump  Veg_Consump  \
660        1  33.081600  1.705617   83.016968              1     2.000000   
933        0  26.000000  1.610636  105.423532              1     3.000000   
254        0  18.000000  1.600000   55.000000              1     2.000000   
1117       1  30.607546  1.757132  118.565568              1     2.918113   
812        1  40.501722  1.744974  111.169678              1     2.294259   

      Water_Consump  Alcohol_Consump  Smoking  Meal_Count  Food_Between_Meals  \
660        2.991671                0        0    2.797600                   1   
933        2.180566                1        0    3.000000                   1   
254        2.000000                1        0    4.000000                   2   
1117       2.240463                1        0    3.000000                   1   
812        1.870290

# XGBOOST

In [5]:
# XGBoost
xgb_model = xgb.XGBClassifier(
                            booster='gbtree',
                            learning_rate=0.3,
                            max_depth=3,
                            objective="multi:softprob",
                            random_state=42,
                            num_class=4,
                            # eval_metric="auc",
                            eval_metric="mlogloss",
                            )
xgb_model.fit(X_train, y_train)

y_pred_train = xgb_model.predict(X_train)
y_pred = xgb_model.predict(X_test)

print('Training Metrics for XGBoost:')
# print(classification_report(y_train, y_pred_train, digits=4))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

# print(classification_report(y_train, y_pred_train, digits=4).split('\n')[-2])
# print(classification_report(y_test, y_pred, digits=4).split('\n')[-2])

print('\nTesting Metrics for XGBoost:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

Training Metrics for XGBoost:
[[159   0   0   0]
 [  0 156   0   0]
 [  0   0 324   0]
 [  0   0   0 542]]

Testing Metrics for XGBoost:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    1.0000    1.0000        45
           2     1.0000    1.0000    1.0000        82
           3     1.0000    1.0000    1.0000       138

    accuracy                         1.0000       296
   macro avg     1.0000    1.0000    1.0000       296
weighted avg     1.0000    1.0000    1.0000       296

[[ 31   0   0   0]
 [  0  45   0   0]
 [  0   0  82   0]
 [  0   0   0 138]]


# LightGBM

In [6]:
# lightgbm
lgb_model = LGBMClassifier(
    boosting_type='gbdt',
    class_weight=None,
    # importance_type='split',
    learning_rate=0.2,
    # max_depth=-1,
    # min_child_samples=20,
    # min_child_weight=0.001,
    # min_split_gain=0.0,
    n_estimators=1000,
    # num_leaves=31,
    objective='multiclass',
    # reg_alpha=0.0,
    # reg_lambda=0.,
    # verbose=-10
)

# lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model.fit(X_train, y_train)

y_pred_train = lgb_model.predict(X_train)
y_pred = lgb_model.predict(X_test)

print('Training Metrics for LightGBM:')
# print(classification_report(y_train, y_pred_train, digits=4))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

print('\nTesting Metrics for LightGBM:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

Training Metrics for LightGBM:
[[159   0   0   0]
 [  0 156   0   0]
 [  0   0 324   0]
 [  0   0   0 542]]

Testing Metrics for LightGBM:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    1.0000    1.0000        45
           2     0.9877    0.9756    0.9816        82
           3     0.9856    0.9928    0.9892       138

    accuracy                         0.9899       296
   macro avg     0.9933    0.9921    0.9927       296
weighted avg     0.9899    0.9899    0.9899       296

[[ 31   0   0   0]
 [  0  45   0   0]
 [  0   0  80   2]
 [  0   0   1 137]]


# Looks like SVM Overfits, specially with high C

In [7]:
# SVM
# svm = SVC(kernel='linear', C=1.0, random_state=0)
svm = SVC(kernel='linear', C= 100, random_state=0)
svm.fit(X_train, y_train)

y_pred_train = svm.predict(X_train)
y_pred = svm.predict(X_test)

print('Training Metrics for SVM:')
print(classification_report(y_train, y_pred_train, digits=4))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

print('\nTesting Metrics for SVM:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

Training Metrics for SVM:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       159
           1     0.9936    1.0000    0.9968       156
           2     1.0000    0.9969    0.9985       324
           3     1.0000    1.0000    1.0000       542

    accuracy                         0.9992      1181
   macro avg     0.9984    0.9992    0.9988      1181
weighted avg     0.9992    0.9992    0.9992      1181

[[159   0   0   0]
 [  0 156   0   0]
 [  0   1 323   0]
 [  0   0   0 542]]

Testing Metrics for SVM:
              precision    recall  f1-score   support

           0     0.9118    1.0000    0.9538        31
           1     0.9524    0.8889    0.9195        45
           2     0.9634    0.9634    0.9634        82
           3     0.9928    0.9928    0.9928       138

    accuracy                         0.9696       296
   macro avg     0.9551    0.9613    0.9574       296
weighted avg     0.9700    0.9696    0.9694       296

[

# Linear Regression

In [8]:
# linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred_train = np.round(lr.predict(X_train))
y_pred = np.round(lr.predict(X_test))

print('Training Metrics for Linear Regression:')
print(classification_report(y_train, y_pred_train, digits=4, zero_division=0))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

print('\nTesting Metrics for Linear Regression:')
print(classification_report(y_test, y_pred, digits=4, zero_division=0))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

Training Metrics for Linear Regression:
              precision    recall  f1-score   support

         0.0     0.9298    0.6667    0.7766       159
         1.0     0.6036    0.8590    0.7090       156
         2.0     0.7669    0.8735    0.8167       324
         3.0     0.9867    0.8210    0.8963       542
         4.0     0.0000    0.0000    0.0000         0

    accuracy                         0.8196      1181
   macro avg     0.6574    0.6440    0.6397      1181
weighted avg     0.8681    0.8196    0.8336      1181

[[106  53   0   0   0]
 [  8 134  14   0   0]
 [  0  35 283   6   0]
 [  0   0  72 445  25]
 [  0   0   0   0   0]]

Testing Metrics for Linear Regression:
              precision    recall  f1-score   support

         0.0     0.8800    0.7097    0.7857        31
         1.0     0.6607    0.8222    0.7327        45
         2.0     0.7071    0.8537    0.7735        82
         3.0     0.9811    0.7536    0.8525       138
         4.0     0.0000    0.0000    0.0000 

# Logistic Regression

In [9]:
# logistic regression, we have 4 classes
lr = LogisticRegression( solver='lbfgs', multi_class='multinomial', max_iter=5000, random_state=0, penalty='l2')
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred = lr.predict(X_test)

print('Training Metrics for Logistic Reg:')
print(classification_report(y_train, y_pred_train, digits=4))
# print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

print('\nTesting Metrics for Logistic Reg:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

Training Metrics for Logistic Reg:
              precision    recall  f1-score   support

           0     0.9938    1.0000    0.9969       159
           1     0.9935    0.9872    0.9904       156
           2     0.9969    0.9938    0.9954       324
           3     0.9982    1.0000    0.9991       542

    accuracy                         0.9966      1181
   macro avg     0.9956    0.9953    0.9954      1181
weighted avg     0.9966    0.9966    0.9966      1181


Testing Metrics for Logistic Reg:
              precision    recall  f1-score   support

           0     0.9394    1.0000    0.9688        31
           1     1.0000    0.9111    0.9535        45
           2     0.9759    0.9878    0.9818        82
           3     0.9928    1.0000    0.9964       138

    accuracy                         0.9831       296
   macro avg     0.9770    0.9747    0.9751       296
weighted avg     0.9836    0.9831    0.9829       296

[[ 31   0   0   0]
 [  2  41   2   0]
 [  0   0  81   1]
 [ 

# SVM with RBF Kernel

In [10]:
# SVM rbf
svm = SVC(kernel='rbf', C=1.0, random_state=0)
svm.fit(X_train, y_train)

y_pred_train = svm.predict(X_train)
y_pred = svm.predict(X_test)

print('Training Metrics for SVM with RBF Kernel:')
print(classification_report(y_train, y_pred_train, digits=4))
# print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

print('\nTesting Metrics for SVM with RBF Kernel:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

Training Metrics for SVM with RBF Kernel:
              precision    recall  f1-score   support

           0     0.7662    0.9686    0.8556       159
           1     0.8158    0.3974    0.5345       156
           2     0.7173    0.9321    0.8107       324
           3     0.9731    0.8672    0.9171       542

    accuracy                         0.8366      1181
   macro avg     0.8181    0.7913    0.7795      1181
weighted avg     0.8543    0.8366    0.8291      1181


Testing Metrics for SVM with RBF Kernel:
              precision    recall  f1-score   support

           0     0.7317    0.9677    0.8333        31
           1     0.8750    0.4667    0.6087        45
           2     0.6842    0.9512    0.7959        82
           3     0.9829    0.8333    0.9020       138

    accuracy                         0.8243       296
   macro avg     0.8185    0.8047    0.7850       296
weighted avg     0.8574    0.8243    0.8208       296

[[ 30   1   0   0]
 [ 11  21  13   0]
 [  0   

# Naive Bayes

In [11]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
nb = MultinomialNB()
# nb = GaussianNB( )
# nb = BernoulliNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
y_pred_train = nb.predict(X_train)

print('Training Metrics for Naive Bayes:')
print(classification_report(y_train, y_pred_train, digits=4))
# print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

print('\nTesting Metrics for Naive Bayes:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

Training Metrics for Naive Bayes:
              precision    recall  f1-score   support

           0     0.7598    0.8553    0.8047       159
           1     0.6290    0.2500    0.3578       156
           2     0.5213    0.5278    0.5245       324
           3     0.7680    0.8672    0.8146       542

    accuracy                         0.6909      1181
   macro avg     0.6695    0.6251    0.6254      1181
weighted avg     0.6809    0.6909    0.6733      1181


Testing Metrics for Naive Bayes:
              precision    recall  f1-score   support

           0     0.7353    0.8065    0.7692        31
           1     0.5455    0.1333    0.2143        45
           2     0.4615    0.5854    0.5161        82
           3     0.7619    0.8116    0.7860       138

    accuracy                         0.6453       296
   macro avg     0.6260    0.5842    0.5714       296
weighted avg     0.6430    0.6453    0.6225       296

[[ 25   2   4   0]
 [  9   6  26   4]
 [  0   3  48  31]
 [  0

# Catboost 

In [12]:
# CatBoost
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=False
)

# Fit the model on the training data
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred = model.predict(X_test)

print('Training Metrics for Catboost:')
print(classification_report(y_train, y_pred_train, digits=4))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

print('\nTesting Metrics for Catboost:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

Training Metrics for Catboost:
              precision    recall  f1-score   support

           0     1.0000    0.9937    0.9968       159
           1     0.9811    1.0000    0.9905       156
           2     1.0000    0.9938    0.9969       324
           3     1.0000    1.0000    1.0000       542

    accuracy                         0.9975      1181
   macro avg     0.9953    0.9969    0.9961      1181
weighted avg     0.9975    0.9975    0.9975      1181

[[158   1   0   0]
 [  0 156   0   0]
 [  0   2 322   0]
 [  0   0   0 542]]

Testing Metrics for Catboost:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    1.0000    1.0000        45
           2     1.0000    1.0000    1.0000        82
           3     1.0000    1.0000    1.0000       138

    accuracy                         1.0000       296
   macro avg     1.0000    1.0000    1.0000       296
weighted avg     1.0000    1.0000    1.0000   

In [None]:
# Create a dictionary with sample data
new_data_dict = {
    'Age': [32, 45, 21],
    'Height': [1.65, 1.75, 1.68],
    'Weight': [72, 89, 55],
    'Veg_Consump': [3, 2, 4],
    'Water_Consump': [4, 3, 2],
    'Meal_Count': [3, 4, 2],
    'Phys_Act': [2, 3, 4],
    'Time_E_Dev': [3, 2, 1],
    'Gender_Female': [0, 1, 0],
    'Gender_Male': [1, 0, 1],
    'H_Cal_Consump_no': [1, 0, 0],
    'H_Cal_Consump_yes': [0, 1, 1],
    'Alcohol_Consump_Always': [0, 0, 1],
    'Alcohol_Consump_Frequently': [1, 0, 0],
    'Alcohol_Consump_Sometimes': [0, 1, 0],
    'Alcohol_Consump_no': [0, 0, 0],
    'Smoking_no': [1, 1, 0],
    'Smoking_yes': [0, 0, 1],
    'Food_Between_Meals_Always': [0, 1, 0],
    'Food_Between_Meals_Frequently': [0, 0, 1],
    'Food_Between_Meals_Sometimes': [1, 0, 0],
    'Food_Between_Meals_no': [0, 0, 0],
    'Fam_Hist_no': [1, 0, 1],
    'Fam_Hist_yes': [0, 1, 0],
    'H_Cal_Burn_no': [1, 0, 0],
    'H_Cal_Burn_yes': [0, 1, 1],
    'Transport_Automobile': [1, 0, 0],
    'Transport_Bike': [0, 1, 0],
    'Transport_Motorbike': [0, 0, 1],
    'Transport_Public_Transportation': [0, 0, 0],
    'Transport_Walking': [0, 0, 0]
}

# Convert the dictionary to a Pandas DataFrame
new_data = pd.DataFrame(new_data_dict)

print(len(df_h.columns))
print("---------------------------")
print(len(new_data.columns))
print("")

# Make prediction using the model
predictions = model.predict(new_data)

# Print the predicted class labels
print("predictions are : \n", predictions)
