In [1]:
import sys; sys.path.append('../'); sys.path.append('../Preprocess')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression

# use sklearn metrics, single function
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import Preprocess.preprocessor as preprocessor
from imblearn.over_sampling import SMOTE

import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier

from mlxtend.evaluate import bias_variance_decomp

In [2]:
df_h = preprocessor.ReadData(pth='../Dataset/body_level_classification_train.csv', label='Body_Level')
df_h.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Body_Level
0,Female,22.547298,1.722461,51.881263,yes,2.663421,1.04111,no,no,3.0,Frequently,yes,no,0.794402,1.391948,Public_Transportation,Body Level 1
1,Male,19.799054,1.743702,54.927529,yes,2.0,2.847264,Sometimes,no,3.28926,Sometimes,yes,no,1.680844,2.0,Public_Transportation,Body Level 1
2,Female,17.823438,1.708406,50.0,yes,1.642241,1.099231,Sometimes,no,3.45259,Sometimes,no,no,0.418875,1.0,Public_Transportation,Body Level 1
3,Female,19.007177,1.690727,49.895716,yes,1.212908,1.029703,Sometimes,no,3.207071,Sometimes,no,no,2.0,1.0,Public_Transportation,Body Level 1
4,Male,19.72925,1.793315,58.19515,yes,2.508835,2.076933,no,no,3.435905,Sometimes,yes,no,2.026668,1.443328,Automobile,Body Level 1


In [3]:
# add augmented columns like BMI ...
AGGREGATE = True
DISCRETIZE = False
# one hot encoding for categorical columns
ONE_HOT = False
# resample data
RESAMPLE = False
# resmapling using SMOTE
APPLY_SMOTE = False
# Preprocess
df_h = preprocessor.LabelOrdinalEncode(df_h)
if AGGREGATE:
    df_h = preprocessor.Aggregate(df_h, discretize=DISCRETIZE)
if ONE_HOT:
    df_h = preprocessor.OneHotEncode(df_h, label='Body_Level')
if APPLY_SMOTE:
    df_h = preprocessor.SMOTE(df_h)
elif RESAMPLE:
    df_h = preprocessor.Resample(df_h)
#
df_h.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,BMI,Body_Level
0,0,22.547298,1.722461,51.881263,1,2.663421,1.04111,0,0,3.0,2,1,0,0.794402,1.391948,2,17.486856,0
1,1,19.799054,1.743702,54.927529,1,2.0,2.847264,1,0,3.28926,1,1,0,1.680844,2.0,2,18.065315,0
2,0,17.823438,1.708406,50.0,1,1.642241,1.099231,1,0,3.45259,1,0,0,0.418875,1.0,2,17.131202,0
3,0,19.007177,1.690727,49.895716,1,1.212908,1.029703,1,0,3.207071,1,0,0,2.0,1.0,2,17.454857,0
4,1,19.72925,1.793315,58.19515,1,2.508835,2.076933,0,0,3.435905,1,1,0,2.026668,1.443328,4,18.095627,0


In [4]:
# df_bmi
df_bmi = df_h[['BMI', 'Body_Level']]
df_bmi.head()

Unnamed: 0,BMI,Body_Level
0,17.486856,0
1,18.065315,0
2,17.131202,0
3,17.454857,0
4,18.095627,0


In [5]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(df_bmi['BMI'], df_bmi['Body_Level'], test_size=0.2, random_state=42)
X_train = np.array(X_train).reshape(-1, 1)
X_test = np.array(X_test).reshape(-1, 1)

In [6]:
def RunModel(model, X_train, y_train, X_test, y_test, name=None,
             Train_Report=False, Train_Conf_Mat=False, Train_F1=False,
             Test_Report=False, Test_Conf_Mat=False, Test_F1=False,
             Bias_Var=False, Bias_Var_Rounds=50):
    #
    if name is None:
        name = model.__class__.__name__
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred = model.predict(X_test)

    if name == 'Linear Regression':
      y_pred, y_pred_train = np.round(y_pred), np.round(y_pred_train)
      y_pred = np.where(y_pred == 4, 3, y_pred)
      y_pred_train = np.where(y_pred_train == 4, 3, y_pred_train)

    if Train_Report or Train_Conf_Mat or Train_F1:
        print(f'Training Metrics for {name}:')
        if Train_Report:
            print(classification_report(y_train, y_pred_train, digits=4, zero_division=0))
        if Train_Conf_Mat:
            print(confusion_matrix(y_train, y_pred_train))
        if Train_F1:
            print(classification_report(
                y_train, y_pred_train, digits=4, zero_division=0).split('\n')[-2])

    if Test_Report or Test_Conf_Mat or Test_F1:
        print(f'\nTesting Metrics for {name}:')
        if Test_Report:
            print(classification_report(y_test, y_pred, digits=4, zero_division=0))
        if Test_Conf_Mat:
            print(confusion_matrix(y_test, y_pred))
        if Test_F1:
            print(classification_report(
                y_test, y_pred, digits=4, zero_division=0).split('\n')[-2])

    if Bias_Var:
        print(f'\nBias-Variance Decomposition for {name}:')
        avg_expected_loss, avg_bias, avg_var = \
            bias_variance_decomp(model, X_train.values, y_train.values,
                                 X_test.values, y_test.values,
                                 loss='0-1_loss', random_seed=42, num_rounds=Bias_Var_Rounds)

        print(f'Avg. Exp. Loss: {avg_expected_loss:.4f}')
        print(f'Avg. Bias: {avg_bias:.4f}')
        print(f'Avg. Variance: {avg_var:.4f}')

    return model

In [50]:
# logistic regression, we have 4 classes
lr = LogisticRegression( solver='lbfgs', multi_class='multinomial', max_iter=5000, random_state=0, penalty='l2')
lr = RunModel(lr, X_train, y_train, X_test, y_test,
            Train_Conf_Mat=True, Test_Conf_Mat=True, Bias_Var=False)

Training Metrics for Logistic Reg:
              precision    recall  f1-score   support

           0     0.9632    0.9874    0.9752       159
           1     0.9474    0.9231    0.9351       156
           2     0.9753    0.9753    0.9753       324
           3     0.9963    0.9963    0.9963       542

    accuracy                         0.9797      1181
   macro avg     0.9705    0.9705    0.9705      1181
weighted avg     0.9796    0.9797    0.9796      1181


Testing Metrics for Logistic Reg:
              precision    recall  f1-score   support

           0     0.9688    1.0000    0.9841        31
           1     1.0000    0.9111    0.9535        45
           2     0.9643    0.9878    0.9759        82
           3     0.9928    1.0000    0.9964       138

    accuracy                         0.9831       296
   macro avg     0.9815    0.9747    0.9775       296
weighted avg     0.9835    0.9831    0.9829       296

[[ 31   0   0   0]
 [  1  41   3   0]
 [  0   0  81   1]
 [ 

In [51]:
# linear regression
lr = LinearRegression()

lr = RunModel(lr, X_train, y_train, X_test, y_test, name='Linear Regression',
               Train_Conf_Mat=True, Train_Report=True,
               Test_Report=True, Test_Conf_Mat=True, Bias_Var=True)

Training Metrics for Linear Regression:
              precision    recall  f1-score   support

         0.0     1.0000    0.0943    0.1724       159
         1.0     0.5065    1.0000    0.6724       156
         2.0     0.6653    0.9753    0.7910       324
         3.0     1.0000    0.5166    0.6813       542
         4.0     0.0000    0.0000    0.0000         0

    accuracy                         0.6494      1181
   macro avg     0.6344    0.5173    0.4634      1181
weighted avg     0.8430    0.6494    0.6417      1181

[[ 15 144   0   0   0]
 [  0 156   0   0   0]
 [  0   8 316   0   0]
 [  0   0 159 280 103]
 [  0   0   0   0   0]]

Testing Metrics for Linear Regression:
              precision    recall  f1-score   support

         0.0     1.0000    0.2258    0.3684        31
         1.0     0.6522    1.0000    0.7895        45
         2.0     0.6560    1.0000    0.7923        82
         3.0     1.0000    0.4565    0.6269       138
         4.0     0.0000    0.0000    0.0000 

In [52]:
# SVM
# svm = SVC(kernel='linear', C=1.0, random_state=0)
svm = SVC(kernel='linear', C= 100, random_state=0)

svm = RunModel(svm, X_train, y_train, X_test, y_test, name='SVM',
               Train_Conf_Mat=True, Train_Report=True,
               Test_Report=True, Test_Conf_Mat=True, Bias_Var=False)

Training Metrics for SVM:
              precision    recall  f1-score   support

           0     0.9812    0.9874    0.9843       159
           1     0.9441    0.9744    0.9590       156
           2     0.9905    0.9691    0.9797       324
           3     0.9945    0.9963    0.9954       542

    accuracy                         0.9848      1181
   macro avg     0.9776    0.9818    0.9796      1181
weighted avg     0.9850    0.9848    0.9848      1181

[[157   2   0   0]
 [  3 152   1   0]
 [  0   7 314   3]
 [  0   0   2 540]]

Testing Metrics for SVM:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    0.9778    0.9888        45
           2     0.9878    0.9878    0.9878        82
           3     0.9928    1.0000    0.9964       138

    accuracy                         0.9932       296
   macro avg     0.9952    0.9914    0.9932       296
weighted avg     0.9933    0.9932    0.9932       296

[

In [53]:
# random forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf = RunModel(rf, X_train, y_train, X_test, y_test,
               Train_Conf_Mat=True, Train_Report=True,
               Test_Report=True, Test_Conf_Mat=True, Bias_Var=False)

Random Forest
[[ 31   0   0   0]
 [  0  45   0   0]
 [  0   0  81   1]
 [  0   0   3 135]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        31
           1       1.00      1.00      1.00        45
           2       0.96      0.99      0.98        82
           3       0.99      0.98      0.99       138

    accuracy                           0.99       296
   macro avg       0.99      0.99      0.99       296
weighted avg       0.99      0.99      0.99       296

Weighted F1 score:  0.9865185738020144


In [54]:
# XGBoost
xgb_model = xgb.XGBClassifier(
                            booster='gbtree',
                            learning_rate=0.3,
                            max_depth=3,
                            objective="multi:softprob",
                            random_state=42,
                            num_class=4,
                            # eval_metric="auc",
                            eval_metric="mlogloss",
                            )
xgb_model = RunModel(xgb_model, X_train, y_train, X_test, y_test,
               Train_Conf_Mat=True, Train_Report=True,
               Test_Report=True, Test_Conf_Mat=True, Bias_Var=False)

Training Metrics for XGBoost:
[[159   0   0   0]
 [  0 156   0   0]
 [  0   0 324   0]
 [  0   0   0 542]]

Testing Metrics for XGBoost:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    1.0000    1.0000        45
           2     0.9643    0.9878    0.9759        82
           3     0.9926    0.9783    0.9854       138

    accuracy                         0.9865       296
   macro avg     0.9892    0.9915    0.9903       296
weighted avg     0.9867    0.9865    0.9865       296

[[ 31   0   0   0]
 [  0  45   0   0]
 [  0   0  81   1]
 [  0   0   3 135]]


In [55]:
# lightgbm
lgb_model = LGBMClassifier(
    boosting_type='gbdt',
    class_weight=None,
    # importance_type='split',
    learning_rate=0.2,
    # max_depth=-1,
    # min_child_samples=20,
    # min_child_weight=0.001,
    # min_split_gain=0.0,
    n_estimators=1000,
    # num_leaves=31,
    objective='multiclass',
    # reg_alpha=0.0,
    # reg_lambda=0.,
    # verbose=-10
)

# lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_model = RunModel(lgb_model, X_train, y_train, X_test, y_test,
               Train_Conf_Mat=True, Train_Report=True,
               Test_Report=True, Test_Conf_Mat=True, Bias_Var=False)

Training Metrics for LightGBM:
[[154   5   0   0]
 [  0 153   3   0]
 [  0   4 317   3]
 [  0   0   2 540]]

Testing Metrics for LightGBM:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    0.9778    0.9888        45
           2     0.9878    0.9878    0.9878        82
           3     0.9928    1.0000    0.9964       138

    accuracy                         0.9932       296
   macro avg     0.9952    0.9914    0.9932       296
weighted avg     0.9933    0.9932    0.9932       296

[[ 31   0   0   0]
 [  0  44   1   0]
 [  0   0  81   1]
 [  0   0   0 138]]


In [56]:
# CatBoost
model = CatBoostClassifier(
    iterations=100,
    learning_rate=0.1,
    depth=6,
    random_seed=42,
    verbose=False
)

# Fit the model on the training data
model = RunModel(model, X_train, y_train, X_test, y_test,
               Train_Conf_Mat=True, Train_Report=True,
               Test_Report=True, Test_Conf_Mat=True, Bias_Var=False)

Training Metrics for Catboost:
              precision    recall  f1-score   support

           0     1.0000    0.9748    0.9873       159
           1     0.9341    1.0000    0.9659       156
           2     0.9937    0.9722    0.9828       324
           3     0.9963    0.9963    0.9963       542

    accuracy                         0.9873      1181
   macro avg     0.9810    0.9858    0.9831      1181
weighted avg     0.9879    0.9873    0.9874      1181

[[155   4   0   0]
 [  0 156   0   0]
 [  0   7 315   2]
 [  0   0   2 540]]

Testing Metrics for Catboost:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    1.0000    1.0000        45
           2     1.0000    0.9878    0.9939        82
           3     0.9928    1.0000    0.9964       138

    accuracy                         0.9966       296
   macro avg     0.9982    0.9970    0.9976       296
weighted avg     0.9966    0.9966    0.9966   

In [57]:
bmi_0 = df_bmi[df_bmi['Body_Level'] == 0]
bmi_1 = df_bmi[df_bmi['Body_Level'] == 1]
bmi_2 = df_bmi[df_bmi['Body_Level'] == 2]
bmi_3 = df_bmi[df_bmi['Body_Level'] == 3]

In [58]:
print(bmi_0['BMI'].min(), bmi_0['BMI'].max())
print(bmi_1['BMI'].min(), bmi_1['BMI'].max())
print(bmi_2['BMI'].min(), bmi_2['BMI'].max())
print(bmi_3['BMI'].min(), bmi_3['BMI'].max())

13.291587901701323 19.08220593702959
18.51851851851852 24.840980089578082
22.826738618008303 30.36287670928922
29.911958308167144 50.81175280566433


In [73]:
# https://www.cdc.gov/obesity/basics/adult-defining.html

class BMI():
  def __init__(self):
    self.v0, self.v1, self.v2 = 18.5, 25, 30

  def predict(self, X):
    if(len(X.shape)==2):
      X = X.reshape(-1)
    assert len(X.shape) == 1, "Pass only 2D or 1D np arrays"
    pred = np.zeros(len(X))
    pred[X < self.v0] = 0
    pred[(X >= self.v0) & (X < self.v1)] = 1
    pred[(X >= self.v1) & (X< self.v2)] = 2
    pred[(X >= self.v2)] = 3
    return pred

BMI_clf = BMI()
y_pred_train_manually = BMI_clf.predict(X_train)
y_pred_manually = BMI_clf.predict(X_test)


print('Training Metrics for Manual:')
print(classification_report(y_train, y_pred_train_manually, digits=4))
print(confusion_matrix(y_train, y_pred_train_manually))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

print('\nTesting Metrics for Manual:')
print(classification_report(y_test, y_pred_manually, digits=4))
print(confusion_matrix(y_test, y_pred_manually))
# print(confusion_matrix(y_test, y_pred, normalize='true'))


Training Metrics for Manual:
              precision    recall  f1-score   support

           0     1.0000    0.9748    0.9873       159
           1     0.9231    1.0000    0.9600       156
           2     0.9936    0.9630    0.9781       324
           3     0.9945    0.9963    0.9954       542

    accuracy                         0.9848      1181
   macro avg     0.9778    0.9835    0.9802      1181
weighted avg     0.9856    0.9848    0.9849      1181

[[155   4   0   0]
 [  0 156   0   0]
 [  0   9 312   3]
 [  0   0   2 540]]

Testing Metrics for Manual:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    1.0000    1.0000        45
           2     1.0000    0.9878    0.9939        82
           3     0.9928    1.0000    0.9964       138

    accuracy                         0.9966       296
   macro avg     0.9982    0.9970    0.9976       296
weighted avg     0.9966    0.9966    0.9966       