In [1]:
import sys; sys.path.append('../');
import Preprocess.preprocessor as preprocessor
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, make_scorer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

In [2]:
df_h = preprocessor.ReadData(pth='../Dataset/body_level_classification_train.csv', label='Body_Level')
df_h.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Body_Level
0,Female,22.547298,1.722461,51.881263,yes,2.663421,1.04111,no,no,3.0,Frequently,yes,no,0.794402,1.391948,Public_Transportation,Body Level 1
1,Male,19.799054,1.743702,54.927529,yes,2.0,2.847264,Sometimes,no,3.28926,Sometimes,yes,no,1.680844,2.0,Public_Transportation,Body Level 1
2,Female,17.823438,1.708406,50.0,yes,1.642241,1.099231,Sometimes,no,3.45259,Sometimes,no,no,0.418875,1.0,Public_Transportation,Body Level 1
3,Female,19.007177,1.690727,49.895716,yes,1.212908,1.029703,Sometimes,no,3.207071,Sometimes,no,no,2.0,1.0,Public_Transportation,Body Level 1
4,Male,19.72925,1.793315,58.19515,yes,2.508835,2.076933,no,no,3.435905,Sometimes,yes,no,2.026668,1.443328,Automobile,Body Level 1


In [3]:
# add augmented columns like BMI ...
AGGREGATE = True
DISCRETIZE = False
# one hot encoding for categorical columns
ONE_HOT = False
# resample data
RESAMPLE = False
# resmapling using SMOTE
APPLY_SMOTE = False
# Preprocess
df_h = preprocessor.LabelOrdinalEncode(df_h)
if AGGREGATE:
    df_h = preprocessor.Aggregate(df_h, discretize=DISCRETIZE)
if ONE_HOT:
    df_h = preprocessor.OneHotEncode(df_h, label='Body_Level')
if APPLY_SMOTE:
    df_h = preprocessor.SMOTE(df_h)
elif RESAMPLE:
    df_h = preprocessor.Resample(df_h)
#
df_h.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,BMI,Body_Level
0,0,22.547298,1.722461,51.881263,1,2.663421,1.04111,0,0,3.0,2,1,0,0.794402,1.391948,2,17.486856,0
1,1,19.799054,1.743702,54.927529,1,2.0,2.847264,1,0,3.28926,1,1,0,1.680844,2.0,2,18.065315,0
2,0,17.823438,1.708406,50.0,1,1.642241,1.099231,1,0,3.45259,1,0,0,0.418875,1.0,2,17.131202,0
3,0,19.007177,1.690727,49.895716,1,1.212908,1.029703,1,0,3.207071,1,0,0,2.0,1.0,2,17.454857,0
4,1,19.72925,1.793315,58.19515,1,2.508835,2.076933,0,0,3.435905,1,1,0,2.026668,1.443328,4,18.095627,0


In [4]:
X_train, X_test, y_train, y_test = preprocessor.Split(df_h, test_size=0.2, random_state=42)

print('X_train.shape: ', X_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_train.shape: ', y_train.shape)
print('y_test.shape: ', y_test.shape)

print(f'{X_train.head()=}')
print(f'{y_train.head()=}')

X_train.shape:  (1181, 17)
X_test.shape:  (296, 17)
y_train.shape:  (1181,)
y_test.shape:  (296,)
X_train.head()=      Gender        Age    Height      Weight  H_Cal_Consump  Veg_Consump   
660        1  33.081600  1.705617   83.016968              1     2.000000  \
933        0  26.000000  1.610636  105.423532              1     3.000000   
254        0  18.000000  1.600000   55.000000              1     2.000000   
1117       1  30.607546  1.757132  118.565568              1     2.918113   
812        1  40.501722  1.744974  111.169678              1     2.294259   

      Water_Consump  Alcohol_Consump  Smoking  Meal_Count  Food_Between_Meals   
660        2.991671                0        0    2.797600                   1  \
933        2.180566                1        0    3.000000                   1   
254        2.000000                1        0    4.000000                   2   
1117       2.240463                1        0    3.000000                   1   
812        1.870290

In [5]:
scorer = make_scorer(f1_score, average='weighted')

In [6]:
!mkdir Results/

mkdir: cannot create directory ‘Results/’: File exists


In [13]:
# GridSearch
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# SVM
param_grid = {'C': [0.1, 1, 3, 5, 7, 10, 12, 14, 15],
                'gamma': [4, 2, 1, 0.1, 0.01, 0.001, 0.0001],
                'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}

grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output the cv result to csv
pd.DataFrame(grid.cv_results_).to_csv('Results/svm_grid_search_cv.csv')


In [14]:

print(grid.best_params_)
print(grid.best_estimator_)
y_pred_train = grid.predict(X_train)
y_pred = grid.predict(X_test)


print('Training Metrics for SVM:')
# print(classification_report(y_train, y_pred_train, digits=4))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

# print(classification_report(y_train, y_pred_train, digits=4).split('\n')[-2])
# print(classification_report(y_test, y_pred, digits=4).split('\n')[-2])

print('\nTesting Metrics for SVM:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

{'C': 5, 'gamma': 4, 'kernel': 'linear'}
SVC(C=5, gamma=4, kernel='linear')
Training Metrics for SVM:
[[159   0   0   0]
 [  1 155   0   0]
 [  0   1 323   0]
 [  0   0   0 542]]

Testing Metrics for SVM:
              precision    recall  f1-score   support

           0     0.9118    1.0000    0.9538        31
           1     0.9545    0.9333    0.9438        45
           2     0.9875    0.9634    0.9753        82
           3     0.9928    0.9928    0.9928       138

    accuracy                         0.9764       296
   macro avg     0.9616    0.9724    0.9664       296
weighted avg     0.9770    0.9764    0.9764       296

[[ 31   0   0   0]
 [  3  42   0   0]
 [  0   2  79   1]
 [  0   0   1 137]]


In [20]:
# Random Forest
param_grid = {'n_estimators': [10, 50, 100, 200],
                'max_features': ['sqrt', 'log2'],
                'max_depth' : [4,5,6,7,8, 12, 16],
                'criterion' :['gini', 'entropy']}
grid = GridSearchCV(RandomForestClassifier(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output cv results to csv
pd.DataFrame(grid.cv_results_).to_csv( 'Results/random_forest_grid_search_results.csv')

In [21]:
# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)
y_pred_train = grid.predict(X_train)
y_pred = grid.predict(X_test)

print('Training Metrics for Random Forest:')
# print(classification_report(y_train, y_pred_train, digits=4))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

# print(classification_report(y_train, y_pred_train, digits=4).split('\n')[-2])
# print(classification_report(y_test, y_pred, digits=4).split('\n')[-2])

print('\nTesting Metrics for Random Forest:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))

{'criterion': 'gini', 'max_depth': 16, 'max_features': 'log2', 'n_estimators': 100}
RandomForestClassifier(max_depth=16, max_features='log2')
Training Metrics for Random Forest:
[[159   0   0   0]
 [  0 156   0   0]
 [  0   0 324   0]
 [  0   0   0 542]]

Testing Metrics for Random Forest:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    1.0000    1.0000        45
           2     1.0000    1.0000    1.0000        82
           3     1.0000    1.0000    1.0000       138

    accuracy                         1.0000       296
   macro avg     1.0000    1.0000    1.0000       296
weighted avg     1.0000    1.0000    1.0000       296

[[ 31   0   0   0]
 [  0  45   0   0]
 [  0   0  82   0]
 [  0   0   0 138]]


In [22]:
# LightGBM
param_grid = {
    # 'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'learning_rate': [0.05, 0.1, 0.2, .3, .5],
    # 'n_estimators': [50, 100, 200, 500],
    'n_estimators': [100, 500, 1000],
    # 'num_leaves': [4, 8, 16, 32, 64],
    # 'num_leaves': [32, 64],
    # 'max_depth': [4, 5, 6, 7, 8, 12, 16],
    # 'max_depth': [12, 16],
    'boosting_type' : ['gbdt', 'dart', 'goss'],
    'objective' : ['multiclass'],
    # 'random_state' : [501],
    # 'colsample_bytree' : [0.65, 0.66],
    # 'subsample' : [0.7,0.75],
    # 'reg_alpha' : [0.001, 1, 2.0],
    # 'reg_lambda' : [0.001, 1, 2.0],
}
grid = GridSearchCV(LGBMClassifier(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output cv results to csv
pd.DataFrame(grid.cv_results_).to_csv( 'Results/lightGBM_grid_search_results.csv')

In [24]:
# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)
y_pred_train = grid.predict(X_train)
y_pred = grid.predict(X_test)

print('Training Metrics for LightGBM:')
# print(classification_report(y_train, y_pred_train, digits=4))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

# print(classification_report(y_train, y_pred_train, digits=4).split('\n')[-2])
# print(classification_report(y_test, y_pred, digits=4).split('\n')[-2])

print('\nTesting Metrics for LightGBM:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))


{'boosting_type': 'dart', 'learning_rate': 0.1, 'n_estimators': 500, 'objective': 'multiclass'}
LGBMClassifier(boosting_type='dart', n_estimators=500, objective='multiclass')
Training Metrics for LightGBM:
[[159   0   0   0]
 [  0 156   0   0]
 [  0   0 324   0]
 [  0   0   0 542]]

Testing Metrics for LightGBM:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000        31
           1     1.0000    1.0000    1.0000        45
           2     0.9878    0.9878    0.9878        82
           3     0.9928    0.9928    0.9928       138

    accuracy                         0.9932       296
   macro avg     0.9951    0.9951    0.9951       296
weighted avg     0.9932    0.9932    0.9932       296

[[ 31   0   0   0]
 [  0  45   0   0]
 [  0   0  81   1]
 [  0   0   1 137]]


In [26]:
# XGBoost
param_grid = {
    # 'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'learning_rate': [0.05, 0.1, 0.2, .3],
    # 'n_estimators': [10, 50, 100, 200, 500],
    'n_estimators': [100, 1000],
    # 'max_depth': [4, 5, 6, 7, 8, 12, 16],
    # 'max_depth': [4, 8, 12, 16],
    # 'booster' : ['gbtree', 'gblinear', 'dart'],
    'objective' : ['multi:softmax'],
    # 'random_state' : [501], # Updated from 'seed'
    # 'colsample_bytree' : [0.65, 0.66],
    # 'subsample' : [0.7,0.75],
    'reg_alpha' : [.1, 1,1.2, 2],
    'reg_lambda' : [.1, 1,1.2,1.4, 2],
}

grid = GridSearchCV(XGBClassifier(), param_grid, refit=True, cv=5, scoring=scorer)
grid.fit(X_train, y_train)
# output cv results to csv
pd.DataFrame(grid.cv_results_).to_csv( 'Results/XGBoost_grid_search_results.csv')

In [28]:
# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)

y_pred_train = grid.predict(X_train)
y_pred = grid.predict(X_test)

print('Training Metrics for XGBoost:')
# print(classification_report(y_train, y_pred_train, digits=4))
print(confusion_matrix(y_train, y_pred_train))
# print(confusion_matrix(y_train, y_pred_train, normalize='true'))

# print(classification_report(y_train, y_pred_train, digits=4).split('\n')[-2])
# print(classification_report(y_test, y_pred, digits=4).split('\n')[-2])

print('\nTesting Metrics for XGBoost:')
print(classification_report(y_test, y_pred, digits=4))
print(confusion_matrix(y_test, y_pred))
# print(confusion_matrix(y_test, y_pred, normalize='true'))




{'learning_rate': 0.05, 'n_estimators': 1000, 'objective': 'multi:softmax', 'reg_alpha': 1, 'reg_lambda': 1}
XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.05, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=None, num_parallel_tree=None,
              objective='multi:softmax', predictor=None, ...)
Training Metrics for XGBoost:
[[159   0   0   0]
 [  0 156   0   0]
 [  0   0 324   0]
 [  0   0   0 542]]

Testing Metrics for XG