In [1]:
import sys; sys.path.append('..'); sys.path.append('../Preprocess')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier


# use sklearn metrics, single function
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import Preprocess.preprocessor as preprocessor
from imblearn.over_sampling import SMOTE

import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
from catboost import CatBoostClassifier

from mlxtend.evaluate import bias_variance_decomp

In [2]:
from master_script import TEST

In [3]:
# Read the labelled training CSV through the project's preprocessor helper.
# `df` is kept as loaded; `df_h` is the working copy that later cells transform.
df = preprocessor.ReadData(
    label='Body_Level',
    pth='../Dataset/body_level_classification_train.csv',
)
df_h = df.copy()
df_h.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,Body_Level
0,Female,22.547298,1.722461,51.881263,yes,2.663421,1.04111,no,no,3.0,Frequently,yes,no,0.794402,1.391948,Public_Transportation,Body Level 1
1,Male,19.799054,1.743702,54.927529,yes,2.0,2.847264,Sometimes,no,3.28926,Sometimes,yes,no,1.680844,2.0,Public_Transportation,Body Level 1
2,Female,17.823438,1.708406,50.0,yes,1.642241,1.099231,Sometimes,no,3.45259,Sometimes,no,no,0.418875,1.0,Public_Transportation,Body Level 1
3,Female,19.007177,1.690727,49.895716,yes,1.212908,1.029703,Sometimes,no,3.207071,Sometimes,no,no,2.0,1.0,Public_Transportation,Body Level 1
4,Male,19.72925,1.793315,58.19515,yes,2.508835,2.076933,no,no,3.435905,Sometimes,yes,no,2.026668,1.443328,Automobile,Body Level 1


In [4]:
# Export every column except the last one (the Body_Level label) to test.csv.
# The labels are not written to disk.
# NOTE(review): presumably test.csv is the input consumed by TEST() — confirm
# in master_script.
#
# Slice the untouched `df` rather than the working copy `df_h`: `df_h` is
# reassigned by the preprocessing cell, so re-running this cell afterwards
# would otherwise export already encoded/augmented features. On a fresh
# top-to-bottom run both frames hold the same data at this point.
df.iloc[:, :-1].to_csv('test.csv', index=False)


In [5]:
# Preprocessing switches.
AGGREGATE = True      # add augmented columns such as BMI
DISCRETIZE = False    # forwarded to preprocessor.Aggregate(discretize=...)
ONE_HOT = False       # one-hot encode the categorical columns
RESAMPLE = False      # resample the data with preprocessor.Resample
APPLY_SMOTE = False   # resample using SMOTE; takes precedence over RESAMPLE

# Always start from a fresh copy of the raw frame `df` so this cell is
# idempotent: re-running it must not encode/aggregate data that has already
# been processed. On a linear run this is the same data `df_h` held before.
df_h = preprocessor.LabelOrdinalEncode(df.copy())
if AGGREGATE:
    df_h = preprocessor.Aggregate(df_h, discretize=DISCRETIZE)
if ONE_HOT:
    df_h = preprocessor.OneHotEncode(df_h, label='Body_Level')
# SMOTE and plain resampling are mutually exclusive; SMOTE wins if both are set.
if APPLY_SMOTE:
    df_h = preprocessor.SMOTE(df_h)
elif RESAMPLE:
    df_h = preprocessor.Resample(df_h)

df_h.head()

Unnamed: 0,Gender,Age,Height,Weight,H_Cal_Consump,Veg_Consump,Water_Consump,Alcohol_Consump,Smoking,Meal_Count,Food_Between_Meals,Fam_Hist,H_Cal_Burn,Phys_Act,Time_E_Dev,Transport,BMI,Body_Level
0,0,22.547298,1.722461,51.881263,1,2.663421,1.04111,0,0,3.0,2,1,0,0.794402,1.391948,2,17.486856,0
1,1,19.799054,1.743702,54.927529,1,2.0,2.847264,1,0,3.28926,1,1,0,1.680844,2.0,2,18.065315,0
2,0,17.823438,1.708406,50.0,1,1.642241,1.099231,1,0,3.45259,1,0,0,0.418875,1.0,2,17.131202,0
3,0,19.007177,1.690727,49.895716,1,1.212908,1.029703,1,0,3.207071,1,0,0,2.0,1.0,2,17.454857,0
4,1,19.72925,1.793315,58.19515,1,2.508835,2.076933,0,0,3.435905,1,1,0,2.026668,1.443328,4,18.095627,0


In [6]:
# Feature matrix = every column but the last; target vector = the last column.
X_train, y_train = df_h.iloc[:, :-1].values, df_h.iloc[:, -1].values

In [7]:
# XGBoost multi-class classifier: hyper-parameters are gathered in one place,
# then the model is fitted on the full training arrays and persisted.
xgb_params = {
    'booster': 'gbtree',
    'learning_rate': 0.3,
    'max_depth': 3,
    'objective': "multi:softprob",
    'random_state': 42,    # fixed seed for reproducibility
    'num_class': 4,        # four Body_Level classes
    'eval_metric': "mlogloss",
}
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

# Persist the fitted model via the project helper.
# NOTE(review): presumably ./model.pkl is what TEST() loads — confirm in master_script.
preprocessor.Save_Model(xgb_model, path='./model.pkl')

In [8]:
# Run the TEST entry point imported from master_script.
# NOTE(review): presumably it loads ./model.pkl, predicts on test.csv and
# writes preds.txt (the file the next cell reads) — confirm in master_script.
TEST()

In [9]:
# Score the predictions in preds.txt against the labels held in memory.
# NOTE(review): the labels come from `df`, the same frame that produced both
# the training arrays and test.csv, so these scores are measured on rows the
# model was trained on. Treat them as a pipeline sanity check, not as
# held-out performance.
preds = pd.read_csv('preds.txt', header=None)
preds.columns = ['Body_Level']
df_test_label = df.iloc[:, -1].values

print('Testing Metrics for XGBoost:')
print(
    classification_report(
        y_true=df_test_label,
        y_pred=preds,
        digits=4,
        zero_division=0,
    )
)
print(confusion_matrix(y_true=df_test_label, y_pred=preds))

Testing Metrics for XGBoost:
              precision    recall  f1-score   support

Body Level 1     1.0000    1.0000    1.0000       190
Body Level 2     1.0000    1.0000    1.0000       201
Body Level 3     1.0000    1.0000    1.0000       406
Body Level 4     1.0000    1.0000    1.0000       680

    accuracy                         1.0000      1477
   macro avg     1.0000    1.0000    1.0000      1477
weighted avg     1.0000    1.0000    1.0000      1477

[[190   0   0   0]
 [  0 201   0   0]
 [  0   0 406   0]
 [  0   0   0 680]]
