In [103]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import plot_model

In [104]:
# Load the raw data and one-hot encode the categorical predictors.
temp = pd.read_csv("obesity.csv")
columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
# One call instead of a get_dummies/concat loop. prefix_sep="__" reproduces the
# "<column>__<level>" names the loop generated (prefix "<column>_" + default "_").
temp = pd.get_dummies(temp, columns=columns, prefix_sep="__")


# Encode the target as ordered codes 0..6, from lowest to highest weight class.
d = ['Insufficient_Weight',
       'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I',
       'Obesity_Type_II', 'Obesity_Type_III']
ordinal_encoder = OrdinalEncoder(categories=[d])
temp['NObeyesdad'] = ordinal_encoder.fit_transform(temp[['NObeyesdad']])
y_xgb = temp['NObeyesdad']

# `temp` now holds only the (unscaled) feature columns.
temp = temp.drop('NObeyesdad', axis=1)

# Split BEFORE fitting the scaler so that the min/max statistics come from the
# training rows only; fitting on the full frame leaks test-set information.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(temp, y_xgb, test_size=0.2, random_state=15)
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X_train_raw)

# `X` is the whole feature frame scaled with the train-fitted scaler; rows outside
# the training split may therefore fall slightly outside [0, 1].
X = pd.DataFrame(data = scaler.transform(temp), columns = temp.columns, index = temp.index)
X_train = X.loc[X_train_raw.index]
X_test = X.loc[X_test_raw.index]

In [117]:
# Baseline: XGBoost with default settings and no early stopping.
trial = XGBClassifier(objective='multi:softprob')
trial.fit(X_train, y_train)

# Predict once per split and score those predictions directly.
y_pred = trial.predict(X_test)
y_train_pred = trial.predict(X_train)

accuracy = accuracy_score(y_test, y_pred)
print(" Test Accuracy: %.4f%%" % (accuracy * 100.0))

train_accuracy = accuracy_score(y_train, y_train_pred)
print(" Train Accuracy: %.4f%%" % (train_accuracy * 100.0))

 Test Accuracy: 98.1132%
 Train Accuracy: 100.0000%


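The test accuracy here is really high (98.1132% in the output shown above; 97.8723% was recorded in an earlier run) and the training accuracy is a perfect 100%, so we should check for overfitting, even though this model seems to be great at first glance.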

In [109]:
# Split the 20% hold-out into two halves: a validation set and a test set, each
# about 10% of the initial data.
# The hold-out is re-derived from X / y_xgb (same test_size and random_state as the
# original train/test split) instead of from X_test itself, so re-running this cell
# always yields the same halves rather than shrinking the test set each time.
_, X_holdout, _, y_holdout = train_test_split(X, y_xgb, test_size=0.2, random_state=15)
X_val, X_test, y_val, y_test = train_test_split(X_holdout, y_holdout, test_size=0.5, random_state=15)

In [113]:
# Early-stopped XGBoost: boosting halts once the validation error ('merror') has not
# improved for `early_stopping_rounds` consecutive rounds.
num_boost_rounds = 50
early_stopping_rounds = 6
earlystopxgb = XGBClassifier(
    objective='multi:softprob',
    num_class=7,
    eval_metric=['merror'],
    eta=0.1,
    # Previously `num_boost_rounds` was defined but never passed, so the model
    # silently used the library's default number of boosting rounds.
    n_estimators=num_boost_rounds,
    early_stopping_rounds=early_stopping_rounds,
)
earlystopxgb.fit(X_train, y_train, eval_set=[(X_val, y_val)])

# The validation set also drives early stopping, so this score is optimistic.
y_pred = earlystopxgb.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)

# The test set is not used during fitting.
y_test_pred = earlystopxgb.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

[0]	validation_0-merror:0.10377
[1]	validation_0-merror:0.10377
[2]	validation_0-merror:0.11321
[3]	validation_0-merror:0.10377
[4]	validation_0-merror:0.09434
[5]	validation_0-merror:0.06604
[6]	validation_0-merror:0.06604
[7]	validation_0-merror:0.05660
[8]	validation_0-merror:0.05660
[9]	validation_0-merror:0.05660
[10]	validation_0-merror:0.04717
[11]	validation_0-merror:0.03774
[12]	validation_0-merror:0.03774
[13]	validation_0-merror:0.03774
[14]	validation_0-merror:0.03774
[15]	validation_0-merror:0.03774
[16]	validation_0-merror:0.03774
[17]	validation_0-merror:0.02830
[18]	validation_0-merror:0.02830
[19]	validation_0-merror:0.02830
[20]	validation_0-merror:0.02830
[21]	validation_0-merror:0.02830
[22]	validation_0-merror:0.02830
[23]	validation_0-merror:0.03774
Validation Accuracy: 0.9716981132075472
Test Accuracy: 0.9150943396226415


In [114]:
# Training-set accuracy of the early-stopped model, for comparison with the
# validation and test scores.
y_train_pred = earlystopxgb.predict(X_train)
early_stop_train_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", early_stop_train_accuracy)

Train Accuracy: 0.9840047393364929


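With early stopping, the test accuracy is 91.5%, which is more reasonable. The validation accuracy is high at 97.17% and the training accuracy is 98.4%. Because the validation set is also used to decide when to stop boosting, its score is optimistic; the test accuracy is the better estimate of performance on unseen data.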

In [116]:
#XGB Cross Validation - https://machinelearningmastery.com/evaluate-gradient-boosting-models-xgboost-python/
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
def train_evaluateXGB(model, xtrain, ytrain, xtest, ytest):
  """Fit `model` on one split and return its score on the other.

  The evaluation split is also handed to `fit` as `eval_set`, so the model
  can monitor it while training.

  Args:
    model: estimator exposing `fit(X, y, eval_set=...)` and `score(X, y)`.
    xtrain, ytrain: features and labels used for fitting.
    xtest, ytest: features and labels used as `eval_set` and for scoring.

  Returns:
    Whatever `model.score(xtest, ytest)` returns.
  """
  evaluation_split = (xtest, ytest)
  model.fit(xtrain, ytrain, eval_set=[evaluation_split])
  return model.score(*evaluation_split)
from sklearn.base import clone

# 10-fold stratified cross-validation of the early-stopping configuration.
kFold = StratifiedKFold(n_splits = 10)
n_folds = 0
accuracy_total = 0
for i, (train, test) in enumerate(kFold.split(temp, y_xgb)):
  print("Current Iteration:",i)
  # Train an unfitted copy for every fold. Assigning `earlystopxgb` itself would
  # refit that shared object in place, so the model saved later would be the
  # last-fold model rather than the one evaluated on the validation/test sets.
  modelxgb = clone(earlystopxgb)
  # split() yields positional indices, hence .iloc rather than .loc.
  # NOTE: train_evaluateXGB passes each held-out fold as eval_set, so with early
  # stopping enabled the fold score is measured on the data that chose the
  # stopping round.
  acc = train_evaluateXGB(modelxgb, temp.iloc[train], y_xgb.iloc[train], temp.iloc[test], y_xgb.iloc[test])
  print("Accuracy for each Fold: ", acc)
  n_folds += 1
  accuracy_total += acc  # dedicated accumulator; the old `sum` shadowed the builtin

print("Mean accuracy:", (accuracy_total/n_folds))
#Mean accuracy: 0.9237950460520432

Current Iteration: 0
[0]	validation_0-merror:0.22642
[1]	validation_0-merror:0.23585
[2]	validation_0-merror:0.22170
[3]	validation_0-merror:0.22642
[4]	validation_0-merror:0.22642
[5]	validation_0-merror:0.20755
[6]	validation_0-merror:0.18868
[7]	validation_0-merror:0.20283
[8]	validation_0-merror:0.19811
[9]	validation_0-merror:0.20283
[10]	validation_0-merror:0.20283
[11]	validation_0-merror:0.19811
Accuracy for each Fold:  0.8113207547169812
Current Iteration: 1
[0]	validation_0-merror:0.17536
[1]	validation_0-merror:0.18483
[2]	validation_0-merror:0.18957
[3]	validation_0-merror:0.16588
[4]	validation_0-merror:0.15166
[5]	validation_0-merror:0.14218
[6]	validation_0-merror:0.13270
[7]	validation_0-merror:0.13270
[8]	validation_0-merror:0.12796
[9]	validation_0-merror:0.12796
[10]	validation_0-merror:0.12796
[11]	validation_0-merror:0.11374
[12]	validation_0-merror:0.11374
[13]	validation_0-merror:0.10427
[14]	validation_0-merror:0.10427
[15]	validation_0-merror:0.10427
[16]	valid

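Our cross-validation ran well too, giving a mean accuracy of 94.4% (note that the code comment above records 92.4% from another run). The model performs noticeably worse on the first fold (about 81% accuracy) but does really well on all of the other folds, with a low error rate as well.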

In Conclusion

In [118]:
import pickle

In [119]:
# Persist the early-stopped model to disk.
filename = 'xgb_model.sav'
# Use a context manager so the file handle is flushed and closed even if
# pickling raises; the bare open() call previously leaked the handle.
with open(filename, 'wb') as model_file:
  pickle.dump(earlystopxgb, model_file)
