In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD
from keras.utils import plot_model

In [96]:
temp = pd.read_csv("obesity.csv")
columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
for c in columns:
  dummies = pd.get_dummies(temp[c], prefix=(str(c)+"_"))
  temp = pd.concat([temp, dummies], axis=1)
temp = temp.drop(columns=columns)


d = ['Insufficient_Weight',
       'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I',
       'Obesity_Type_II', 'Obesity_Type_III']
ordinal_encoder = OrdinalEncoder(categories=[d])
temp['NObeyesdad'] = ordinal_encoder.fit_transform(temp[['NObeyesdad']])
y_xgb = temp['NObeyesdad']

temp = temp.drop('NObeyesdad', axis=1)
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(temp)
X = pd.DataFrame(data = X_rescaled, columns = temp.columns)
X_train, X_test, y_train, y_test = train_test_split(X, y_xgb, test_size=0.2, random_state=15)

In [97]:
trial = XGBClassifier(objective='multi:softprob')
trial.fit(X_train, y_train)
y_pred = trial.predict(X_test)
accuracy = trial.score(X_test, y_test)
print("Accuracy: %.4f%%" % (accuracy * 100.0))

Accuracy: 97.8723%


The accuracy here is really high at 97.8723% so we should try to reduce any possible overfitting using early stopping.

In [98]:
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=15) #split the test set into two for a validation set of size 10% of initial data

In [99]:
num_boost_rounds = 100
early_stopping_rounds = 3
earlystopxgb = XGBClassifier(objective='multi:softprob', num_class=7,eval_metric=['merror'], eta=0.1, early_stopping_rounds=early_stopping_rounds)
earlystopxgb.fit(X_train, y_train, eval_set=[(X_val, y_val)])

y_pred = earlystopxgb.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
print("Validation Accuracy:", accuracy)
y_test_pred = earlystopxgb.predict(X_test)
print(accuracy_score(y_test, y_test_pred))

[0]	validation_0-merror:0.12796
[1]	validation_0-merror:0.12322
[2]	validation_0-merror:0.09479
[3]	validation_0-merror:0.07583
[4]	validation_0-merror:0.08531
[5]	validation_0-merror:0.08057
[6]	validation_0-merror:0.06635
[7]	validation_0-merror:0.06635
[8]	validation_0-merror:0.06161
[9]	validation_0-merror:0.05687
[10]	validation_0-merror:0.05687
[11]	validation_0-merror:0.05687
[12]	validation_0-merror:0.05687
Validation Accuracy: 0.943127962085308
0.9198113207547169


Now with the use of early-stopping, the test accuracy is 91.9% which is more reasonable and the validation accuracy was high as well at 94.31%.

In [100]:
#XGB Cross Validation - https://machinelearningmastery.com/evaluate-gradient-boosting-models-xgboost-python/
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
def train_evaluateXGB(model, xtrain, ytrain, xtest, ytest):
  model.fit(xtrain, ytrain, eval_set=[(xtest,ytest)])
  accuracy = model.score(xtest, ytest)
  return accuracy
kFold = StratifiedKFold(n_splits = 10)
i=0
sum=0
for train, test in kFold.split(temp,y_xgb):
  print("Current Iteration:",i)
  modelxgb = None
  modelxgb = earlystopxgb
  acc = train_evaluateXGB(modelxgb,temp.loc[[x for x in train]],y_xgb.loc[[x for x in train]],temp.loc[[x for x in test]],y_xgb.loc[[x for x in test]])
  i+=1
  sum += acc

print("Mean accuracy:", (sum/i))
#Mean accuracy: 0.9237950460520432

Current Iteration: 0
[0]	validation_0-merror:0.22642
[1]	validation_0-merror:0.23585
[2]	validation_0-merror:0.22170
[3]	validation_0-merror:0.22642
[4]	validation_0-merror:0.22642
[5]	validation_0-merror:0.20755
[6]	validation_0-merror:0.18868
[7]	validation_0-merror:0.20283
[8]	validation_0-merror:0.19811
[9]	validation_0-merror:0.20283
Current Iteration: 1
[0]	validation_0-merror:0.17536
[1]	validation_0-merror:0.18483
[2]	validation_0-merror:0.18957
[3]	validation_0-merror:0.16588
[4]	validation_0-merror:0.15166
[5]	validation_0-merror:0.14218
[6]	validation_0-merror:0.13270
[7]	validation_0-merror:0.13270
[8]	validation_0-merror:0.12796
[9]	validation_0-merror:0.12796
[10]	validation_0-merror:0.12796
[11]	validation_0-merror:0.11374
[12]	validation_0-merror:0.11374
[13]	validation_0-merror:0.10427
[14]	validation_0-merror:0.10427
[15]	validation_0-merror:0.10427
[16]	validation_0-merror:0.10900
Current Iteration: 2
[0]	validation_0-merror:0.11848
[1]	validation_0-merror:0.10427
[2

Our cross-validation ran well too giving a mean accuracy of 93.4% so our model does well.

In [101]:
import pickle

In [102]:
filename = 'xgb_model.sav'
pickle.dump(earlystopxgb, open(filename, 'wb'))
