In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataset = pd.read_csv("obesity.csv")
print("Classes in the dataset:", dataset['NObeyesdad'].unique())

Classes in the dataset: ['Normal_Weight' 'Overweight_Level_I' 'Overweight_Level_II'
 'Obesity_Type_I' 'Insufficient_Weight' 'Obesity_Type_II'
 'Obesity_Type_III']


One hot encoding + Scaling the Data

In [3]:
categories = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'CAEC', 'CALC', 'MTRANS', 'NObeyesdad']
for c in categories:
  dummies = pd.get_dummies(dataset[c], prefix=(str(c)+"_"))
  dataset = pd.concat([dataset, dummies], axis=1)
dataset = dataset.drop(columns=categories)

In [4]:
d = ['NObeyesdad__Insufficient_Weight',
       'NObeyesdad__Normal_Weight', 'NObeyesdad__Obesity_Type_I',
       'NObeyesdad__Obesity_Type_II', 'NObeyesdad__Obesity_Type_III',
       'NObeyesdad__Overweight_Level_I', 'NObeyesdad__Overweight_Level_II']
X = dataset.drop(columns=d)
y = dataset[d]

In [5]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)

Split the data and run the model

In [40]:
data_train, data_test, class_train, class_test = train_test_split(X, y, test_size=0.2, random_state=15)
print(data_train.shape)
print(class_train.shape)
mlp = MLPClassifier(solver = 'sgd', activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (17, 20), max_iter = 400, random_state=0)

(1688, 31)
(1688, 7)


In [57]:
mlp.fit(data_train, class_train)

In [58]:
pred = mlp.predict(data_test)

In [59]:
print("Testing Accuracy : ", accuracy_score(class_test, pred))
print("Testing Mean Square Error : ", mean_squared_error(class_test, pred))

Testing Accuracy :  0.9243498817966903
Testing Mean Square Error :  0.019250253292806486


In [44]:
train_pred = mlp.predict(data_train)
print("Training Accuracy : ",accuracy_score(train_pred, class_train))
print("Training Mean Square Error : ", mean_squared_error(class_train, train_pred))

Training Accuracy :  0.9940758293838863
Training Mean Square Error :  0.0014387271496276233


The testing accuracy is around the training accuracy and the difference is around 0.07 which is sizeable but needs further testing to claim overfitting. The testing MSE is also only 0.02 larger than the training MSE, so this model is good in that regard. From what we have so far, there may not be overfitting, but let's check using Cross Validation and regularization techniques to find an improvement.

In [45]:
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred))

[[[372   3]
  [  1  47]]

 [[367   1]
  [ 10  45]]

 [[353   3]
  [  4  63]]

 [[364   0]
  [  2  57]]

 [[365   1]
  [  0  57]]

 [[351   8]
  [  7  57]]

 [[339  11]
  [  6  67]]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        48
           1       0.98      0.82      0.89        55
           2       0.95      0.94      0.95        67
           3       1.00      0.97      0.98        59
           4       0.98      1.00      0.99        57
           5       0.88      0.89      0.88        64
           6       0.86      0.92      0.89        73

   micro avg       0.94      0.93      0.93       423
   macro avg       0.94      0.93      0.93       423
weighted avg       0.94      0.93      0.93       423
 samples avg       0.93      0.93      0.93       423



Cross Validation

In [46]:
from sklearn.model_selection import cross_validate
mlpCV = cross_validate(mlp, X, y, cv=10, scoring=['accuracy', 'neg_mean_squared_error'])
#print all the accuracy values from each iteration
print('Accuracy')
print(mlpCV['test_accuracy'])
#print all the MSE values from each iteration
print('MSE')
print(-1*mlpCV['test_neg_mean_squared_error'])
print('Average Accuracy = ',  sum(mlpCV['test_accuracy'])/len(mlpCV['test_accuracy']))
print('Average MSE = ', sum(-1 * mlpCV['test_neg_mean_squared_error']) / len(mlpCV['test_neg_mean_squared_error']))

Accuracy
[0.73113208 0.73459716 0.8957346  0.96208531 0.87203791 0.95734597
 0.97156398 0.99526066 1.         1.        ]
MSE
[0.07210243 0.07109005 0.02979012 0.00947867 0.0338524  0.01083277
 0.00812458 0.0013541  0.         0.        ]
Average Accuracy =  0.9119757667888759
Average MSE =  0.023662510698636962


For some folds the accuracy is lower at 73.1% and 73.4% so there is still room to improve this model, but since it performs well in 8 of the 10 folds, it is still a valuable model.


Hyperparameter Tuning

In [52]:
#MLP hyperparameter tuning

#logic from hw3demo
#set up parameters
max_iterations = [500,400,600]
hidden_layer_size = [(17, 20), (13, 15), (20, 24)]
activations = ["logistic", "relu", "tanh"]
learning_rate_inits = [0.3,0.4,0.5]
params = dict(activation = activations, hidden_layer_sizes = hidden_layer_size, max_iter = max_iterations, learning_rate_init = learning_rate_inits)
grid = GridSearchCV(estimator = mlp, param_grid=params, scoring="accuracy")
grid.fit(X, y)

In [53]:
#results
print("Optimal Hyper-Parameters:", grid.best_params_)
print("Optimal Accuracy:", grid.best_score_)

Optimal Hyper-Parameters: {'activation': 'logistic', 'hidden_layer_sizes': (20, 24), 'learning_rate_init': 0.3, 'max_iter': 500}
Optimal Accuracy: 0.8490022744333524


In [54]:
#make optimal MLP
optimalMLP = MLPClassifier(solver = 'sgd', activation = 'logistic', learning_rate_init = 0.3, batch_size = 100, hidden_layer_sizes = (20, 24), max_iter = 500, random_state=0)
#fit and predict
optimalMLP.fit(data_train, class_train)
pred = optimalMLP.predict(data_test)
#results
print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

Accuracy :  0.933806146572104
Mean Square Error :  0.016210739614994935


After tuning the accuracy and mean squared error both improved.

In [62]:
#MORE results
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred))

[[[372   3]
  [  1  47]]

 [[367   1]
  [ 10  45]]

 [[353   3]
  [  4  63]]

 [[364   0]
  [  2  57]]

 [[365   1]
  [  0  57]]

 [[351   8]
  [  7  57]]

 [[339  11]
  [  6  67]]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.94      0.98      0.96        48
           1       0.98      0.82      0.89        55
           2       0.95      0.94      0.95        67
           3       1.00      0.97      0.98        59
           4       0.98      1.00      0.99        57
           5       0.88      0.89      0.88        64
           6       0.86      0.92      0.89        73

   micro avg       0.94      0.93      0.93       423
   macro avg       0.94      0.93      0.93       423
weighted avg       0.94      0.93      0.93       423
 samples avg       0.93      0.93      0.93       423



In [61]:
#Cross Validation for Optimized MLP
optimal_CV = cross_validate(optimalMLP, X, y, cv=10, scoring=['accuracy', 'neg_mean_squared_error'])
#print all the accuracy values from each iteration
print('Accuracy')
print(optimal_CV['test_accuracy'])
#print all the MSE values from each iteration
print('MSE')
print(-1*optimal_CV['test_neg_mean_squared_error'])

print('Average Accuracy = ', sum(optimal_CV['test_accuracy']) / len(optimal_CV['test_accuracy']))
print('Average MSE = ', sum(-1 * optimal_CV['test_neg_mean_squared_error']) / len(optimal_CV['test_neg_mean_squared_error']))

Accuracy
[0.70283019 0.79620853 0.9478673  0.96682464 0.95734597 0.83412322
 0.96682464 0.98578199 1.         1.        ]
MSE
[0.07681941 0.05484089 0.01354096 0.00609343 0.01150982 0.04333108
 0.00812458 0.00406229 0.         0.        ]
Average Accuracy =  0.915780649199678
Average MSE =  0.021832245372440315


After tuning, the new model gives out a good CV accuracy at 91.6% but since the cross-validation for 2 of the folds are very low, we should look into different types of models like a DNN.