In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

In [18]:
dataset = pd.read_csv("obesity.csv")
print("Classes in the dataset:", dataset['NObeyesdad'].unique())

Classes in the dataset: ['Normal_Weight' 'Overweight_Level_I' 'Overweight_Level_II'
 'Obesity_Type_I' 'Insufficient_Weight' 'Obesity_Type_II'
 'Obesity_Type_III']


In [19]:
categories = ['Gender', 'family_history_with_overweight', 'FAVC', 'SMOKE', 'SCC', 'CAEC', 'CALC', 'MTRANS', 'NObeyesdad']
for c in categories:
  dummies = pd.get_dummies(dataset[c], prefix=(str(c)+"_"))
  dataset = pd.concat([dataset, dummies], axis=1)
dataset = dataset.drop(columns=categories)

In [20]:
d = ['NObeyesdad__Insufficient_Weight',
       'NObeyesdad__Normal_Weight', 'NObeyesdad__Obesity_Type_I',
       'NObeyesdad__Obesity_Type_II', 'NObeyesdad__Obesity_Type_III',
       'NObeyesdad__Overweight_Level_I', 'NObeyesdad__Overweight_Level_II']
X = dataset.drop(columns=d)
y = dataset[d]
print(y.columns)

Index(['NObeyesdad__Insufficient_Weight', 'NObeyesdad__Normal_Weight',
       'NObeyesdad__Obesity_Type_I', 'NObeyesdad__Obesity_Type_II',
       'NObeyesdad__Obesity_Type_III', 'NObeyesdad__Overweight_Level_I',
       'NObeyesdad__Overweight_Level_II'],
      dtype='object')


In [21]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)

In [22]:
data_train, data_test, class_train, class_test = train_test_split(X, y, test_size=0.2)
print(data_train.shape)
print(class_train.shape)
mlp = MLPClassifier(solver = 'sgd', activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (17, 20), max_iter = 600)

(1688, 31)
(1688, 7)


In [23]:
mlp.fit(data_train, class_train)

In [24]:
pred = mlp.predict(data_test)

In [25]:
print(pred)

[[0 0 1 ... 0 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [0 1 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 1 0 0]]


In [26]:
print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

Accuracy :  0.9243498817966903
Mean Square Error :  0.019250253292806486


In [27]:
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred))

[[[363   4]
  [  3  53]]

 [[357   5]
  [  8  53]]

 [[358   6]
  [  0  59]]

 [[364   0]
  [  2  57]]

 [[351   0]
  [  0  72]]

 [[358   3]
  [ 13  49]]

 [[360   9]
  [  4  50]]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        56
           1       0.91      0.87      0.89        61
           2       0.91      1.00      0.95        59
           3       1.00      0.97      0.98        59
           4       1.00      1.00      1.00        72
           5       0.94      0.79      0.86        62
           6       0.85      0.93      0.88        54

   micro avg       0.94      0.93      0.93       423
   macro avg       0.93      0.93      0.93       423
weighted avg       0.94      0.93      0.93       423
 samples avg       0.93      0.93      0.93       423



  _warn_prf(average, modifier, msg_start, len(result))


Cross Validation

In [14]:
from sklearn.model_selection import cross_validate
mlpCV = cross_validate(mlp, X, y, cv=10, scoring=['accuracy', 'neg_mean_squared_error'])
#print all the accuracy values from each iteration
print('Accuracy')
print(mlpCV['test_accuracy'])
#print all the MSE values from each iteration
print('MSE')
print(-1*mlpCV['test_neg_mean_squared_error'])
print('Average Accuracy = ',  sum(mlpCV['test_accuracy'])/len(mlpCV['test_accuracy']))
print('Average MSE = ', sum(-1 * mlpCV['test_neg_mean_squared_error']) / len(mlpCV['test_neg_mean_squared_error']))

Accuracy
[0.68867925 0.76777251 0.91469194 0.96682464 0.98104265 0.90047393
 0.98104265 0.99052133 1.         1.        ]
MSE
[0.08625337 0.06364252 0.02031144 0.00880162 0.00473934 0.02505078
 0.00541638 0.00203114 0.         0.        ]
Average Accuracy =  0.9191048913529464
Average MSE =  0.021624659879153303


In [17]:
#MLP hyperparameter tuning

#logic from hw3demo
#set up parameters
max_iterations = [500,600,700]
hidden_layer_size = [(17, 20), (17,20,15), (17, 20, 20)]
activations = ["logistic", "relu", "tanh"]
learning_rate_inits = [0.3,0.4,0.5]
params = dict(activation = activations, hidden_layer_sizes = hidden_layer_size, max_iter = max_iterations, learning_rate_init = learning_rate_inits)
grid = GridSearchCV(estimator = mlp, param_grid=params, scoring="accuracy")
grid.fit(X, y)

In [18]:
#results
print("Optimal Hyper-Parameters:", grid.best_params_)
print("Optimal Accuracy:", grid.best_score_)
#Optimal Hyper-Parameters: {'activation': 'logistic', 'hidden_layer_sizes': (17, 20), 'learning_rate_init': 0.3, 'max_iter': 500}
#Optimal Accuracy: 0.8139267027438853

Optimal Hyper-Parameters: {'activation': 'logistic', 'hidden_layer_sizes': (17, 20), 'learning_rate_init': 0.3, 'max_iter': 700}
Optimal Accuracy: 0.7840968931016323


In [28]:
#make optimal MLP
optimalMLP = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.3, batch_size = 100, hidden_layer_sizes = (17, 20), max_iter = 500)
#fit and predict
optimalMLP.fit(data_train, class_train)
pred = optimalMLP.predict(data_test)
#results
print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

Accuracy :  0.9361702127659575
Mean Square Error :  0.015873015873015872


In [20]:
#MORE results
print(multilabel_confusion_matrix(class_test, pred))

print("Classification Report : ")
print(classification_report(class_test, pred))

[[[369   4]
  [  1  49]]

 [[342   9]
  [ 10  62]]

 [[346   3]
  [  1  73]]

 [[375   0]
  [  1  47]]

 [[350   0]
  [  0  73]]

 [[363   7]
  [ 11  42]]

 [[367   3]
  [  4  49]]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.92      0.98      0.95        50
           1       0.87      0.86      0.87        72
           2       0.96      0.99      0.97        74
           3       1.00      0.98      0.99        48
           4       1.00      1.00      1.00        73
           5       0.86      0.79      0.82        53
           6       0.94      0.92      0.93        53

   micro avg       0.94      0.93      0.94       423
   macro avg       0.94      0.93      0.93       423
weighted avg       0.94      0.93      0.94       423
 samples avg       0.93      0.93      0.93       423



  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
#Cross Validation for Optimized MLP
optimal_CV = cross_validate(optimalMLP, X, y, cv=10, scoring=['accuracy', 'neg_mean_squared_error'])
#print all the accuracy values from each iteration
print('Accuracy')
print(optimal_CV['test_accuracy'])
#print all the MSE values from each iteration
print('MSE')
print(-1*optimal_CV['test_neg_mean_squared_error'])

print('Average Accuracy = ', sum(optimal_CV['test_accuracy']) / len(optimal_CV['test_accuracy']))
print('Average MSE = ', sum(-1 * optimal_CV['test_neg_mean_squared_error']) / len(optimal_CV['test_neg_mean_squared_error']))



Accuracy
[0.72641509 0.78672986 0.9478673  0.98104265 0.97630332 0.86729858
 0.95260664 0.99052133 0.87677725 1.        ]
MSE
[0.07075472 0.05416385 0.01421801 0.00338524 0.00677048 0.03317536
 0.01150982 0.00203114 0.02979012 0.        ]
Average Accuracy =  0.9105562013770901
Average MSE =  0.0225798725105709
