In [2]:
#importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from scipy.stats import zscore
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics
from sklearn.metrics import accuracy_score

In [4]:
data = pd.read_csv('diabetes_prediction_dataset.csv')

In [5]:
dataValDict = {'gender': {'Male': 0, 'Female': 1, 'Other': 3}, 'smoking_history': {'never': 0, 'current': 1, 'ever': 2, 'former': 3, 'not current': 4}}

#Original file is saved as original data.  Df will be 'cleaned' file.
df = data[data['gender'] != 'Other']
df = data[data['smoking_history'] != 'No Info']

df = df.replace(dataValDict)
df['gender'] = pd.to_numeric(df['gender'])

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64184 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   gender               64184 non-null  int64  
 1   age                  64184 non-null  float64
 2   hypertension         64184 non-null  int64  
 3   heart_disease        64184 non-null  int64  
 4   smoking_history      64184 non-null  int64  
 5   bmi                  64184 non-null  float64
 6   HbA1c_level          64184 non-null  float64
 7   blood_glucose_level  64184 non-null  int64  
 8   diabetes             64184 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 4.9 MB


Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1,80.0,0,1,0,25.19,6.6,140,0
2,0,28.0,0,0,0,27.32,5.7,158,0
3,1,36.0,0,0,1,23.45,5.0,155,0
4,0,76.0,1,1,1,20.14,4.8,155,0
5,1,20.0,0,0,0,27.32,6.6,85,0
...,...,...,...,...,...,...,...,...,...
99992,1,26.0,0,0,0,34.34,6.5,160,0
99993,1,40.0,0,0,0,40.69,3.5,155,0
99997,0,66.0,0,0,3,27.83,5.7,155,0
99998,1,24.0,0,0,0,35.42,4.0,100,0


In [6]:
X = df[['gender', 'age', 'hypertension', 'heart_disease', 'smoking_history', 'bmi','HbA1c_level', 'blood_glucose_level']]
y = df["diabetes"]

In [11]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .3)
MLPmodel = MLPClassifier()
cv = cross_val_score(MLPmodel, x_train, y_train, cv=5)
MLPmodel.fit(x_train, y_train)
test_accuracy = MLPmodel.score(x_test, y_test)
print("Test Accuracy:", test_accuracy)


Test Accuracy: 0.9577274615704197


In [14]:
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'solver' : ['sgd', 'adam'],
    'activation' : ['identity', 'logistic', 'relu'],
    'max_iter': [200, 500, 1000],
    'learning_rate': ['constant','invscaling','adaptive']}
grid_search = GridSearchCV(MLPmodel, param_grid, cv=5)
grid_search.fit(x_train, y_train)
print("Best Hyperparameters:", grid_search.best_params_)
best_mlp = grid_search.best_estimator_
gridmodel_acc = best_mlp.score(x_test, y_test)
print("Test Accuracy:", test_accuracy)



Best Hyperparameters: {'activation': 'logistic', 'hidden_layer_sizes': (100, 50), 'learning_rate': 'adaptive', 'max_iter': 1000, 'solver': 'adam'}
Test Accuracy: 0.9566888242625675


In [16]:
tuned_model = MLPClassifier(activation = 'logistic', hidden_layer_sizes = (100,50), learning_rate = 'adaptive', max_iter = 1000, solver = 'adam')
cv = cross_val_score(tuned_model, x_train, y_train, cv=5)
tuned_model.fit(x_train, y_train)
test_accuracy = tuned_model.score(x_test, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.9560656418778563
