In [1]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import seaborn as sns
import sklearn
from sklearn.model_selection import train_test_split

In [2]:
names = ['n_pregnant', 'glucose_concentration', 'blood_pressure (mm Hg)', 'skin_thinckness (mm)', 'serum_insulin (mu U/ml)'
        'BMI', 'pedigree_function', 'age', 'class']
df = pd.read_csv('datasets_228_482_diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# determine the columns with missing values
df[df['BloodPressure'] == 0].shape

(35, 9)

In [6]:
# preprocess the data
# remove the rows with missing values from relevent columns
columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for c in columns:
    df[c].replace(0, np.NaN, inplace=True)

In [7]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
# drop the rows with NaN vlues
df.dropna(inplace=True)
df. describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796,0.331633
std,3.211424,30.860781,12.496092,10.516424,118.84169,7.027659,0.345488,10.200777,0.471401
min,0.0,56.0,24.0,7.0,14.0,18.2,0.085,21.0,0.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0,0.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0,0.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0,1.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0,1.0


In [9]:
# convert the data to numpy array
data = df.values
print(data.shape)

(392, 9)


In [10]:
# extract the input and lebels 
x = data[:, 0:-1]
y = data[:, -1].astype(int)
print(x.shape)
print(y.shape)

(392, 8)
(392,)


In [11]:
# normalize and standardize the data
from sklearn.preprocessing import StandardScaler

stand = StandardScaler().fit(x)
standardizedData = stand.transform(x)

data = pd.DataFrame(standardizedData) 
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,-4.0217260000000004e-17,3.1295830000000003e-17,-4.641624e-16,1.04225e-16,6.485742e-17,1.54355e-16,3.8801160000000004e-17,1.028089e-16
std,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278,1.001278
min,-1.029213,-2.161731,-3.739001,-2.108484,-1.196867,-2.120941,-1.269525,-0.9682991
25%,-0.7174265,-0.7665958,-0.694164,-0.7755315,-0.6681786,-0.667678,-0.7340909,-0.771985
50%,-0.4056403,-0.1176959,-0.05314565,-0.01384444,-0.2574448,0.01621036,-0.2131475,-0.3793569
75%,0.5297185,0.6609841,0.5878727,0.7478426,0.2859877,0.5718696,0.4751644,0.5040564
max,4.271153,2.445459,3.151946,3.223325,5.81299,4.846172,5.497667,4.921123


### Hyperparameter Grid search

In [12]:
# import the modules
from sklearn.model_selection import GridSearchCV, KFold
from tensorflow.keras.layers import Dense
from tensorflow.keras import Model, Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam

In [14]:
# define and run the grid search model
# set a seed
seed = 1
np.random.seed(seed)


# define the model
def MyModel(lr, n1, n2, activation, init):
    
    model = Sequential()
    model.add(Dense(n1, input_dim=8, kernel_initializer=init, activation=activation))
    model.add(Dense(n2, kernel_initializer=init, activation=activation))
    model.add(Dense(1, activation='sigmoid'))

    
    optimizer = Adam(lr = lr)
    loss = 'binary_crossentropy'
    model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])
    return model

# create KerasClassifier model
model = KerasClassifier(build_fn=MyModel, verbose=0)

# set the grid search parameters
batch_size = [8, 16, 32]
epochs = [10, 50, 100]
lr = [0.001, 0.005, 0.01]
n1 = [4, 8, 16, 32]
n2 = [2, 4, 8, 16]
activation = ['relu', 'softmax', 'tanh', 'linear']
init = ['uniform', 'normal', 'zero']

# make a dictionary of grid search parameters
params = dict(batch_size=batch_size, epochs=epochs, activation=activation, init=init, lr=lr, n1=n1, n2=n2)

# make and fit the grid search model
gridModel = GridSearchCV(estimator=model, param_grid=params, cv=KFold(random_state=seed), refit=True, verbose=10, n_jobs=8)
gridSearchResults = gridModel.fit(standardizedData, y)

# print some of the results
print('Best: {} Using: {}'.format(gridSearchResults.best_score_, gridSearchResults.best_params_))

Fitting 5 folds for each of 5184 candidates, totalling 25920 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    2.3s
[Parallel(n_jobs=8)]: Done   9 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done  16 tasks      | elapsed:    3.2s
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    4.1s
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    4.7s
[Parallel(n_jobs=8)]: Done  45 tasks      | elapsed:    5.4s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    6.2s
[Parallel(n_jobs=8)]: Done  69 tasks      | elapsed:    7.2s
[Parallel(n_jobs=8)]: Done  82 tasks      | elapsed:    8.5s
[Parallel(n_jobs=8)]: Done  97 tasks      | elapsed:    9.6s
[Parallel(n_jobs=8)]: Done 112 tasks      | elapsed:   10.7s
[Parallel(n_jobs=8)]: Done 129 tasks      | elapsed:   11.9s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:   13.1s
[Parallel(n_jobs=8)]: Done 165 tasks      | elapsed:   14.8s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   16.1s
[Parallel(

[Parallel(n_jobs=8)]: Done 8305 tasks      | elapsed: 20.4min
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed: 21.1min
[Parallel(n_jobs=8)]: Done 8565 tasks      | elapsed: 21.8min
[Parallel(n_jobs=8)]: Done 8696 tasks      | elapsed: 22.2min
[Parallel(n_jobs=8)]: Done 8829 tasks      | elapsed: 22.4min
[Parallel(n_jobs=8)]: Done 8962 tasks      | elapsed: 22.5min
[Parallel(n_jobs=8)]: Done 9097 tasks      | elapsed: 22.7min
[Parallel(n_jobs=8)]: Done 9232 tasks      | elapsed: 22.9min
[Parallel(n_jobs=8)]: Done 9369 tasks      | elapsed: 23.0min
[Parallel(n_jobs=8)]: Done 9506 tasks      | elapsed: 23.3min
[Parallel(n_jobs=8)]: Done 9645 tasks      | elapsed: 23.6min
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed: 23.9min
[Parallel(n_jobs=8)]: Done 9925 tasks      | elapsed: 24.2min
[Parallel(n_jobs=8)]: Done 10066 tasks      | elapsed: 24.5min
[Parallel(n_jobs=8)]: Done 10209 tasks      | elapsed: 25.0min
[Parallel(n_jobs=8)]: Done 10352 tasks      | elapsed: 25.5min
[Para

Best: 0.8040246605873108 Using: {'activation': 'relu', 'batch_size': 16, 'epochs': 100, 'init': 'uniform', 'lr': 0.001, 'n1': 32, 'n2': 16}


In [15]:
means = gridSearchResults.cv_results_['mean_test_score']
stds = gridSearchResults.cv_results_['std_test_score']
parameters = gridSearchResults.cv_results_['params']
results = list(zip(means, stds, parameters))
results = sorted(results, key = lambda x: x[0], reverse=True)

# print top 10 parameter combinations
for i in range(0, 10):
    print(results[i])

(0.8040246605873108, 0.0753894722309117, {'activation': 'relu', 'batch_size': 16, 'epochs': 100, 'init': 'uniform', 'lr': 0.001, 'n1': 32, 'n2': 16})
(0.7988640069961548, 0.07096167068662014, {'activation': 'relu', 'batch_size': 8, 'epochs': 100, 'init': 'normal', 'lr': 0.001, 'n1': 16, 'n2': 16})
(0.7987990736961365, 0.08350783695522372, {'activation': 'linear', 'batch_size': 8, 'epochs': 50, 'init': 'normal', 'lr': 0.005, 'n1': 16, 'n2': 8})
(0.7987990736961365, 0.08350783695522372, {'activation': 'linear', 'batch_size': 16, 'epochs': 50, 'init': 'uniform', 'lr': 0.005, 'n1': 8, 'n2': 8})
(0.7987990736961365, 0.08350783695522372, {'activation': 'linear', 'batch_size': 16, 'epochs': 100, 'init': 'normal', 'lr': 0.01, 'n1': 8, 'n2': 8})
(0.7987666368484497, 0.058297686275342576, {'activation': 'relu', 'batch_size': 32, 'epochs': 50, 'init': 'uniform', 'lr': 0.005, 'n1': 16, 'n2': 8})
(0.7987666249275207, 0.07339709180637218, {'activation': 'linear', 'batch_size': 32, 'epochs': 50, 'ini

In [17]:
# make predictions using best estimator
predictions = gridModel.predict(standardizedData) > 0.5

In [22]:
# print accuracy metics

from sklearn.metrics import classification_report, accuracy_score

print(accuracy_score(y, predictions))
print(classification_report(y, predictions))

0.8826530612244898
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       262
           1       0.81      0.84      0.83       130

    accuracy                           0.88       392
   macro avg       0.87      0.87      0.87       392
weighted avg       0.88      0.88      0.88       392



###  Model performs even better on the entire data set