# Predict onset of diabetes using Pima Indians dataset

- Source 1: https://machinelearningmastery.com/tutorial-first-neural-network-python-keras/
- Source 2: https://machinelearningmastery.com/5-step-life-cycle-neural-network-models-keras/
- Dataset: https://archive.ics.uci.edu/ml/datasets/diabetes

# 1. Setup

In [1]:
# ! conda install keras -y

In [2]:
# ! conda install tensorflow -y

In [3]:
# imports
from numpy import loadtxt
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense

In [4]:
# data source
# url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
url = 'data/pima-indians-diabetes.data.csv'

In [5]:
# load the dataset
dataset = loadtxt(url, delimiter=',')

In [6]:
# without numpy
df = pd.read_csv(url, header=None)
df.columns = ['pregnancies', 'plasma glucose', 'blood pressure', 
              'skin fold', 'insulin', 'BMI', 'pedigree', 'age', 'onset']
df.shape

(768, 9)

In [7]:
# what the dataset looks like
df.head()

Unnamed: 0,pregnancies,plasma glucose,blood pressure,skin fold,insulin,BMI,pedigree,age,onset
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
# split into input (X) and output (y) variables
X = dataset[:,0:8]
y = dataset[:,8]

In [9]:
# examine the target
y[:5]

array([1., 0., 1., 0., 1.])

In [10]:
# split into 67% for train and 33% for test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# 2. Define Keras Model

In [11]:
# keras has two APIs: Sequential and Functional.
model = Sequential()

2022-09-01 14:34:04.576976: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [12]:
# input layer
model.add(Dense(12, input_dim=8, activation='relu'))

In [13]:
# hidden layer
model.add(Dense(16, activation='relu'))

In [14]:
# output layer
model.add(Dense(1, activation='sigmoid'))

# 3. Compile Keras Model

In [15]:
# compile the keras model
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy']
             )

# 4. Fit the model

In [16]:
# fit the keras model on the dataset

model.fit(X_train, 
          y_train, 
          validation_data=(X_test,y_test), 
          epochs=10, 
          batch_size=10)

2022-09-01 14:34:04.772179: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-09-01 14:34:04.774018: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2500005000 Hz


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe8483614c0>

# 5. Make Predictions


In [17]:
# make probability predictions with the model
y_probs = model.predict(X_test)
# round predictions 
rounded = [round(x[0]) for x in y_probs]
rounded[:10]

[1, 0, 1, 0, 1, 0, 1, 1, 1, 1]

In [18]:
# sample 10 of those
import numpy as np
probs=y_probs.reshape(-1)
np.random.choice(probs, size=10)

array([0.40997157, 0.4863898 , 0.93005407, 0.5513696 , 0.7307017 ,
       0.68567806, 0.2522296 , 0.53454363, 0.95990264, 0.52230555],
      dtype=float32)

In [19]:
# make class predictions with the model
y_preds = (model.predict(X_test) > 0.5).astype(int)
print(y_preds[:10].tolist())
print(y_test[:10])

[[1], [0], [1], [0], [1], [0], [1], [1], [1], [1]]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


# 6. Evaluate

#### Important note:
For a lot of reasons, the evaluation metrics will vary dramatically with each run of the model. A primary reason for this is the small size of the dataset. A smart solution is to use k-fold cross-validation, which will average out the results of multiple training runs (we do this in the next notebook).

In [20]:
# evaluate the keras model
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 51.97


#### How to read the classification report
- Accuracy: The percentage of predictions that were accurate.
- Precision: Percentage of correct positive predictions relative to total positive predictions.
- Recall: Percentage of correct positive predictions relative to total actual positives.
- F1 Score: A weighted harmonic mean of precision and recall.
- Support: The number of occurrences of each class in y_test (i.e., how many observations belonged to each class in the test dataset).

In [21]:
# Evaluate the model
from sklearn import metrics
print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

         0.0       0.73      0.43      0.54       168
         1.0       0.38      0.69      0.49        86

    accuracy                           0.52       254
   macro avg       0.56      0.56      0.52       254
weighted avg       0.61      0.52      0.53       254



# 7. Save the model

In [22]:
model.save("diabetes-model-1.h5")

In [23]:
# load model
from keras.models import load_model
model2 = load_model("diabetes-model-1.h5")

# 8. Alternative: Categorical target as 2 variables

In [24]:
# split into input (X) and output (y) variables
from keras.utils import to_categorical
X = dataset[:,0:8]
y = to_categorical(dataset[:,8])

In [25]:
# examine the target & compare to the previous structure
print('before: \n', dataset[:,8][:5])
print('after: \n',  y[:5])

before: 
 [1. 0. 1. 0. 1.]
after: 
 [[0. 1.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]]


In [26]:
# split into 67% for train and 33% for test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [27]:
# build the model and fit to the data
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(16, activation='relu'))

In [28]:
# let's restructure the output layer
# previously we did this: 
#model.add(Dense(1, activation='sigmoid')) 
model.add(Dense(2, activation='softmax'))  
# notice the difference in nodes and activation function

In [29]:
# compile and fit
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy']
             )
model.fit(X_train, 
          y_train, 
          validation_data=(X_test,y_test), 
          epochs=10, 
          batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe83013a850>

In [30]:
# examine the predictions
y_probs = model.predict(X_test)
print(y_probs[:5])

y_preds = (model.predict(X_test) > 0.5).astype(int)
print(y_preds[:5])

[[0.3761186  0.6238814 ]
 [0.7700933  0.2299067 ]
 [0.7539397  0.24606031]
 [0.5557385  0.44426146]
 [0.52825755 0.47174254]]
[[0 1]
 [1 0]
 [1 0]
 [1 0]
 [1 0]]


In [31]:
# reformat the outcomes to a single value (this is optional)
y_test=[y[1] for y in y_test]
y_preds=[pred[1] for pred in y_preds]

In [32]:
# evaluate
print(metrics.classification_report(y_test, y_preds))

              precision    recall  f1-score   support

         0.0       0.79      0.74      0.76       168
         1.0       0.55      0.60      0.57        86

    accuracy                           0.70       254
   macro avg       0.67      0.67      0.67       254
weighted avg       0.71      0.70      0.70       254

