# Pima Indians Diabetes Dataset DNN Classification
## Using Tensorflow in Google Colab

### Import libraries

In [0]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt

### I've stored my data in Google Drive. So if your data is not on Google Drive, this step is optional.

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


### Download the data set from [Kaggle](https://www.kaggle.com/uciml/pima-indians-diabetes-database) and load it using `pandas`


In [3]:
diabetes = pd.read_csv('/content/drive/My Drive/Colab Notebooks/diabetes.csv')
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Now we'll list the column names and normalize them between 0 & 1

In [4]:
diabetes.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

### Here, we'll not normalize Outcome because it is not a continuous feature.

In [5]:
cols_to_norm = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age']

diabetes[cols_to_norm] = diabetes[cols_to_norm].apply(lambda x: (x - x.min())/(x.max() - x.min()))

diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333,1
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667,0
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333,1
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0,0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2,1


### Now, we'll use tensorflow's `neumeric feature columns` and feed it all the normalized columns.

In [6]:
feature_cols = []

for col in cols_to_norm:
  feature_cols.append(tf.feature_column.numeric_column(col))
  
feature_cols

[_NumericColumn(key='Pregnancies', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Glucose', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='BloodPressure', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='SkinThickness', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Insulin', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='BMI', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='DiabetesPedigreeFunction', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 _NumericColumn(key='Age', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)]

### Now we'll perform `Train - Test - Split` using Scikit Learn
* For that, first we'll remove the `Outcome` from our training dataset.
* Then use the `Outcome` column for labels.


In [7]:
X_data = diabetes.drop('Outcome', axis=1)
X_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.352941,0.743719,0.590164,0.353535,0.0,0.500745,0.234415,0.483333
1,0.058824,0.427136,0.540984,0.292929,0.0,0.396423,0.116567,0.166667
2,0.470588,0.919598,0.52459,0.0,0.0,0.347243,0.253629,0.183333
3,0.058824,0.447236,0.540984,0.232323,0.111111,0.418778,0.038002,0.0
4,0.0,0.688442,0.327869,0.353535,0.198582,0.642325,0.943638,0.2


In [8]:
labels = diabetes['Outcome']
labels.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, labels, test_size=0.3)

### Now we'll create our input function using tensorflow's estimator API.
* In this function, we'll provide out training data X_train and y_train.

In [0]:
input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, y=y_train, batch_size=10, num_epochs=1000, shuffle=True)
# num_epochs -> number of iterations
# shuffle = true -> shuffle training data on each iteration

### Now we'll make our Deep Neuran Network Model

In [11]:
dnn_model = tf.estimator.DNNClassifier(hidden_units=[20, 10, 10, 20], feature_columns=feature_cols, n_classes=2)
# n_classes = 2 -> We've only 2 types of Outcomes '0' or '1'

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp38owpcn6', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f7ba18faa20>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


### Train the Model
* To train a model, it takes 2 arguments  
  1. input_fn
  2. steps

In [12]:
dnn_model.train(input_fn=input_func, steps=1000)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp38owpcn6/model.ckpt.
INFO:tensorflow:loss = 7.0727863, step = 0
INFO:tensorflow:global_step/sec: 198.181
INFO:tensorflow:loss = 6.7325926, step = 100 (0.509 sec)
INFO:tensorflow:global_step/sec: 191.535
INFO:tensorflow:loss = 7.3334637, step = 200 (0.522 sec)
INFO:tensorflow:global_step/sec: 182.423
INFO:tensorflow:loss = 6.3855305, step = 300 (0.549 sec)
INFO:tensorflow:global_step/sec: 189.35
INFO:tensorflow:loss = 6.9438725, step = 400 (0.530 sec)
INFO:tensorflow:global_step/sec: 184.242
INFO:tensorflow:loss = 3.6345963, step = 500 (0.540 sec)
INFO:tensorflow:global_step/sec: 189.647
INFO:tensorflow:loss = 4.693725, step = 600 (0.527 sec)
INFO:tensorflow:global_step/sec: 193.167
INFO:tensorflow:lo

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7f7ba18fa7b8>

### After training our model it is time to evaluate it using our test data.
*  We'll follow the same procedure which we used to create `input_func` but with minor modifications.
* In the evaluation, we only need 1 step and we don't need to shuffle it.

In [13]:
eval_input_func = tf.estimator.inputs.pandas_input_fn(x=X_test, y=y_test, batch_size=10, num_epochs=1, shuffle=False)

dnn_model.evaluate(eval_input_func)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-09-16-13:04:55
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp38owpcn6/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-09-16-13:04:56
INFO:tensorflow:Saving dict for global step 1000: accuracy = 0.7619048, accuracy_baseline = 0.64069265, auc = 0.8299414, auc_precision_recall = 0.7007514, average_loss = 0.49772367, global_step = 1000, label/mean = 0.35930735, loss = 4.7905903, precision = 0.68421054, prediction/mean = 0.34205174, recall = 0.62650603
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 1000: /tmp/tmp38owpcn6/model.ckpt-1000


{'accuracy': 0.7619048,
 'accuracy_baseline': 0.64069265,
 'auc': 0.8299414,
 'auc_precision_recall': 0.7007514,
 'average_loss': 0.49772367,
 'global_step': 1000,
 'label/mean': 0.35930735,
 'loss': 4.7905903,
 'precision': 0.68421054,
 'prediction/mean': 0.34205174,
 'recall': 0.62650603}

### Making some predictions
* We'll make predictions on training data and follow the same procedure as input_func
* But while making predictions, we'll not provide it the y_test(Outcome) data.

In [0]:
pred_input_func = tf.estimator.inputs.pandas_input_fn(x=X_train, batch_size=10, num_epochs=10, shuffle=False)

predictions = dnn_model.predict(pred_input_func)

### Now let's list the first 5 predictions and first 5 test data's original outcomes.

In [15]:
predictions = list(predictions)
predictions[:5]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp38owpcn6/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


[{'class_ids': array([1]),
  'classes': array([b'1'], dtype=object),
  'logistic': array([0.90467334], dtype=float32),
  'logits': array([2.2502644], dtype=float32),
  'probabilities': array([0.09532666, 0.90467334], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([0.06816204], dtype=float32),
  'logits': array([-2.615271], dtype=float32),
  'probabilities': array([0.9318379 , 0.06816203], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([0.46010506], dtype=float32),
  'logits': array([-0.1599196], dtype=float32),
  'probabilities': array([0.53989494, 0.4601051 ], dtype=float32)},
 {'class_ids': array([0]),
  'classes': array([b'0'], dtype=object),
  'logistic': array([0.04111069], dtype=float32),
  'logits': array([-3.1495075], dtype=float32),
  'probabilities': array([0.9588893 , 0.04111069], dtype=float32)},
 {'class_ids': array([1]),
  'classes': array([b'1'], dtype=ob

In [16]:
y_train.head()

236    1
422    0
635    1
482    0
750    1
Name: Outcome, dtype: int64

In [17]:
print('Comparing First Element:', 'Predictions:' ,predictions[0]['class_ids'][0], '&& y_test: ',y_train[573])
print('Probability', predictions[0]['probabilities'].max())

Comparing First Element: Predictions: 1 && y_test:  0
Probability 0.90467334
