In [1]:
import pandas as pd

## Download PIMA Indians Diabetes dataset

1. Number of times pregnant 
2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
3. Diastolic blood pressure (mm Hg) 
4. Triceps skin fold thickness (mm) 
5. 2-Hour serum insulin (mu U/ml) 
6. Body mass index (weight in kg/(height in m)^2) 
7. Diabetes pedigree function 
8. Age (years) 
9. Class variable (0 or 1) 

In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataset = pd.read_csv(url, names=names)

## Summarize Data

In [3]:
print(dataset.shape)
print(dataset.dtypes)

(768, 9)
preg       int64
plas       int64
pres       int64
skin       int64
test       int64
mass     float64
pedi     float64
age        int64
class      int64
dtype: object


In [4]:
print(dataset.head(20))

    preg  plas  pres  skin  test  mass   pedi  age  class
0      6   148    72    35     0  33.6  0.627   50      1
1      1    85    66    29     0  26.6  0.351   31      0
2      8   183    64     0     0  23.3  0.672   32      1
3      1    89    66    23    94  28.1  0.167   21      0
4      0   137    40    35   168  43.1  2.288   33      1
5      5   116    74     0     0  25.6  0.201   30      0
6      3    78    50    32    88  31.0  0.248   26      1
7     10   115     0     0     0  35.3  0.134   29      0
8      2   197    70    45   543  30.5  0.158   53      1
9      8   125    96     0     0   0.0  0.232   54      1
10     4   110    92     0     0  37.6  0.191   30      0
11    10   168    74     0     0  38.0  0.537   34      1
12    10   139    80     0     0  27.1  1.441   57      0
13     1   189    60    23   846  30.1  0.398   59      1
14     5   166    72    19   175  25.8  0.587   51      1
15     7   100     0     0     0  30.0  0.484   32      1
16     0   118

# The first model

![title](img/MLP1.PNG)

In [5]:
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

Using TensorFlow backend.


In [6]:
seed = 7
np.random.seed(seed)

In [7]:
array = dataset.values
X = array[:,0:8]
Y = array[:,8]

In [8]:
# create model
model = Sequential()
model.add(Dense(12, input_dim=8, init='uniform', activation='relu'))
model.add(Dense(8, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))

In [9]:
# compile model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [10]:
# fit model
model.fit(X, Y, nb_epoch=150, batch_size=10,verbose=0)

<keras.callbacks.History at 0x7f2618870d50>

In [11]:
# evaluate model
scores = model.evaluate(X,Y)
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))



# Use a validation dataset

In [12]:
from sklearn import cross_validation



In [13]:
array = dataset.values
X = array[:,0:8]
Y = array[:,8]

validation_size = 0.20
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [14]:
# create model
model = Sequential()
model.add(Dense(12, input_dim=8, init='uniform', activation='relu'))
model.add(Dense(8, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))

In [15]:
# compile model
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [16]:
# fit model
model.fit(X_train, Y_train, validation_data=(X_test,Y_test), nb_epoch=150, batch_size=10)

Train on 614 samples, validate on 154 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
E

<keras.callbacks.History at 0x7f25ebeb87d0>

# Manual k-Fold CV

In [17]:
from sklearn.model_selection import StratifiedKFold

In [18]:
# define 4-fold cross validation test harness
kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=seed)
cvscores = []

In [19]:
for train, test in kfold.split(X, Y):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=8, init= 'uniform' , activation= 'relu' ))
    model.add(Dense(8, init= 'uniform' , activation= 'relu' ))
    model.add(Dense(1, init= 'uniform' , activation= 'sigmoid' ))
    # Compile model
    model.compile(loss= 'binary_crossentropy' , optimizer= 'adam' , metrics=['accuracy'])
    # Fit the model
    model.fit(X[train], Y[train], nb_epoch=150, batch_size=10, verbose=0)
    # evaluate the model
    scores = model.evaluate(X[test], Y[test], verbose=0)
    print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
    cvscores.append(scores[1] * 100)

acc: 73.44%
acc: 83.85%
acc: 72.40%
acc: 75.52%


In [20]:
print("%.2f%% (+/- %.2f%%)" % (np.mean(cvscores), np.std(cvscores)))

76.30% (+/- 4.50%)


# Tune Parameters using Grid Search

In [21]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

In [22]:
validation_size = 0.20
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [23]:
scaler = StandardScaler().fit(X_train)
rescaledX_train = scaler.transform(X_train)

In [24]:
# Function to create model, required for KerasClassifier
def create_model(optimizer= 'rmsprop' , init= 'uniform' ):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=8, init=init, activation= 'relu' ))
    model.add(Dense(8, init=init, activation= 'relu' ))
    model.add(Dense(1, init=init, activation= 'sigmoid' ))
    # Compile model
    model.compile(loss= 'binary_crossentropy' , optimizer=optimizer, metrics=[ 'accuracy' ])
    return model

In [25]:
# create model
model = KerasClassifier(build_fn=create_model, verbose=0)

In [26]:
# grid search epochs, batch size and optimizer
optimizers = [ 'rmsprop' , 'adam' ]
init = [ 'normal' , 'uniform' ]
epochs = np.array([150])
batches = np.array([5,10])
param_grid = dict(optimizer=optimizers, nb_epoch=epochs, batch_size=batches, init=init)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(rescaledX_train, Y_train)

In [27]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.763844 using {'init': 'normal', 'optimizer': 'rmsprop', 'nb_epoch': 150, 'batch_size': 5}


In [29]:
for params, mean_score, scores in grid_result.grid_scores_:
    print("%f (%f) with: %r" % (scores.mean(), scores.std(), params))

0.763829 (0.006619) with: {'init': 'normal', 'optimizer': 'rmsprop', 'nb_epoch': 150, 'batch_size': 5}
0.736179 (0.009974) with: {'init': 'normal', 'optimizer': 'adam', 'nb_epoch': 150, 'batch_size': 5}
0.762243 (0.013531) with: {'init': 'uniform', 'optimizer': 'rmsprop', 'nb_epoch': 150, 'batch_size': 5}
0.752447 (0.008153) with: {'init': 'uniform', 'optimizer': 'adam', 'nb_epoch': 150, 'batch_size': 5}
0.745927 (0.004026) with: {'init': 'normal', 'optimizer': 'rmsprop', 'nb_epoch': 150, 'batch_size': 10}
0.758983 (0.011679) with: {'init': 'normal', 'optimizer': 'adam', 'nb_epoch': 150, 'batch_size': 10}
0.755723 (0.017051) with: {'init': 'uniform', 'optimizer': 'rmsprop', 'nb_epoch': 150, 'batch_size': 10}
0.754081 (0.008935) with: {'init': 'uniform', 'optimizer': 'adam', 'nb_epoch': 150, 'batch_size': 10}




## Prepare Data

In [5]:
from sklearn import cross_validation
from sklearn.preprocessing import StandardScaler



In [6]:
array = dataset.values
X = array[:,0:8]
Y = array[:,8]

In [7]:
validation_size = 0.20
seed = 7
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(X, Y, test_size=validation_size, random_state=seed)

In [8]:
scaler = StandardScaler().fit(X_train)
rescaledX_train = scaler.transform(X_train)

## Tune Models

In [9]:
import numpy as np
np.random.seed(7)
import random
random.seed(7)

from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC



###  SVM with Linear Kernel

In [10]:
C = 10**np.arange(0,4)
param_grid = dict(C=C)
model = SVC(random_state=seed, probability=True, kernel="linear")
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(rescaledX_train, Y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=True, random_state=7, shrinking=True, tol=0.001,
  verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': array([   1,   10,  100, 1000])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [11]:
print(grid.best_score_)
print(grid.best_estimator_.C)
svmlTunedC = grid.best_estimator_.C

0.820229475314
1000


### SVM with Polynomial Kernel

In [19]:
C = 10**np.arange(0,4)
degree = np.arange(2,4)
param_grid = dict(C=C,degree=degree)
model = SVC(random_state=seed, probability=True, kernel="poly")
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(rescaledX_train, Y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=True, random_state=7, shrinking=True, tol=0.001,
  verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': array([   1,   10,  100, 1000]), 'degree': array([2, 3])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [20]:
print(grid.best_score_)
print(grid.best_estimator_.C,grid.best_estimator_.degree)
svmpTunedC = grid.best_estimator_.C
svmpTunedDegree = grid.best_estimator_.degree

0.77015953184
(1, 3)


### SVM with Gaussian Kernel

In [29]:
C = 10**np.arange(0,3)
gamma = 10**-np.arange(-5,5)
param_grid = dict(C=C,gamma=gamma)
model = SVC(random_state=seed, probability=True, kernel="rbf")
scoring = 'roc_auc' 
num_folds = 4
grid = GridSearchCV(cv=num_folds, estimator=model, param_grid=param_grid,scoring=scoring,n_jobs=-1)
grid.fit(rescaledX_train, Y_train)

GridSearchCV(cv=4, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=7, shrinking=True, tol=0.001,
  verbose=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': array([  1,  10, 100]), 'gamma': array([100000,  10000,   1000,    100,     10,      1,      0,      0,
            0,      0])},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [30]:
print(grid.best_score_)
print(grid.best_estimator_.C,grid.best_estimator_.gamma)
svmgTunedC = grid.best_estimator_.C
svmgTunedGamma = grid.best_estimator_.gamma

0.7642281997
(1, 1)


## Evaluate Performance

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc
from sklearn.metrics import roc_curve

In [33]:
seed = 7


pipelines = []
pipelines.append(('SVM-L', Pipeline([('Scaler', StandardScaler()),('SVM-L', SVC(random_state=seed, probability=True, kernel="linear", C=svmlTunedC))])))
pipelines.append(('SVM-P', Pipeline([('Scaler', StandardScaler()),('SVM-P', SVC(random_state=seed, probability=True, kernel="poly", C=svmpTunedC, degree=svmpTunedDegree))])))
pipelines.append(('SVM-G', Pipeline([('Scaler', StandardScaler()),('SVM-G', SVC(random_state=seed, probability=True, kernel="rbf", C=svmgTunedC, gamma=svmgTunedGamma))])))

In [34]:
# Make predictions on test dataset
results = []
names = []
for name, model in pipelines:
    model.fit(X_train,Y_train)
    fpr, tpr, thresholds = roc_curve(Y_test,model.predict_proba(X_test)[:,1])
    result = auc(fpr,tpr)
    results.append(result)
    names.append(name)
    msg = "%s: %f" % (name, result)
    print(msg)

SVM-L: 0.845542
SVM-P: 0.811901
SVM-G: 0.805028
