In [42]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [43]:
data_frame = pd.read_csv('data/pima-data.csv')

In [44]:
data_frame.shape

(768, 10)

In [45]:
data_frame.head(3)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,True
1,1,85,66,29,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,0,23.3,0.672,32,0.0,True


In [46]:
# Deleting 'skin' column completely
del data_frame['skin']

# Checking if the action was successful or not
data_frame.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


In [47]:
# Mapping the values
map_diabetes = {True : 1, False : 0}

# Setting the map to the data_frame
data_frame['diabetes'] = data_frame['diabetes'].map(map_diabetes)

# Let's see what we have done
data_frame.head()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [48]:
num_true = 0.0
num_false = 0.0
target = []
for item in data_frame['diabetes']:
    if item == True:
        num_true += 1
    else:
        num_false += 1
    target.append(item) 

percent_true = (num_true / (num_true + num_false)) * 100
percent_false = (num_false / (num_true + num_false)) * 100

print "Number of True Cases: {0} ({1:2.2f}%)".format(num_true, percent_true)
print "Number of False Cases: {0} ({1:2.2f}%)".format(num_false, percent_false)

Number of True Cases: 268.0 (34.90%)
Number of False Cases: 500.0 (65.10%)


In [28]:
del data_frame['diabetes']
X = data_frame

In [49]:
from sklearn.cross_validation import train_test_split

feature_column_names = ['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'diab_prod', 'age']

predicted_class_name = ['diabetes']
y = np.array(target)
# Saving 30% for testing
split_test_size = 30

# Splitting using scikit-learn train_test_split function

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split_test_size, random_state = 42)


In [50]:
from sklearn.preprocessing import Imputer

#Impute with mean all 0 readings
fill_0 = Imputer(missing_values=0, strategy="mean", axis=0)

X_train = fill_0.fit_transform(X_train)
X_test = fill_0.fit_transform(X_test)

## Train Model using train data 
Here you use a naive bayes algorithm called GaussianNB

In [51]:
from sklearn.naive_bayes import GaussianNB

# create Gaussian Naive Bayes model object and train it with the data
nb_model = GaussianNB()

nb_model.fit(X_train, y_train.ravel())

GaussianNB(priors=None)

## Performance of Training Data

In [53]:
prediction_from_trained_data = nb_model.predict(X_train)

# performance metrics library
from sklearn import metrics

# get current accuracy of the model

accuracy = metrics.accuracy_score(y_train, prediction_from_trained_data)

print "Accuracy of our naive bayes model is : {0:.4f}".format(accuracy)

Accuracy of our naive bayes model is : 0.7588


## Performance of Testing Data

In [54]:
# this returns array of predicted results from test_data
prediction_from_test_data = nb_model.predict(X_test)

accuracy = metrics.accuracy_score(y_test, prediction_from_test_data)

print "Accuracy of our naive bayes model is: {0:0.4f}".format(accuracy)

Accuracy of our naive bayes model is: 0.7000


## Predict diabetes

In [58]:
d = [[6, 148, 72, 35, 0, 33.6, 0.627, 50]]
nb_model.predict(d)

array([1])