# To predict diabetes using PIMA diabetes data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
data = pd.read_csv("./data/pima-data.csv")

In [5]:
print(data)

     num_preg  glucose_conc  diastolic_bp  thickness  insulin   bmi  \
0           6           148            72         35        0  33.6   
1           1            85            66         29        0  26.6   
2           8           183            64          0        0  23.3   
3           1            89            66         23       94  28.1   
4           0           137            40         35      168  43.1   
5           5           116            74          0        0  25.6   
6           3            78            50         32       88  31.0   
7          10           115             0          0        0  35.3   
8           2           197            70         45      543  30.5   
9           8           125            96          0        0   0.0   
10          4           110            92          0        0  37.6   
11         10           168            74          0        0  38.0   
12         10           139            80          0        0  27.1   
13    

In [6]:
data.shape

(768, 10)

In [7]:
data.head(5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,35,0,33.6,0.627,50,1.379,True
1,1,85,66,29,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,0,23.3,0.672,32,0.0,True
3,1,89,66,23,94,28.1,0.167,21,0.9062,False
4,0,137,40,35,168,43.1,2.288,33,1.379,True


In [8]:
data.isnull().values.any()

False

In [9]:
data.corr()

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,thickness,insulin,bmi,diab_pred,age,skin,diabetes
num_preg,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,-0.081672,0.221898
glucose_conc,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.057328,0.466581
diastolic_bp,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.207371,0.065068
thickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,1.0,0.074752
insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.436783,0.130548
bmi,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.392573,0.292695
diab_pred,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.183928,0.173844
age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,-0.11397,0.238356
skin,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,1.0,0.074752
diabetes,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,0.074752,1.0


In [10]:
del data['thickness']

In [11]:
data.head(5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,0,33.6,0.627,50,1.379,True
1,1,85,66,0,26.6,0.351,31,1.1426,False
2,8,183,64,0,23.3,0.672,32,0.0,True
3,1,89,66,94,28.1,0.167,21,0.9062,False
4,0,137,40,168,43.1,2.288,33,1.379,True


In [13]:
diabetes_map = {True: 1, False: 0}

In [14]:
data['diabetes'] = data['diabetes'].map(diabetes_map)

In [15]:
data.head(5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,0,33.6,0.627,50,1.379,1
1,1,85,66,0,26.6,0.351,31,1.1426,0
2,8,183,64,0,23.3,0.672,32,0.0,1
3,1,89,66,94,28.1,0.167,21,0.9062,0
4,0,137,40,168,43.1,2.288,33,1.379,1


In [16]:
diabetes_true_count = len(data.loc[data['diabetes'] == True])

In [17]:
diabetes_false_count = len(data.loc[data['diabetes'] == False])

In [18]:
print("Diabetes - True : ", diabetes_true_count)
print("Diabetes - False : ", diabetes_false_count)

Diabetes - True :  268
Diabetes - False :  500


In [19]:
print("Diabetes - True : ", diabetes_true_count/data.shape[0]*100)
print("Diabetes - False : ", diabetes_false_count/data.shape[0]*100)

Diabetes - True :  34.89583333333333
Diabetes - False :  65.10416666666666


In [20]:
from sklearn.cross_validation import train_test_split
feature_columns = ['num_preg', 'glucose_conc', 'diastolic_bp', 'insulin', 'bmi', 'diab_pred', 'age', 'skin']
predicted_class = ['diabetes']

X = data[feature_columns].values
y = data[predicted_class].values

split_test_size = 0.30

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = split_test_size, random_state=10)



In [21]:
X_train.shape[0]/X.shape[0] * 100



69.921875

In [22]:
X_test.shape[0]/X.shape[0] * 100

30.078125

In [23]:
print(len(data.loc[data['diabetes'] == 1]) / len(data.loc[data['diabetes']]) * 100)
print(len(data.loc[data['diabetes'] == 0]) / len(data.loc[data['diabetes']]) * 100)

34.89583333333333
65.10416666666666


In [24]:
print(len(y_train[y_train[:] == 1]) / len(y_train[y_train[:]]) * 100)
print(len(y_train[y_train[:] == 0]) / len(y_train[y_train[:]]) * 100)

33.70577281191806
66.29422718808193


In [25]:
print(len(y_test[y_test[:] == 1]) / len(y_test[y_test[:]]) * 100)
print(len(y_test[y_test[:] == 0]) / len(y_test[y_test[:]]) * 100)

37.66233766233766
62.33766233766234


In [26]:
data.head(5)

Unnamed: 0,num_preg,glucose_conc,diastolic_bp,insulin,bmi,diab_pred,age,skin,diabetes
0,6,148,72,0,33.6,0.627,50,1.379,1
1,1,85,66,0,26.6,0.351,31,1.1426,0
2,8,183,64,0,23.3,0.672,32,0.0,1
3,1,89,66,94,28.1,0.167,21,0.9062,0
4,0,137,40,168,43.1,2.288,33,1.379,1


In [27]:
print("total number of rows : {0}".format(len(data)))
print("number of rows missing glucose_conc: {0}".format(len(data.loc[data['glucose_conc'] == 0])))
print("number of rows missing glucose_conc: {0}".format(len(data.loc[data['glucose_conc'] == 0])))
print("number of rows missing diastolic_bp: {0}".format(len(data.loc[data['diastolic_bp'] == 0])))
print("number of rows missing insulin: {0}".format(len(data.loc[data['insulin'] == 0])))
print("number of rows missing bmi: {0}".format(len(data.loc[data['bmi'] == 0])))
print("number of rows missing diab_pred: {0}".format(len(data.loc[data['diab_pred'] == 0])))
print("number of rows missing age: {0}".format(len(data.loc[data['age'] == 0])))
print("number of rows missing skin: {0}".format(len(data.loc[data['skin'] == 0])))

total number of rows : 768
number of rows missing glucose_conc: 5
number of rows missing glucose_conc: 5
number of rows missing diastolic_bp: 35
number of rows missing insulin: 374
number of rows missing bmi: 11
number of rows missing diab_pred: 0
number of rows missing age: 0
number of rows missing skin: 227


In [28]:
from sklearn.preprocessing import Imputer

fill_0 = Imputer(missing_values=0, strategy="mean", axis=0)

X_train = fill_0.fit_transform(X_train)
X_test = fill_0.fit_transform(X_test)

# Training with Naive Bayes

In [29]:
from sklearn.naive_bayes import GaussianNB

naive_model = GaussianNB()
naive_model.fit(X_train, y_train.ravel())

GaussianNB(priors=None)

## Performance on training data

In [30]:
naive_predict_train_data = naive_model.predict(X_train)

from sklearn import metrics

print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_train, naive_predict_train_data)))

Accuracy = 0.756


## Performance on test data

In [35]:
naive_predict_test_data = naive_model.predict(X_test)

from sklearn import metrics

print("Accuracy = {0:.3f}".format(metrics.accuracy_score(y_test, naive_predict_test_data)))

Accuracy = 0.732
