In [31]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function

from sklearn.model_selection import train_test_split

import numpy as np

# calculate accuracy measures and confusion matrix
from sklearn import metrics

In [32]:
# To enable plotting graphs in Jupyter notebook
%matplotlib inline 

In [33]:
# The data lies in the following URL.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

In [34]:
#Attribute Information:
#1. Number of times pregnant 
#2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
#3. Diastolic blood pressure (mm Hg) 
#4. Triceps skin fold thickness (mm) 
#5. 2-Hour serum insulin (mu U/ml) 
#6. Body mass index (weight in kg/(height in m)^2) 
#7. Diabetes pedigree function 
#8. Age (years) 
#9. Class variable (0 or 1) 

In [35]:
# Since it is a data file with no header, we will supply the column names which have been obtained from the above URL 
# Create a python list of column names called "names"

colnames = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

#Load the file from local directory using pd.read_csv which is a special form of read_table
#while reading the data, supply the "colnames" list


In [36]:
prima_df = pd.read_csv("pima-indians-diabetes-1.data", names=colnames)

In [37]:
prima_df.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [38]:
# Let us check whether any of the columns has any value other than numeric i.e. data is not corrupted such as a "?" instead of 
# a number.

# we use np.isreal a numpy function which checks each column for each row and returns a bool array, 
# where True if input element is real.
# applymap is pandas dataframe function that applies the np.isreal function columnwise
# Following line selects those rows which have some non-numeric value in any of the columns hence the  ~ symbol

prima_df[~prima_df.applymap(np.isreal).all(1)]

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class


In [39]:
# replace the missing values in pima_df with median value :Note, we do not need to specify the column names
# every column's missing value is replaced with that column's median respectively
prima_df = prima_df.fillna(prima_df.median())
prima_df.describe()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [41]:
#Lets analysze the distribution of the various attributes
prima_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
preg,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
plas,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
pres,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
skin,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
test,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
mass,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
pedi,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
class,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [42]:
prima_df['class'].value_counts()

0    500
1    268
Name: class, dtype: int64

In [43]:
# Let us look at the target column which is 'class' to understand how the data is distributed amongst the various values

# Most are not diabetic. The ratio is almost 1:2 in favor or class 0.  The model's ability to predict class 0 will 
# be better than predicting class 1. 
prima_df.groupby(["class"]).count()

Unnamed: 0_level_0,preg,plas,pres,skin,test,mass,pedi,age
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,500,500,500,500,500,500,500,500
1,268,268,268,268,268,268,268,268


In [23]:
# Separate the independent attributes i.e. columns 0 to 8 and store them in X array
# Store the target column (column 8) into Y array

 # select all rows and first 7 columns which are the attributes
# select all rows and the 8th column which is the classification "Yes", "No" for diabeties
array = prima_df.values
X = array[:,0:7] # select all rows and first 8 columns which are the attributes
Y = array[:,8]   # select all rows and the 8th column which is the classification "Yes", "No" for diabeties


In [24]:
# Create the training and test data set in the ratio of 70:30 respectively. Can be any other ratio...

# taking 70:30 training and test set
# Random numbmer seeding for reapeatability of the code
test_size = 0.30 # taking 70:30 training and test set
seed = 7  # Random numbmer seeding for reapeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
type(X_train)

numpy.ndarray

In [25]:
y_train[:5]

array([1., 0., 0., 0., 1.])

In [44]:
# Invoking the NB Gaussian function to create the model
# fitting the model in the training data set
model=GaussianNB()
model.fit(X_train,y_train)
y_predict=model.predict(X_test)
y_predict[:10]

array([0., 1., 1., 0., 0., 0., 0., 0., 1., 0.])

In [52]:
model2 = LogisticRegression()
model2.fit(X_train,y_train)
y2_predict = model2.predict(X_test)

In [53]:
pd.DataFrame(y_test)[0].value_counts()

0.0    147
1.0     84
Name: 0, dtype: int64

In [54]:
acc=metrics.accuracy_score(y_test,y_predict)
acc

0.7359307359307359

In [55]:
acc2=metrics.accuracy_score(y_test,y2_predict)
acc2

0.7705627705627706

In [47]:
cm=metrics.confusion_matrix(y_test,y_predict)
cm

array([[119,  28],
       [ 33,  51]])

In [56]:
cm2=metrics.confusion_matrix(y_test,y2_predict)
cm2

array([[132,  15],
       [ 38,  46]])

In [48]:
# make predictions
cr=metrics.classification_report(y_test,y_predict)
print(cr)

             precision    recall  f1-score   support

        0.0       0.78      0.81      0.80       147
        1.0       0.65      0.61      0.63        84

avg / total       0.73      0.74      0.73       231



Precision: Within a given set of positively-labeled results, the fraction that were true positives = tp/(tp + fp)

Recall: Given a set of positively-labeled results, the fraction of all positives that were retrieved = tp/(tp + fn)

Accuracy: tp + tn / (tp + tn + fp +fn) But this measure can be dominated by larger class. Suppose 10, 90 and 80 of 90 is correctly predicted while only 2 of 0 is predicted correctly. Accuracy is 80+2 / 100 i.e. 82%

TO over come the dominance of the majority class, use weighted measure (not shown)


F is harmonic mean of precision and recal given by ((B^2 +1) PR) / (B^2P +R)
When B is set to 1 we get F1 = 2PR / (P+R)

In [57]:
# make predictions
cr2=metrics.classification_report(y_test,y2_predict)
print(cr2)

             precision    recall  f1-score   support

        0.0       0.78      0.90      0.83       147
        1.0       0.75      0.55      0.63        84

avg / total       0.77      0.77      0.76       231

