In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

# importing ploting libraries
import matplotlib.pyplot as plt   

#importing seaborn for statistical plots
import seaborn as sns

#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function

from sklearn.model_selection import train_test_split

import numpy as np

# calculate accuracy measures and confusion matrix
from sklearn import metrics

In [2]:
# To enable plotting graphs in Jupyter notebook
%matplotlib inline 

In [3]:
# The data lies in the following URL.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

In [4]:
#Attribute Information:
#1. Number of times pregnant 
#2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test 
#3. Diastolic blood pressure (mm Hg) 
#4. Triceps skin fold thickness (mm) 
#5. 2-Hour serum insulin (mu U/ml) 
#6. Body mass index (weight in kg/(height in m)^2) 
#7. Diabetes pedigree function 
#8. Age (years) 
#9. Class variable (0 or 1) 

In [5]:
# Since it is a data file with no header, we will supply the column names which have been obtained from the above URL 
# Create a python list of column names called "names"

colnames = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

#Load the file from local directory using pd.read_csv which is a special form of read_table
#while reading the data, supply the "colnames" list


In [8]:
prima_df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data", names=colnames)

HTTPError: HTTP Error 404: Not Found

In [None]:
prima_df.head()

In [None]:
# Let us check whether any of the columns has any value other than numeric i.e. data is not corrupted such as a "?" instead of 
# a number.

# we use np.isreal a numpy function which checks each column for each row and returns a bool array, 
# where True if input element is real.
# applymap is pandas dataframe function that applies the np.isreal function columnwise
# Following line selects those rows which have some non-numeric value in any of the columns hence the  ~ symbol

prima_df[~prima_df.applymap(np.isreal).all(1)]

In [None]:
# replace the missing values in pima_df with median value :Note, we do not need to specify the column names
# every column's missing value is replaced with that column's median respectively
prima_df = prima_df.fillna(prima_df.median())
prima_df.describe()

In [None]:
#Lets analysze the distribution of the various attributes
prima_df.describe().transpose()

In [None]:
prima_df['class'].value_counts()

In [None]:
# Let us look at the target column which is 'class' to understand how the data is distributed amongst the various values

# Most are not diabetic. The ratio is almost 1:2 in favor or class 0.  The model's ability to predict class 0 will 
# be better than predicting class 1. 
prima_df.groupby(["class"]).count()

In [None]:
# Separate the independent attributes i.e. columns 0 to 8 and store them in X array
# Store the target column (column 8) into Y array

 # select all rows and first 7 columns which are the attributes
# select all rows and the 8th column which is the classification "Yes", "No" for diabeties
array = prima_df.values
X = array[:,0:7] # select all rows and first 8 columns which are the attributes
Y = array[:,8]   # select all rows and the 8th column which is the classification "Yes", "No" for diabeties


In [None]:
# Create the training and test data set in the ratio of 70:30 respectively. Can be any other ratio...

# taking 70:30 training and test set
# Random numbmer seeding for reapeatability of the code
test_size = 0.30 # taking 70:30 training and test set
seed = 7  # Random numbmer seeding for reapeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)
type(X_train)

In [None]:
y_train[:5]

In [None]:
# Invoking the NB Gaussian function to create the model
# fitting the model in the training data set
model=GaussianNB()
model.fit(X_train,y_train)
y_predict=model.predict(X_test)
y_predict[:10]

In [None]:
pd.DataFrame(y_test)[0].value_counts()

In [None]:
acc=metrics.accuracy_score(y_test,y_predict)
acc

In [None]:
cm=metrics.confusion_matrix(y_test,y_predict)
cm

In [None]:
# make predictions
cr=metrics.classification_report(y_test,y_predict)
print(cr)

Precision: Within a given set of positively-labeled results, the fraction that were true positives = tp/(tp + fp)
Recall: Given a set of positively-labeled results, the fraction of all positives that were retrieved = tp/(tp + fn)
Accuracy: tp + tn / (tp + tn + fp +fn) But this measure can be dominated by larger class. Suppose 10, 90 and 80 of 90 is correctly predicted while only 2 of 0 is predicted correctly. Accuracy is 80+2 / 100 i.e. 82%

TO over come the dominance of the majority class, use weighted measure (not shown)

F is harmonic mean of precision and recal given by ((B^2 +1) PR) / (B^2P +R)
When B is set to 1 we get F1 = 2PR / (P+R)