In [1]:
import pandas as pd
from sklearn import model_selection
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB

In [2]:
# The raw data file has no header row, so the column names (taken from the
# dataset's documentation) are passed explicitly when building the dataframe.

names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
# Alternative source (remote copy with the same file name), left disabled:
#url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
#dataframe = pd.read_csv(url, names=names)
dataframe = pd.read_csv("pima-indians-diabetes.data", names=names)

In [3]:
# Split the raw values into the feature matrix X and the target vector Y.
# The slice 0:8 is half-open, so X holds columns 0 through 7 (the eight
# attributes 'preg' ... 'age'); column index 8 ('class') is the target.

array = dataframe.values
# X: all rows, first 8 columns (the attributes)
# Y: all rows, column index 8 (the diabetes class label)
X, Y = array[:, :8], array[:, 8]

In [4]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# (Any other ratio would work as well.)

test_size = 0.30  # fraction of the data reserved for the test set
seed = 7          # fixed random seed so the split is repeatable

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    Y,
    test_size=test_size,
    random_state=seed,
)

In [5]:
# Build a Gaussian Naive Bayes classifier and fit it on the training split.
# The fit call stays as the cell's last expression so the estimator's repr is displayed.
model = GaussianNB()
model.fit(X=X_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [6]:
# Predict on the held-out test set and compare against the true labels.
expected = y_test
predicted = model.predict(X_test)
# Summarize the fit: the per-class precision/recall/F1 report first,
# then the confusion matrix.
for summarize in (metrics.classification_report, metrics.confusion_matrix):
    print(summarize(expected, predicted))

              precision    recall  f1-score   support

         0.0       0.80      0.79      0.79       147
         1.0       0.64      0.65      0.65        84

   micro avg       0.74      0.74      0.74       231
   macro avg       0.72      0.72      0.72       231
weighted avg       0.74      0.74      0.74       231

[[116  31]
 [ 29  55]]


Precision: Within a given set of positively-labeled results, the fraction that were true positives = tp/(tp + fp)
Recall: Of all the samples that are actually positive, the fraction that were labeled positive (i.e. retrieved) = tp/(tp + fn)
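Accuracy: (tp + tn) / (tp + tn + fp + fn). But this measure can be dominated by the larger class. Suppose the two classes have 10 and 90 samples, and 80 of the 90 are predicted correctly while only 2 of the 10 are predicted correctly. Accuracy is (80 + 2) / 100, i.e. 82%, even though only 20% of the minority class is predicted correctly.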

To overcome the dominance of the majority class, use a weighted measure (not shown).

F is the (weighted) harmonic mean of precision (P) and recall (R), given by F_B = ((B^2 + 1) PR) / (B^2 P + R)
When B is set to 1 we get F1 = 2PR / (P+R)

In [7]:
# Per-sample class probabilities for the test set: one row per sample, one column per class.
model.predict_proba(X=X_test)

array([[9.86158406e-01, 1.38415941e-02],
       [3.12377632e-02, 9.68762237e-01],
       [5.40661056e-02, 9.45933894e-01],
       [9.65063730e-01, 3.49362701e-02],
       [7.50733349e-01, 2.49266651e-01],
       [6.17347194e-01, 3.82652806e-01],
       [9.73540877e-01, 2.64591226e-02],
       [9.49533045e-01, 5.04669545e-02],
       [1.97584360e-03, 9.98024156e-01],
       [9.33570976e-01, 6.64290241e-02],
       [3.96715904e-02, 9.60328410e-01],
       [9.25722402e-01, 7.42775980e-02],
       [4.43096787e-02, 9.55690321e-01],
       [3.24707474e-01, 6.75292526e-01],
       [4.19892463e-01, 5.80107537e-01],
       [7.58418540e-01, 2.41581460e-01],
       [8.25171668e-01, 1.74828332e-01],
       [9.43196719e-01, 5.68032810e-02],
       [8.52324786e-01, 1.47675214e-01],
       [9.41636579e-01, 5.83634208e-02],
       [2.14623553e-01, 7.85376447e-01],
       [8.07445119e-01, 1.92554881e-01],
       [7.64889656e-03, 9.92351103e-01],
       [6.77903759e-09, 9.99999993e-01],
       [9.652053