# Testing various ML algorithms on Pima Indian Diabetes dataset

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn import metrics
%matplotlib inline

col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
pima = pd.read_csv("diabetes.csv")
pima.columns = col_names

In [2]:
pima.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols]
y = pima.label

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.33, random_state=0)

# Gaussian Naive Bayes Classifier

In [4]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(Xtrain, ytrain)

ypred = clf.predict(Xtest)
# (true labels, predicted)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))
# (true labels, predicted)
print("Confusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
# (predicted, true labels)
print("Classification Report:\n", metrics.classification_report(ypred, ytest))

Accuracy: 0.7598425196850394
Confusion Matrix:
 [[150  20]
 [ 41  43]]
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.79      0.83       191
           1       0.51      0.68      0.59        63

   micro avg       0.76      0.76      0.76       254
   macro avg       0.70      0.73      0.71       254
weighted avg       0.79      0.76      0.77       254



# Multinomial Naive Bayes Classifier

In [5]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(Xtrain, ytrain)

ypred = clf.predict(Xtest)
# (true labels, predicted)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))
# (true labels, predicted)
print("Confusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
# (predicted, true labels)
print("Classification Report:\n", metrics.classification_report(ypred, ytest))

Accuracy: 0.6062992125984252
Confusion Matrix:
 [[122  48]
 [ 52  32]]
Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.70      0.71       174
           1       0.38      0.40      0.39        80

   micro avg       0.61      0.61      0.61       254
   macro avg       0.55      0.55      0.55       254
weighted avg       0.61      0.61      0.61       254



# Bernoulli Naive Bayes Classifier

In [6]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
clf.fit(Xtrain, ytrain)

ypred = clf.predict(Xtest)
# (true labels, predicted)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))
# (true labels, predicted)
print("Confusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
# (predicted, true labels)
print("Classification Report:\n", metrics.classification_report(ypred, ytest))

Accuracy: 0.6653543307086615
Confusion Matrix:
 [[169   1]
 [ 84   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.67      0.80       253
           1       0.00      0.00      0.00         1

   micro avg       0.67      0.67      0.67       254
   macro avg       0.50      0.33      0.40       254
weighted avg       0.99      0.67      0.80       254



# K-Nearest Neighbors Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(Xtrain, ytrain)

ypred = clf.predict(Xtest)
# (true labels, predicted)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))
# (true labels, predicted)
print("Confusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
# (predicted, true labels)
print("Classification Report:\n", metrics.classification_report(ypred, ytest))

Accuracy: 0.7086614173228346
Confusion Matrix:
 [[138  32]
 [ 42  42]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.77      0.79       180
           1       0.50      0.57      0.53        74

   micro avg       0.71      0.71      0.71       254
   macro avg       0.66      0.67      0.66       254
weighted avg       0.72      0.71      0.71       254



# Multinomial Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
clf.fit(Xtrain, ytrain)

ypred = clf.predict(Xtest)
# (true labels, predicted)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))
# (true labels, predicted)
print("Confusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
# (predicted, true labels)
print("Classification Report:\n", metrics.classification_report(ypred, ytest))

Accuracy: 0.7834645669291339
Confusion Matrix:
 [[154  16]
 [ 39  45]]
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.80      0.85       193
           1       0.54      0.74      0.62        61

   micro avg       0.78      0.78      0.78       254
   macro avg       0.72      0.77      0.73       254
weighted avg       0.82      0.78      0.79       254





# Support Vector Classifier

In [9]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')
clf.fit(Xtrain, ytrain)

ypred = clf.predict(Xtest)
# (true labels, predicted)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))
# (true labels, predicted)
print("Confusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
# (predicted, true labels)
print("Classification Report:\n", metrics.classification_report(ypred, ytest))

Accuracy: 0.6692913385826772
Confusion Matrix:
 [[170   0]
 [ 84   0]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.67      0.80       254
           1       0.00      0.00      0.00         0

   micro avg       0.67      0.67      0.67       254
   macro avg       0.50      0.33      0.40       254
weighted avg       1.00      0.67      0.80       254



  'recall', 'true', average, warn_for)


# Multi-layer Neural Networks

In [10]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, 
                    hidden_layer_sizes=(10, 5), random_state=1)

clf.fit(Xtrain, ytrain)

ypred = clf.predict(Xtest)
# (true labels, predicted)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))
# (true labels, predicted)
print("Confusion Matrix:\n", metrics.confusion_matrix(ytest, ypred))
# (predicted, true labels)
print("Classification Report:\n", metrics.classification_report(ypred, ytest))

Accuracy: 0.7440944881889764
Confusion Matrix:
 [[147  23]
 [ 42  42]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.78      0.82       189
           1       0.50      0.65      0.56        65

   micro avg       0.74      0.74      0.74       254
   macro avg       0.68      0.71      0.69       254
weighted avg       0.77      0.74      0.75       254

