**Preprocessing**

In [234]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics

#Import scikit-learn metrics module for accuracy calculation

In [235]:
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
# load dataset
pima = pd.read_csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv", header=None, names=col_names)

In [236]:
pima.head()

Unnamed: 0,pregnant,glucose,bp,skin,insulin,bmi,pedigree,age,label
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [237]:
#split dataset in features and target variable
feature_cols = ['pregnant', 'insulin', 'bmi', 'age','glucose','bp','pedigree']
X = pima[feature_cols] # Features
y = pima.label # Target variable

In [238]:
# Check columns with zero values
print("Total number of rows: {0}".format(len(pima)))
print("Number of rows missing Pregnancies: {0}".format(len(pima.loc[pima['pregnant']==0])))
print("Number of rows missing Glucose: {0}".format(len(pima.loc[pima['glucose']==0])))
print("Number of rows missing BloodPressure: {0}".format(len(pima.loc[pima['bp']==0])))
print("Number of rows missing SkinThickness: {0}".format(len(pima.loc[pima['skin']==0])))
print("Number of rows missing Insulin: {0}".format(len(pima.loc[pima['insulin']==0])))
print("Number of rows missing BMI: {0}".format(len(pima.loc[pima['bmi']==0])))
print("Number of rows missing DiabetesPedigreeFunction: {0}".format(len(pima.loc[pima['pedigree']==0])))
print("Number of rows missing Age: {0}".format(len(pima.loc[pima['age']==0])))


Total number of rows: 768
Number of rows missing Pregnancies: 111
Number of rows missing Glucose: 5
Number of rows missing BloodPressure: 35
Number of rows missing SkinThickness: 227
Number of rows missing Insulin: 374
Number of rows missing BMI: 11
Number of rows missing DiabetesPedigreeFunction: 0
Number of rows missing Age: 0


In [239]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1) # 70% training and 30% test|

In [240]:
# Imputing zeros values in the dataset
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=0, strategy='mean')
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

**Decision tree classifier**

In [268]:

# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8181818181818182


In [269]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
metrics.confusion_matrix(y_test,y_pred)

cm=confusion_matrix(y_test, y_pred)
from sklearn.metrics import classification_report
print("\nCR by library method=\n\n\n",classification_report(y_test, y_pred))


CR by library method=


               precision    recall  f1-score   support

           0       0.85      0.86      0.86       146
           1       0.76      0.74      0.75        85

    accuracy                           0.82       231
   macro avg       0.81      0.80      0.80       231
weighted avg       0.82      0.82      0.82       231



**SVM**

In [270]:
from sklearn.svm import SVC

In [271]:
# Instantiate the Support Vector Classifier (SVC)
svc = SVC(C=11.0, random_state=11, kernel='poly')
 
# Fit the model
svc.fit(X_train, y_train)

# Make the predictions
y_predict = svc.predict(X_test)

# Measure the performance
print("Accuracy score %.3f" %metrics.accuracy_score(y_test, y_predict))


Accuracy score 0.779


In [272]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
metrics.confusion_matrix(y_test,y_predict)

cm=confusion_matrix(y_test, y_predict)
from sklearn.metrics import classification_report
print("\nCR by library method=\n\n\n",classification_report(y_test, y_predict))


CR by library method=


               precision    recall  f1-score   support

           0       0.77      0.94      0.84       146
           1       0.83      0.51      0.63        85

    accuracy                           0.78       231
   macro avg       0.80      0.72      0.74       231
weighted avg       0.79      0.78      0.76       231



**Logistic Regression**

In [273]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# define the model
model = LogisticRegression()
# fit on the training set
model.fit(X_train, y_train)
# predict on test set
yhat = model.predict(X_test)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

Accuracy: 78.79


In [274]:
metrics.confusion_matrix(y_test,yhat)

cm=confusion_matrix(y_test, yhat)
from sklearn.metrics import classification_report
print("\nCR by library method=\n\n\n",classification_report(y_test, yhat))


CR by library method=


               precision    recall  f1-score   support

           0       0.79      0.90      0.84       146
           1       0.77      0.60      0.68        85

    accuracy                           0.79       231
   macro avg       0.78      0.75      0.76       231
weighted avg       0.79      0.79      0.78       231



**Trying xgboost for better accuarcy**

In [275]:
!pip install xgboost



In [276]:
from numpy import loadtxt
from xgboost import XGBClassifier

# fit model no training data185

xg_reg = XGBClassifier( learning_rate =0.9,
                       n_estimators=220,
                       max_depth=7,
                       min_child_weight=1,
                       gamma=7
                       ,subsample=0.8,
                       colsample_bytree=0.8, 
                       objective= 'binary:logistic',
                       nthread=8, 
                       scale_pos_weight=1,
                       seed=27)
xg_reg.fit(X_train, y_train)
#print(xg_reg)
preds = xg_reg.predict(X_test) 
cm=confusion_matrix(y_test,preds)
print("\nCR by library method=\n\n\n",
          classification_report(y_test,preds))


CR by library method=


               precision    recall  f1-score   support

           0       0.85      0.86      0.85       146
           1       0.75      0.74      0.75        85

    accuracy                           0.81       231
   macro avg       0.80      0.80      0.80       231
weighted avg       0.81      0.81      0.81       231



**The best accuarcy and recall was for Decision tree classifier then XGboost**