# Diabetes Classification

In [1]:
#importing the necessary packages
import numpy as np
import pandas as pd

In [2]:
#reading the data
data=np.genfromtxt('diabetes.csv',dtype='str',delimiter=',')
header=data[0]
data=data[1:].astype('float64')
x=data[:,:data.shape[1]-1]
Y=data[:,data.shape[1]-1].reshape(data.shape[0],1)

The dataset consists of data from women above the age of 21

In [3]:
print(data.shape)

(768, 9)


The data consists of 768 rows and 9 columns.

In [4]:
print("The features used for predicting the labels of data are")
print(header[:len(header)-1])

The features used for predicting the labels of data are
['Pregnancies' 'Glucose' 'BloodPressure' 'SkinThickness' 'Insulin' 'BMI'
 'DiabetesPedigreeFunction' 'Age']


In [5]:
#Preprocessing the data
MIN=x.min(axis=0)
MAX=x.max(axis=0)
X=(x-MIN)/(MAX-MIN)

In [6]:
#to calculate accuracy
def calculate_accuracy(predicted_Y, actual_Y):
    c=0
    for i in range(0,len(predicted_Y),1):
        if(predicted_Y[i]==actual_Y[i]):
            c=c+1
    acc=c/len(predicted_Y)
    return acc

In [7]:
#to calculate F1 score
def F1_score(y_pred,y):
    tp=0
    fn=0
    fp=0
    for i in range(0,len(y_pred),1):
        if(y_pred[i]==y[i] and y_pred[i]==1):
            tp=tp+1
        elif(y_pred[i]==0 and y[i]==1):
            fn=fn+1
        elif(y_pred[i]==1 and y[i]==0):
            fp=fp+1
    if(tp==0):
        return 0
    precision=tp/(tp+fp)
    recall=tp/(tp+fn)
    misclassifications=(fp+fn)
    return(2*precision*recall/(precision+recall),misclassifications,precision,recall)

**For this classification problem 4 algorithms are used namely KNN, Logistic Regression, SVM and Decision Tree**

# KNN

In [8]:
#splitting the data into train, validation and test sets
test_x=X[int(0.8*X.shape[0]):]
test_y=Y[int(0.8*X.shape[0]):]
train_x=X[:int(0.8*X.shape[0])]
train_y=Y[:int(0.8*X.shape[0])]
l=train_x.shape[0]
val_X=train_x[int(l*0.8):]
val_Y=train_y[int(l*0.8):]
train_X=train_x[:int(l*0.8)]
train_Y=train_y[:int(l*0.8)]

In [9]:
#to compute the similarity between two vectors
def compute_ln_norm_distance(vector1, vector2, n):
    l_norm=0;
    for i in range(0,len(vector1),1):
        l_norm=l_norm+pow(abs(vector1[i]-vector2[i]),n)
    l_norm=pow(l_norm,1/n)
    return(l_norm)

In [10]:
#to find the k nearest neighbors of a given test example
def find_k_nearest_neighbors(train_X, test_example, k, n_in_ln_norm_distance):
    d=[]
    for i in range(0,len(train_X),1):
        l=compute_ln_norm_distance(train_X[i],test_example,n_in_ln_norm_distance)
        d=d+[[l,i]]
    d.sort()
    p=[]
    for i in range(0,k,1):
        p=p+[d[i][1]]
    return p

In [11]:
#to classify the points based on the most common value of the k nearest neighbors
def classify_points_using_knn(train_X, train_Y, test_X, n_in_ln_norm_distance, k):
    y=[]
    for i in range(0,len(test_X),1):
        k_i=find_k_nearest_neighbors(train_X,test_X[i],k,n_in_ln_norm_distance)
        K=[]
        for j in k_i:
            K=K+[train_Y[j]]
        p=np.unique(K,return_counts=True)
        p=p[0][np.argmax(p[1])]
        y=y+[p]
    return y

In [12]:
#to find the best value of k using the validation set
F1=0
k=0
n_in_ln_norm_distance=2
for i in range(1,30,1):
    print("For k =",i)
    y_pred=classify_points_using_knn(train_X,train_Y,val_X,n_in_ln_norm_distance,i)
    f1,mis,pre,rec=F1_score(y_pred,val_Y)
    print("F1 score =",f1)
    if(f1>F1):
        F1=f1
        k=i

For k = 1
F1 score = 0.5454545454545454
For k = 2
F1 score = 0.5416666666666666
For k = 3
F1 score = 0.6875
For k = 4
F1 score = 0.679245283018868
For k = 5
F1 score = 0.7076923076923077
For k = 6
F1 score = 0.5660377358490566
For k = 7
F1 score = 0.6551724137931034
For k = 8
F1 score = 0.6181818181818182
For k = 9
F1 score = 0.5573770491803278
For k = 10
F1 score = 0.5185185185185185
For k = 11
F1 score = 0.6031746031746033
For k = 12
F1 score = 0.5714285714285715
For k = 13
F1 score = 0.6
For k = 14
F1 score = 0.5862068965517241
For k = 15
F1 score = 0.6101694915254238
For k = 16
F1 score = 0.6206896551724138
For k = 17
F1 score = 0.6229508196721312
For k = 18
F1 score = 0.6071428571428571
For k = 19
F1 score = 0.6333333333333334
For k = 20
F1 score = 0.6206896551724138
For k = 21
F1 score = 0.6229508196721312
For k = 22
F1 score = 0.631578947368421
For k = 23
F1 score = 0.6333333333333334
For k = 24
F1 score = 0.6071428571428571
For k = 25
F1 score = 0.6440677966101696
For k = 26
F1

In [13]:
print("The optimum value of k is")
print(k)

The optimum value of k is
5


In [14]:
#predicting the labels of the test set
predicted_test_y=classify_points_using_knn(train_x,train_y,test_x,2,k)

In [15]:
#calculating accuracy and F1_score
accuracy_knn=calculate_accuracy(predicted_test_y,test_y)
F1_knn,mis_knn,pre_knn,rec_knn=F1_score(predicted_test_y,test_y)

In [16]:
print("Accuracy =",accuracy_knn)
print("F1 score =",F1_knn)
print("Precision =",pre_knn)
print("Recall =",rec_knn)
print("No. of misclassifications =",mis_knn)

Accuracy = 0.7337662337662337
F1 score = 0.5858585858585857
Precision = 0.6590909090909091
Recall = 0.5272727272727272
No. of misclassifications = 41


# Logistic Regression

In [17]:
test_x=x[int(0.8*X.shape[0]):]
test_y=Y[int(0.8*X.shape[0]):]
train_x=x[:int(0.8*X.shape[0])]
train_y=Y[:int(0.8*X.shape[0])]
l=train_x.shape[0]
val_X=train_x[int(l*0.8):]
val_Y=train_y[int(l*0.8):]
train_X=train_x[:int(l*0.8)]
train_Y=train_y[:int(l*0.8)]

In [18]:
#to calucate the sigmoid of any given vector
#the sigmoid function is given by sigmoid(x)=1/(1+e^(-x))
def sigmoid(Z):
    a=np.exp(-Z)
    a=a+1
    b=1/a
    return b

In [19]:
#to compute the cost function
def compute_cost(X, Y, W, b,Lambda):
    y=sigmoid(np.dot(X,W)+b)
    cost=(-1)*(Y*np.log(y)+(1-Y)*np.log(1-y))
    c=np.sum(cost)+(Lambda/2)*np.dot(W.T,W)
    c=c/X.shape[0]
    return c[0][0]

In [20]:
#to compute the gradient of the cost function
def compute_gradient_of_cost_function(X, Y, W, b,Lambda):
    a=sigmoid(np.dot(X,W)+b)
    z=a-Y
    db=np.sum(z)
    db=db/X.shape[0]
    dW=np.dot(X.T,z)
    dW=dW+Lambda*W
    dW=dW/X.shape[0]
    return(dW,db)

In [21]:
#to predict the labels
def predict_labels(X, W, b):
    a=sigmoid(np.dot(X,W)+b)
    a=np.where(a>=0.5,1,0)
    return a

In [22]:
#gradient descent:used to find the optimum value of the parameters W and b
def gradient_descent(X,Y,alpha,num_iters,Lambda):
    W=np.zeros((X.shape[1],1))
    b=0
    for i in range(0,num_iters,1):
        dW,db=compute_gradient_of_cost_function(X,Y,W,b,Lambda)
        W=W-alpha*dW
        b=b-alpha*db
    return(W,b)

In [23]:
# to find the optimum value of the regularization parameter lambda
l=[0,1,3,10,30,100,1000]
F1=0
for i in l:
    print("For lambda =",i)
    W,b=gradient_descent(train_X,train_Y,0.0002,100000,i)
    pred=predict_labels(val_X,W,b)
    f1,mis,pre,rec=F1_score(pred,val_Y)
    print("F1 score =",f1)
    if(f1>F1):
        F1=f1
        Lambda=i

For lambda = 0
F1 score = 0.45614035087719296
For lambda = 1
F1 score = 0.45614035087719296
For lambda = 3
F1 score = 0.45614035087719296
For lambda = 10
F1 score = 0.45614035087719296
For lambda = 30
F1 score = 0.4642857142857143
For lambda = 100
F1 score = 0.43636363636363634
For lambda = 1000
F1 score = 0.4528301886792453


In [24]:
print("The optimum value of Lambda is")
print(Lambda)

The optimum value of Lambda is
30


In [25]:
#training the algorithm on the train + validation set using the optimum value of lambda
W,b=gradient_descent(train_x,train_y,0.0002,500000,Lambda)

In [26]:
#predicting the labels of the test set
pred_y=predict_labels(test_x,W,b)
#calculating accuracy and F1 score
accuracy_lr=calculate_accuracy(pred_y,test_y)
F1_lr,mis_lr,pre_lr,rec_lr=F1_score(pred_y,test_y)

In [27]:
print("Accuracy =",accuracy_lr)
print("F1 score =",F1_lr)
print("Precision =",pre_lr)
print("Recall =",rec_lr)
print("No. of misclassifications =",mis_lr)

Accuracy = 0.7402597402597403
F1 score = 0.5555555555555556
Precision = 0.7142857142857143
Recall = 0.45454545454545453
No. of misclassifications = 40


# SVM

In [28]:
test_x=X[int(0.8*X.shape[0]):]
test_y=Y[int(0.8*X.shape[0]):]
train_x=X[:int(0.8*X.shape[0])]
train_y=Y[:int(0.8*X.shape[0])]
l=train_x.shape[0]
val_X=train_x[int(l*0.8):]
val_Y=train_y[int(l*0.8):]
train_X=train_x[:int(l*0.8)]
train_Y=train_y[:int(l*0.8)]

In [29]:
#importing the package necessary for SVM
from sklearn.svm import SVC

In [30]:
#using the validation set to find the optimum value of C
F1=0
for i in range (1,100,1):
    print("For C =",i)
    model=SVC(C=i,kernel='rbf',gamma='scale',decision_function_shape='ovr',break_ties=True)
    model.fit(train_X,train_Y.reshape(train_Y.shape[0],))
    pred_y=model.predict(val_X)
    f1,mis,pre,rec=F1_score(pred_y,val_Y)
    print("F1 score =",f1)
    if(f1>F1):
        F1=f1
        c=i
new_model=SVC(C=c,kernel='rbf',gamma='scale',decision_function_shape='ovr',break_ties=True)
new_model=new_model.fit(train_x,train_y.reshape(train_y.shape[0],))

For C = 1
F1 score = 0.7333333333333333
For C = 2
F1 score = 0.721311475409836
For C = 3
F1 score = 0.721311475409836
For C = 4
F1 score = 0.7301587301587302
For C = 5
F1 score = 0.7076923076923077
For C = 6
F1 score = 0.7076923076923077
For C = 7
F1 score = 0.7076923076923077
For C = 8
F1 score = 0.7076923076923077
For C = 9
F1 score = 0.71875
For C = 10
F1 score = 0.6984126984126984
For C = 11
F1 score = 0.6984126984126984
For C = 12
F1 score = 0.6984126984126984
For C = 13
F1 score = 0.6984126984126984
For C = 14
F1 score = 0.6984126984126984
For C = 15
F1 score = 0.6984126984126984
For C = 16
F1 score = 0.6984126984126984
For C = 17
F1 score = 0.6984126984126984
For C = 18
F1 score = 0.7096774193548386
For C = 19
F1 score = 0.7096774193548386
For C = 20
F1 score = 0.7096774193548386
For C = 21
F1 score = 0.7096774193548386
For C = 22
F1 score = 0.7096774193548386
For C = 23
F1 score = 0.7096774193548386
For C = 24
F1 score = 0.6984126984126984
For C = 25
F1 score = 0.69841269841269

In [31]:
print("The optimum value of C is")
print(c)

The optimum value of C is
1


In [32]:
#predicting the labels of the test set
pred_y=model.predict(test_x)
#calculating the accuracy and F1 score
accuracy_svm=calculate_accuracy(pred_y,test_y)
F1_svm,mis_svm,pre_svm,rec_svm=F1_score(pred_y,test_y)

In [33]:
print("Accuracy =",accuracy_svm)
print("F1 score =",F1_svm)
print("Precision =",pre_svm)
print("Recall =",rec_svm)
print("No. of misclassifications =",mis_svm)

Accuracy = 0.7857142857142857
F1 score = 0.6857142857142857
Precision = 0.72
Recall = 0.6545454545454545
No. of misclassifications = 33


# Decision Tree

In [34]:
test_x=X[int(0.8*X.shape[0]):]
test_y=Y[int(0.8*X.shape[0]):]
train_x=X[:int(0.8*X.shape[0])]
train_y=Y[:int(0.8*X.shape[0])]
l=train_x.shape[0]
val_X=train_x[int(l*0.8):]
val_Y=train_y[int(l*0.8):]
train_X=train_x[:int(l*0.8)]
train_Y=train_y[:int(l*0.8)]

In [35]:
#definition of a node of the tree
class Node:
    def __init__(self, predicted_class, depth):
        self.predicted_class = predicted_class
        self.feature_index = -1
        self.threshold = 0
        self.depth = depth
        self.left = None
        self.right = None

In [36]:
#to predict the majority class of a leaf node
def predict_class(Y):
    c=np.unique(Y,return_counts=True)
    return c[0][np.argmax(c[1])]

In [37]:
#to split the data based on a given feature value
def split_data_set(X, Y, feature_index, threshold):
    left=np.where(X[:,feature_index]<threshold,True,False)
    right=np.where(X[:,feature_index]>=threshold,True,False)
    left_x=X[left]
    left_y=Y[left]
    right_x=X[right]
    right_y=Y[right]
    return left_x,left_y,right_x,right_y

In [38]:
#to calculate the gini index of a particular split
def calculate_gini_index(left_y,right_y):
    n_l=len(left_y)
    n_r=len(right_y)
    gini=0
    u_l=np.unique(left_y,return_counts=True)
    u_r=np.unique(right_y,return_counts=True)
    u_l=u_l[1]/n_l
    u_r=u_r[1]/n_r
    u_l=u_l**2
    u_r=u_r**2
    s_l=np.sum(u_l)
    s_r=np.sum(u_r)
    gini=(1-s_l)*(n_l/(n_l+n_r))+(1-s_r)*(n_r/(n_l+n_r))
    return gini

In [39]:
#to get optimum split in the data
def get_best_split(X, Y):
    gini=1
    for i in range(0,len(X),1):
        for j in range(0,len(X[i]),1):
            l_x,l_y,r_x,r_y=split_data_set(X,Y,j,X[i][j])
            g=calculate_gini_index(l_y,r_y)
            if((g<gini)or((g==gini)and(b_f==j)and(b_t>X[i][j]))):
                gini=g
                b_f=j
                b_t=X[i][j]
    return([b_f,b_t])

In [40]:
#to construct the decision tree
def construct_tree(X, Y, depth = 0):
    node =Node(predict_class(Y),depth)
    if(depth>=max_depth):
        return node
    if(len(np.unique(Y))==1):
        return node
    if(len(Y)<=min_size):
        return node
    b_f,b_t=get_best_split(X,Y)
    l_x,l_y,r_x,r_y=split_data_set(X,Y,b_f,b_t)
    node.feature_index=b_f
    node.threshold=b_t
    node.left=construct_tree(l_x,l_y,depth+1)
    node.right=construct_tree(r_x,r_y,depth+1)
    return node    

In [41]:
#to predict the labels of a given data set
def predict(root, X):
    if(root.feature_index==-1):
        return root.predicted_class
    if(root.threshold>X[root.feature_index]):
        p_c=predict(root.left,X)
    else:
        p_c=predict(root.right,X)
    return p_c

In [42]:
#using the validation set to choose the optimum values of the hyper parameter:maximum depth and minimum size
m_d=[5,10,20,30]
m_s=[5,10,20,30]
F1=0
for i in m_d:
    max_depth=i
    for j in m_s:
        print("For maximum depth =",i,"and minimum size =",j)
        min_size=j
        root=construct_tree(train_X,train_Y)
        pred_y=[]
        for k in range(0,val_X.shape[0],1):
            pred_y=pred_y+[predict(root,val_X[k,])]
        f1,mis,pre,rec=F1_score(pred_y,val_Y)
        print("F1 score =",f1)
        if(f1>F1):
            F1=f1
            depth=i
            size=j

For maximum depth = 5 and minimum size = 5
F1 score = 0.6285714285714286
For maximum depth = 5 and minimum size = 10
F1 score = 0.6285714285714286
For maximum depth = 5 and minimum size = 20
F1 score = 0.6849315068493151
For maximum depth = 5 and minimum size = 30
F1 score = 0.6849315068493151
For maximum depth = 10 and minimum size = 5
F1 score = 0.5797101449275361
For maximum depth = 10 and minimum size = 10
F1 score = 0.6000000000000001
For maximum depth = 10 and minimum size = 20
F1 score = 0.6478873239436619
For maximum depth = 10 and minimum size = 30
F1 score = 0.6176470588235293
For maximum depth = 20 and minimum size = 5
F1 score = 0.5479452054794521
For maximum depth = 20 and minimum size = 10
F1 score = 0.6000000000000001
For maximum depth = 20 and minimum size = 20
F1 score = 0.6301369863013698
For maximum depth = 20 and minimum size = 30
F1 score = 0.6176470588235293
For maximum depth = 30 and minimum size = 5
F1 score = 0.5479452054794521
For maximum depth = 30 and minimu

In [43]:
print("max_depth is")
print(depth)
print("min_size is")
print(size)

max_depth is
5
min_size is
20


In [44]:
# constructing the tree using the optimum values of the hyper parameters
max_depth=depth
min_size=size
root=construct_tree(train_X,train_Y)

In [45]:
#predicting the labels of the test set
pred_y=[]
for i in range(0,test_x.shape[0],1):
    pred_y=pred_y+[predict(root,test_x[i,])]
#calculating the accuracy and F1 score
accuracy_tree=calculate_accuracy(pred_y,test_y)
F1_tree,mis_tree,pre_tree,rec_tree=F1_score(pred_y,test_y)

In [46]:
print("Accuracy =",accuracy_tree)
print("F1 score =",F1_tree)
print("Precision =",pre_tree)
print("Recall =",rec_tree)
print("No. of misclassifications =",mis_tree)

Accuracy = 0.7727272727272727
F1 score = 0.7200000000000001
Precision = 0.6428571428571429
Recall = 0.8181818181818182
No. of misclassifications = 35


In [47]:
matrix={'KNN':{'Accuracy':accuracy_knn,'Precision':pre_knn,'Recall':rec_knn,'F1 Score':F1_knn,'No. of Misclassifications':mis_knn},'LR':{'Accuracy':accuracy_lr,'Precision':pre_lr,'Recall':rec_lr,'F1 Score':F1_lr,'No. of Misclassifications':mis_lr},'SVM':{'Accuracy':accuracy_svm,'Precision':pre_svm,'Recall':rec_svm,'F1 Score':F1_svm,'No. of Misclassifications':mis_svm},'DTree':{'Accuracy':accuracy_tree,'Precision':pre_tree,'Recall':rec_tree,'F1 Score':F1_tree,'No. of Misclassifications':mis_tree}}
matrix=pd.DataFrame(matrix)
print(matrix)

                                 KNN         LR        SVM      DTree
Accuracy                    0.733766   0.740260   0.785714   0.772727
Precision                   0.659091   0.714286   0.720000   0.642857
Recall                      0.527273   0.454545   0.654545   0.818182
F1 Score                    0.585859   0.555556   0.685714   0.720000
No. of Misclassifications  41.000000  40.000000  33.000000  35.000000


From the above table we can see that the Decision Tree has the highest F1 score while the SVM has the highest accuracy

From the table we also see that the Decision Tree has the highest recall.

Which means that it has the fewest false negatives.

Whereas the SVM has th highest precision.

Which means that it has the fewest false positives.

As the algorithm here is used for diabetes classification.

Therefore the Decision Tree is preferred as it has the highest recall.