# Diabetes prediction with Logistic Regression, trained by a self-written gradient descent algorithm

In [390]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

### Fetching the dataset and selecting features according to a fixed correlation threshold

In [391]:
data = pd.read_csv('./diabetes.csv')

# Correlation of every column with the target column 'Outcome'.
correlations = data.corr()['Outcome']

# Keep only the columns whose absolute correlation with the target reaches the threshold.
corThreshold = 0.1
features = correlations[abs(correlations) >= corThreshold]

# 'Outcome' correlates perfectly with itself, so it passes the filter and has to be
# excluded from the predictor list. Iterating over the index replaces
# Series.iteritems(), which no longer exists in pandas 2.x.
featureNames = [name for name in features.index if name != 'Outcome']

In [392]:
# Column dtypes and non-null counts of the raw frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [393]:
# Summary statistics per column.
# NOTE(review): the output shows a minimum of 0 for Glucose, BloodPressure, SkinThickness,
# Insulin and BMI - presumably placeholders for missing measurements; confirm and consider imputing.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### Normalizing and splitting the dataset

In [394]:
from sklearn.model_selection import train_test_split

mX = data[featureNames]
my = data['Outcome']

# Per-column statistics. The axis is spelled out because np.mean / np.std on a DataFrame
# forward axis=None, which newer pandas versions reduce over the whole frame (one scalar)
# instead of per column. ddof=0 keeps the population standard deviation that np.std uses.
featuresMean = mX.mean(axis=0)
featuresSigma = mX.std(axis=0, ddof=0)

# Z-score standardisation.
# NOTE(review): the statistics are computed over all rows, i.e. including the rows that
# end up in the test split below - consider fitting them on the training split only.
mX = (mX - featuresMean) / featuresSigma

# Constant column so that the first parameter acts as the intercept.
mX.insert(0, 'BIAS', np.ones(len(mX)))

X, X_test, y, y_test = train_test_split(mX, my, test_size = 0.3, random_state = 3)

# m = number of training rows, n = number of parameters (features + bias).
(m, n) = X.shape

### Hypothesis function which predicts the probabilities (Sigmoid function)

In [395]:
def hypothesis(theta, x) : 
    """Logistic (sigmoid) hypothesis: sigmoid(x . theta).

    Parameters
    ----------
    theta : parameter vector.
    x : feature rows (one row per sample); must be compatible with np.dot(x, theta).

    Returns
    -------
    The sigmoid of np.dot(x, theta), element-wise, with values in [0, 1].
    """
    z = np.dot(x, theta)
    # For very negative z, exp(-z) overflows to inf and the quotient correctly becomes 0.
    # Only the RuntimeWarning is spurious, so it is silenced locally; results are unchanged.
    with np.errstate(over='ignore') : 
        h = 1 / (1 + np.exp(-z))
    return h

### Cost function

In [396]:
def costFunction(theta) : 
    """Mean binary cross-entropy of the model on the training split.

    Reads the module-level training data X, y and the row count m.

    Parameters
    ----------
    theta : parameter vector passed on to hypothesis().

    Returns
    -------
    J = -(1/m) * sum(y * log(h) + (1 - y) * log(1 - h)), with h = hypothesis(theta, X).
    """
    # Saturated predictions (exactly 0 or 1) would produce log(0). Clipping keeps them
    # in the sum with a large finite penalty instead of dropping them, which would let
    # a confidently wrong sample contribute nothing to the cost.
    eps = np.finfo(np.float64).eps
    h = np.clip(hypothesis(theta, X), eps, 1 - eps)
    J = (-1 / m) * np.sum((y * np.log(h)) + ((1 - y) * np.log(1 - h)))
    return J

In [397]:
def differentialCostFunction(theta) : 
    """Unscaled gradient of the cost with respect to theta on the training split.

    For every parameter j this sums (h_i - y_i) * X_ij over the training rows i.
    The 1/m factor is NOT applied here; gradientDescent() applies it in its update step.
    Reads the module-level training data X and y.
    """
    # Prediction error per training row (carries y's row index - presumably a Series).
    residuals = hypothesis(theta, X) - y
    # X.T has one row per parameter and one column per training row, so the
    # residuals line up with its columns and scale each sample's features.
    weighted = residuals * X.T
    d = weighted.sum(axis = 1)
    return d

### Gradient descent, which moves the parameters towards the global minimum of the cost function

In [398]:
def gradientDescent(alpha = 0.01, tolerance = 0.00001, maxIterations = 100000) : 
    """Batch gradient descent on costFunction, starting from theta = 0.

    Parameters
    ----------
    alpha : learning rate.
    tolerance : stop once an accepted step improves the cost by no more than this.
    maxIterations : hard upper bound on the number of update steps.

    Returns
    -------
    The last accepted parameter vector (the all-zero start vector if no step was accepted).

    Notes
    -----
    With the defaults the cap does not change a normal run: the cost starts at ln 2 for
    theta = 0, every accepted step lowers it by more than `tolerance`, and for 0/1 labels
    it cannot go below 0, so fewer than 70000 steps can ever be accepted.
    """
    theta = np.zeros(n, dtype=np.float64)
    J = costFunction(theta)
    for _ in range(maxIterations) : 
        newTheta = theta - (alpha / m) * differentialCostFunction(theta)
        newJ = costFunction(newTheta)
        # A NaN/inf cost makes both comparisons below false, which previously kept the
        # loop running forever; treat it as a failed step and stop.
        if not np.isfinite(newJ) : 
            break
        # Stop when the step made things worse or the improvement is negligible;
        # in both cases the candidate step is discarded.
        if newJ >= J or abs(newJ - J) <= tolerance : 
            break
        theta = newTheta
        J = newJ
    return theta

In [399]:
# Fit the parameters on the training split and show them.
theta = gradientDescent()

print(f'Parameters obtained by gradient descent : \n{theta}')

Parameters obtained by gradient descent : 
BIAS                       -0.818282
Pregnancies                 0.292493
Glucose                     0.886484
Insulin                     0.007025
BMI                         0.501330
DiabetesPedigreeFunction    0.291948
Age                         0.201066
dtype: float64


### Predicting the labels of the test set using a decision threshold

In [400]:
# Probability above which a sample is labelled positive.
# NOTE(review): 0.58 is a hand-picked value - if it was tuned on the test split, the
# score below is optimistic; confirm.
threshold = 0.58

# Keep the raw probabilities in their own variable instead of overwriting them in place,
# so they stay available for threshold-free metrics such as a ROC curve.
probabilities = hypothesis(theta, X_test)

# One comparison yields the 0/1 labels (kept as floats, as before), replacing two
# successive in-place masks whose result depended on their order.
predictions = (probabilities > threshold).astype(np.float64)

# A difference of 0 means the predicted label equals the true label.
dif = predictions - y_test

correct = len(dif[dif == 0])

# The printed figure is the share of correctly classified test rows (accuracy).
print(f'Efficiency of the model {correct * 100 / len(y_test)}%')

Efficiency of the model 74.02597402597402%


### Some classification metrics

In [401]:
from sklearn.metrics import confusion_matrix, precision_score, roc_curve

# Confusion matrix of the thresholded test predictions
# (rows: true class, columns: predicted class).
con = confusion_matrix(y_test, predictions)
print(f'Confusion matrix : \n{con}')

# Precision: share of predicted positives that are truly positive.
pre = precision_score(y_test, predictions)
print(f'Precision : {pre}')

Confusion matrix : 
[[121  12]
 [ 48  50]]
Precision : 0.8064516129032258
