In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
# Load the dataset from the working directory.
data = pd.read_csv('./diabetes.csv')

# Correlation of every column with the target column 'Outcome'
# (DataFrame.corr computes Pearson correlation by default).
correlations = data.corr()['Outcome']

# Keep only the columns whose absolute correlation with the target is at
# least 0.1.
features = correlations[abs(correlations) >= 0.1]

# Series.iteritems() was removed in pandas 2.0; the labels live on the index,
# so read them from there. 'Outcome' correlates perfectly with itself and is
# therefore always selected -- drop it so only predictors remain.
featureNames = [name for name in features.index if name != 'Outcome']

In [3]:
from sklearn.model_selection import train_test_split

# Design matrix (selected predictors) and target vector.
mX = data[featureNames]
my = data['Outcome']

# Per-column mean and population standard deviation (ddof=0, which is what
# np.std uses by default). The axis is given explicitly because
# np.mean(DataFrame) / np.std(DataFrame) forward axis=None to pandas, and
# recent pandas versions reduce over BOTH axes in that case, yielding a single
# scalar instead of one statistic per feature.
featuresMean = mX.mean(axis = 0)
featuresSigma = mX.std(axis = 0, ddof = 0)

# Standardise every feature to zero mean / unit variance.
# NOTE(review): the statistics are computed on the full dataset before the
# train/test split below, so the test rows influence the scaling.
mX = (mX - featuresMean) / featuresSigma

# Prepend a constant column so the first parameter acts as the intercept.
mX.insert(0, 'BIAS', np.ones(len(mX)))

# 70/30 train/test split with a fixed seed for reproducibility.
X, X_test, y, y_test = train_test_split(mX, my, test_size = 0.3, random_state = 3)

# m = number of training rows, n = number of parameters (bias included).
(m, n) = X.shape

In [4]:
def hypothesis(theta, x):
    """Logistic-regression hypothesis: sigmoid of the linear score ``x . theta``.

    Parameters
    ----------
    theta : array-like
        Parameter vector.
    x : array-like
        Feature row(s); combined with ``theta`` through ``np.dot``.

    Returns
    -------
    numpy.ndarray or numpy.float64
        ``1 / (1 + exp(-z))`` evaluated element-wise on ``z = np.dot(x, theta)``.
    """
    z = np.asarray(np.dot(x, theta), dtype = np.float64)

    # exp() is only ever applied to a non-positive number, so it cannot
    # overflow: for z >= 0 use 1 / (1 + e^-z), for z < 0 use the algebraically
    # equal e^z / (1 + e^z).
    decay = np.exp(-np.abs(z))
    h = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with () unwraps a 0-d result into a scalar (as the plain
    # formula would return for 1-D inputs) and is a no-op view otherwise.
    return h[()]

In [5]:
def costFunction(theta):
    """Mean cross-entropy (log-loss) of ``theta`` on the training split.

    Reads the module-level training data ``X`` / ``y`` and the training-row
    count ``m``.

    Parameters
    ----------
    theta : array-like
        Parameter vector to evaluate.

    Returns
    -------
    numpy.float64
        ``-(1/m) * sum(y*log(h) + (1-y)*log(1-h))`` with ``h = hypothesis(theta, X)``.
    """
    h = hypothesis(theta, X)

    # Keep probabilities strictly inside (0, 1) so both logarithms are finite.
    # Clipping (rather than skipping saturated rows) means a confidently wrong
    # prediction still contributes a large penalty instead of vanishing from
    # the sum.
    eps = np.finfo(np.float64).eps
    h = np.clip(h, eps, 1 - eps)

    labels = np.asarray(y, dtype = np.float64)
    J = (-1 / m) * np.sum(labels * np.log(h) + (1 - labels) * np.log(1 - h))
    return J

In [6]:
def differentialCostFunction(theta):
    """Un-normalised gradient of the cost with respect to ``theta``.

    For every parameter this sums ``(h - y) * x`` over the rows of the
    module-level training data ``X`` / ``y``, where ``h = hypothesis(theta, X)``.
    The caller is responsible for any ``1/m`` scaling.
    """
    residual = hypothesis(theta, X) - y

    # Weight each (transposed) feature row by the per-sample residual ...
    weighted = residual.T * X.T

    # ... and collapse the sample axis, leaving one entry per parameter.
    return np.sum(weighted, axis = 1)

In [7]:
def gradientDescent(alpha = 0.01, tolerance = 0.00001, maxIterations = None):
    """Fit the logistic-regression parameters by batch gradient descent.

    Starts from the zero vector of length ``n`` (module-level) and repeatedly
    applies ``theta - (alpha / m) * differentialCostFunction(theta)``.

    Parameters
    ----------
    alpha : float, optional
        Learning rate. Defaults to 0.01.
    tolerance : float, optional
        Iteration stops once a step changes the cost by no more than this
        amount. Defaults to 0.00001.
    maxIterations : int or None, optional
        Upper bound on the number of accepted steps; ``None`` (default) means
        no bound.

    Returns
    -------
    The last accepted parameter vector. The candidate step that triggers a
    stopping condition is not applied.
    """
    theta = np.zeros(n, dtype=np.float64)
    J = costFunction(theta)
    iterations = 0

    while maxIterations is None or iterations < maxIterations:
        newTheta = theta - (alpha / m) * differentialCostFunction(theta)
        newJ = costFunction(newTheta)

        # Stop when the cost is not a finite number, stops decreasing, or
        # changes by no more than the tolerance. The finiteness test matters:
        # every comparison with NaN is False, so without it a NaN cost would
        # be accepted forever and the loop would never end.
        if not np.isfinite(newJ) or newJ >= J or abs(newJ - J) <= tolerance:
            break

        theta = newTheta
        J = newJ
        iterations += 1

    return theta

In [8]:
# Run gradient descent with its built-in settings and keep the resulting
# parameter vector for the evaluation step.
theta = gradientDescent()

# Show the fitted parameters.
print(f'Parameters obtained by gradient descent : {theta}')

In [10]:
# Decision boundary: probabilities above it are labelled 1, the rest 0.
threshold = 0.5
predictions = hypothesis(theta, X_test)

# Turn probabilities into hard labels in one pass. np.select takes the first
# matching condition; an entry matching neither (NaN) is left unchanged.
predictions = np.select(
    [predictions > threshold, predictions <= threshold],
    [1.0, 0.0],
    default = predictions,
)

# A zero difference means the predicted label equals the true label.
dif = predictions - y_test

correct = int((dif == 0).sum())

print(f'Efficiency of the model {correct * 100 / len(y_test)}%')

Efficiency of the model 73.5930735930736%
