In [1]:
import numpy as np
import pandas as pd
import random
import copy

In [2]:
df = pd.read_csv('diabetes.csv')

In [3]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
pd.value_counts(df['Outcome'])

0    500
1    268
Name: Outcome, dtype: int64

In [5]:
# feature scaling
def minMaxScaler(x):
    minValues = []
    maxValues = []
    for i in range(len(x[0])):
        data = []
        for j in range(len(x)):
            data.append(x[j][i])
        minValues.append(min(data))
        maxValues.append(max(data))
    
    for i in range(len(x[0])):
        for j in range(len(x)):
            numer = x[j][i] - minValues[i]
            denom = maxValues[i] - minValues[i]
            x[j][i] = numer / denom
    
    return x
    

In [6]:
x = minMaxScaler(df.values)

In [7]:
x[0]

array([0.35294118, 0.74371859, 0.59016393, 0.35353535, 0.        ,
       0.50074516, 0.23441503, 0.48333333, 1.        ])

In [8]:
# train test split
def cross_validation(dataset, k=5):
    dataset_copy = list(dataset)
    size = len(dataset) // k
    folds = []
    for i in range(k):
        fold = []
        while len(fold) < size:
            index = random.randrange(0, len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        folds.append(fold)
    return folds

In [9]:
def predict(row, coef):
    x = coef[0]
    for i in range(len(row)):
        x += coef[i+1] * row[i]
    return 1 / (1 + np.exp(-x))

In [10]:
def accuracy(y, y_pred):
    count = 0
    for i in range(len(y)):
        if y[i] == y_pred[i]:
            count += 1
    return count / len(y) * 100

In [101]:
def gradientDescent(x_train, y_train, epochs, alpha):
    coef = np.zeros(x_train.shape[1] + 1)
    n = len(x_train)
    for epoch in range(epochs):
        for i in range(len(x_train)):
            y_pred = predict(x_train[i], coef)
            loss = y_train[i] - y_pred
            coef[0] = coef[0] - ((2/n) * loss) * alpha
            for j in range(x_train.shape[1]):
#                 print(loss, coef[j+1], x_train[j])
                coef[j+1] = coef[j+1] - ((2/n) * np.dot(loss, x_train[i][j])) * alpha
    return coef

In [102]:
def logisitc(x_train, y_train, x_test, y_test, epochs, alpha):
    coef = gradientDescent(x_train, y_train, epochs, alpha)
    predictions = []
    for row in x_test:
        y_pred = predict(row, coef)
        predictions.append(round(y_pred))
    acc = accuracy(y_test, predictions)
    return acc

In [107]:
def evaluate(dataset, epochs, alpha):
    folds = cross_validation(dataset)
    accuracies = []
    for i in range(len(folds)):
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        train = folds.copy()
        train.pop(i)
        test = folds[i].copy()
        for fold in train:
            for data in fold:
                x_train.append(data[:-1])
                y_train.append(data[-1])
        
        for data in test:
            x_test.append(data[:-1])
            y_test.append(data[-1])
    
        x_train = np.asarray(x_train)
        x_test = np.asarray(x_test)
        y_train = np.asarray(y_train)
        y_test = np.asarray(y_test)
        
        acc = logisitc(x_train, y_train, x_test, y_test, epochs, alpha)
        print("Accuracy for fold_{} is : {}".format(i, acc))
        accuracies.append(acc)
    return accuracies

In [109]:
epochs = 1000
alpha = 0.01
evaluate(x, epochs, alpha)

Accuracy for fold_0 is : 33.33333333333333
Accuracy for fold_1 is : 37.908496732026144
Accuracy for fold_2 is : 29.411764705882355
Accuracy for fold_3 is : 35.294117647058826
Accuracy for fold_4 is : 37.908496732026144


[33.33333333333333,
 37.908496732026144,
 29.411764705882355,
 35.294117647058826,
 37.908496732026144]

In [18]:
folds = cross_validation(x)

In [24]:
len(folds)

5

In [23]:
len(folds[0])

153

In [21]:
folds[0][0]

array([0.29411765, 0.69849246, 0.6557377 , 0.35353535, 0.1891253 ,
       0.4709389 , 0.12083689, 0.06666667, 1.        ])

In [42]:
train = folds.copy()
train.pop(0)
test = folds[0].copy()

In [45]:
train[0][0]

array([0.11764706, 0.67336683, 0.57377049, 0.        , 0.        ,
       0.43070045, 0.19812126, 0.03333333, 1.        ])

In [47]:
test[0]

array([0.29411765, 0.69849246, 0.6557377 , 0.35353535, 0.1891253 ,
       0.4709389 , 0.12083689, 0.06666667, 1.        ])

In [92]:
i = np.array([0.11764706, 0.6281407,  0.49180328, 0.2020202,  0.16548463, 0.50372578,
 0.00426985, 0.16666667])

In [96]:

np.dot(0.5, i)

array([0.05882353, 0.31407035, 0.24590164, 0.1010101 , 0.08274231,
       0.25186289, 0.00213493, 0.08333333])