In [None]:
# Building a logistic regression classifier from scratch in Python.
# yhat = exp(b0 + b1*x1 + b2*x2 + ...) / (1 + exp(b0 + b1*x1 + b2*x2 + ...))
# Simplified form: yhat = 1 / (1 + exp(-(b0 + b1*x1 + b2*x2 + ...)))

In [1]:
# Load csv
from csv import reader
def load_csv(filename):
    """Read a CSV file into a list of rows, skipping blank lines.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per non-empty line; each entry is the list of
        string fields produced by ``csv.reader``.
    """
    # The context manager closes the handle as soon as parsing finishes (or
    # fails); previously the file was left open until garbage collection.
    with open(filename) as open_file:
        return [row for row in reader(open_file) if row]

In [2]:
# Converting String column values to float values
def convert_str_to_float(dataset,column):
    """Replace the value at ``column`` in every row with its float conversion.

    The rows are modified in place and nothing is returned.

    Args:
        dataset: Iterable of mutable rows (lists).
        column: Index of the field to convert in each row.
    """
    for record in dataset:
        raw_value = record[column]
        record[column] = float(raw_value)

In [3]:
# Using Statistics finding min and max values for each attribute
def minmax(dataset):
    """Compute the smallest and largest value of every column.

    The number of columns is taken from the first row.

    Args:
        dataset: Non-empty list of rows that can be indexed by column.

    Returns:
        A list with one ``[min_value, max_value]`` pair per column.
    """
    column_count = len(dataset[0])
    # Lazily materialise one column at a time, in column order.
    columns = ([record[idx] for record in dataset] for idx in range(column_count))
    return [[min(values), max(values)] for values in columns]

In [4]:
# Using Statistics Normalizing the dataset
def normalize_scale(dataset,minmax):
    """Rescale every value in place with min-max normalisation.

    Each value becomes ``(value - min) / (max - min)`` using the bounds of
    its column. Nothing is returned; the rows of ``dataset`` are mutated.

    Args:
        dataset: List of mutable rows holding numeric values.
        minmax: Per-column ``[min_value, max_value]`` pairs, indexed like
            the columns of each row.
    """
    for row in dataset:
        for i in range(len(row)):
            lower = minmax[i][0]
            span = minmax[i][1] - lower
            if span:
                row[i] = (row[i] - lower) / span
            else:
                # A constant column has max == min; dividing would raise
                # ZeroDivisionError, so map the column to 0.0 instead.
                row[i] = 0.0

In [5]:
# Using Statistics building Model Accuracy using KFold Cross validation technique
from random import seed
from random import randrange
def KFold(dataset,folds):
    """Randomly split ``dataset`` into ``folds`` equally sized groups.

    Rows are drawn without replacement from a shallow copy, so the caller's
    list is left untouched. Each group holds ``int(len(dataset)/folds)``
    rows; any rows left over after filling the groups are not used.

    Args:
        dataset: List of rows to partition.
        folds: Number of groups to create.

    Returns:
        A list containing ``folds`` lists of rows.
    """
    pool = list(dataset)
    rows_per_fold = int(len(dataset)/folds)
    # Each pick removes one random row from the pool, so a row can only
    # ever land in a single fold.
    return [
        [pool.pop(randrange(len(pool))) for _pick in range(rows_per_fold)]
        for _fold in range(folds)
    ]

In [6]:
# Using Statistics Building Model Accuracy metrics prediction using classification accuracy technique
def accuracy_metrics(actual,predicted):
    """Return the percentage of positions where the prediction matches.

    Args:
        actual: Sequence of expected values.
        predicted: Sequence of predicted values; it is indexed at every
            position of ``actual``, so it must be at least as long.

    Returns:
        The share of matching positions as a float between 0.0 and 100.0.
        An empty ``actual`` yields 0.0 instead of raising
        ZeroDivisionError.
    """
    total = len(actual)
    if total == 0:
        # Nothing to score (e.g. an empty fold); avoid dividing by zero.
        return 0.0
    correct = sum(1 for i in range(total) if actual[i] == predicted[i])
    return correct/float(total)*100.0

In [8]:
# Predict the model output (a probability) for one row
from math import exp
def predict(row,coeff):
    """Evaluate the logistic model for a single row.

    The linear activation is ``coeff[0]`` plus the sum of
    ``coeff[i+1] * row[i]`` over every element of ``row`` except the last
    one, which is never read. The activation is then passed through the
    sigmoid ``1 / (1 + exp(-activation))``.

    Args:
        row: Sequence of feature values followed by one trailing slot.
        coeff: Intercept followed by one weight per feature.

    Returns:
        A float between 0.0 and 1.0.
    """
    yhat = coeff[0]
    for i in range(len(row)-1):
        yhat += coeff[i+1]*row[i]
    try:
        return 1.0/(1.0+exp(-yhat))
    except OverflowError:
        # exp() overflows once -yhat exceeds roughly 709; the true sigmoid
        # is then below the smallest positive float, so 0.0 is the correct
        # result rather than an exception.
        return 0.0

In [9]:
# Find coefficient from train data using stochastic gradient descent technique
def coefficient_sgd(train,learning_rate,epoch):
    """Estimate logistic regression coefficients with stochastic gradient descent.

    Every coefficient starts at 0.0. For each epoch and each training row
    the prediction error ``row[-1] - yhat`` is scaled by
    ``learning_rate * yhat * (1 - yhat)`` and added to the intercept, and
    added to each weight after multiplying by the matching feature value.

    Args:
        train: Non-empty list of rows; the last element of each row is the
            target value and the preceding elements are the features.
        learning_rate: Step size applied to every update.
        epoch: Number of full passes over ``train``.

    Returns:
        A list holding the intercept followed by one weight per feature.

    Raises:
        ValueError: If ``train`` is empty, because the number of
            coefficients cannot be determined.
    """
    if not train:
        raise ValueError("train must contain at least one row")
    # Size the coefficient vector from the training rows themselves; the
    # previous version read a module-level ``dataset`` variable, which only
    # worked when that global happened to exist with the same width.
    coeff = [0.0 for _ in range(len(train[0]))]
    for _ in range(epoch):
        for row in train:
            yhat = predict(row,coeff)
            error = row[-1]-yhat
            # Shared factor of every update for this row.
            step = learning_rate*error*yhat*(1.0-yhat)
            coeff[0] = coeff[0]+step
            for i in range(len(row)-1):
                coeff[i+1] = coeff[i+1]+step*row[i]
    return coeff

In [10]:
# Evaluate Model accuracy
def evaluate_model(dataset,algorithm,folds,*args):
    """Score ``algorithm`` with k-fold cross validation.

    The dataset is split with ``KFold``. Each fold in turn is held out: the
    remaining folds are concatenated into the training set, and copies of
    the held-out rows with their last element replaced by ``None`` form the
    test set. The predictions returned by ``algorithm`` are compared with
    the held-out rows' real last elements using ``accuracy_metrics``.

    Args:
        dataset: List of rows whose last element is the target value.
        algorithm: Callable invoked as ``algorithm(train, test, *args)``
            that returns one prediction per test row.
        folds: Number of folds to split the dataset into.
        *args: Extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy score per fold.
    """
    partitions = KFold(dataset,folds)
    scores = []
    for held_out in partitions:
        others = list(partitions)
        others.remove(held_out)
        training_rows = [row for part in others for row in part]
        masked_rows = []
        for row in held_out:
            # Copy before blanking the target so the original row keeps it.
            blanked = list(row)
            blanked[-1] = None
            masked_rows.append(blanked)
        predictions = algorithm(training_rows,masked_rows,*args)
        expected = [row[-1] for row in held_out]
        scores.append(accuracy_metrics(expected,predictions))
    return scores

In [11]:
# Logistic regression Algorithm 
def logisitic_regression_classifier(train,test,learning_rate,epoch):
    """Fit coefficients on ``train`` and classify every row of ``test``.

    Coefficients come from ``coefficient_sgd``; each test row's output from
    ``predict`` is rounded to obtain the predicted class.

    Args:
        train: Rows used to fit the coefficients.
        test: Rows to classify.
        learning_rate: Step size passed to ``coefficient_sgd``.
        epoch: Number of training passes passed to ``coefficient_sgd``.

    Returns:
        A list with one rounded prediction per test row, in order.
    """
    weights = coefficient_sgd(train,learning_rate,epoch)
    return [round(predict(sample,weights)) for sample in test]

In [21]:
# Train and evaluate the logistic regression classifier on the Pima Indians diabetes dataset
# Seed the random module so the randrange-based fold assignment is repeatable
seed(1)
filename='pima-indians-diabetes.csv'
dataset=load_csv(filename)
# load_csv returns string fields; convert every column, including the last one, to float
for column in range(len(dataset[0])):
    convert_str_to_float(dataset,column)
# Min-max rescale every column in place using that column's own bounds
data_minmax=minmax(dataset)
normalize_scale(dataset,data_minmax)
# Settings forwarded to the classifier (learning rate, epochs) and to the cross validation (folds)
l_rate=0.1
epoch=100
folds=5
# evaluate_model returns one accuracy percentage per fold
accuracy=evaluate_model(dataset,logisitic_regression_classifier,folds,l_rate,epoch)

In [22]:
# Show the accuracy percentage obtained on each cross-validation fold
print(accuracy)

[73.20261437908496, 75.81699346405229, 75.81699346405229, 83.66013071895425, 78.43137254901961]


In [23]:
# Average of the per-fold accuracies.
# NOTE(review): the recorded output is a tuple, which suggests this cell was run
# where print is a statement (Python 2) and the parentheses built a tuple.
print("mean_accuracy:",(sum(accuracy)/float(len(accuracy))))

('mean_accuracy:', 77.38562091503267)
