# Logistic regression using Stochastic Gradient Descent

In [1]:
#Import necessary functions
import pandas as pd
import numpy as np

from math import exp
from csv import reader
from random import seed, randrange

In [7]:
# function to split a given dataset into a certain number of folds

def cv_split(data, no_of_folds): # cross-validation split
    """Randomly partition ``data`` into ``no_of_folds`` equally sized folds.

    Rows are drawn without replacement from a shallow copy of ``data``, so
    the caller's sequence is left untouched. Every fold holds
    ``int(len(data) / no_of_folds)`` rows; rows left over when the length
    is not an exact multiple of the fold count are not placed in any fold.

    Returns a list of folds, each fold being a list of rows.
    """
    pool = list(data)  # rows not yet assigned to a fold
    rows_per_fold = int(len(data) / no_of_folds)

    folds = []
    for _ in range(no_of_folds):
        # Pull one random row out of the pool for each slot in this fold.
        picked = [pool.pop(randrange(len(pool))) for _ in range(rows_per_fold)]
        folds.append(picked)
    return folds

In [12]:
# function to calculate the accuracy percentage

def accuracy(actual, predicted):
    """Return the percentage of predictions that equal the actual values.

    Args:
        actual: sequence of true labels.
        predicted: sequence of predicted labels, position-aligned with
            ``actual``.

    Returns:
        A float between 0.0 and 100.0. Empty input yields 0.0 rather than
        dividing by zero.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(actual) != len(predicted):
        raise ValueError("actual and predicted must have the same length")
    if not actual:
        return 0.0

    no_of_correct_values = sum(
        1 for truth, guess in zip(actual, predicted) if guess == truth
    )
    return no_of_correct_values * 100.0 / float(len(actual)) # percentage of correct values, accuracy

In [13]:
# function for the evaluation of an algorithm, using cv_split

def eval(algorithm, no_of_folds, data, *args):
    """Score an algorithm with k-fold cross-validation.

    NOTE: this name shadows the built-in ``eval``; it is kept unchanged so
    existing callers continue to work.

    Args:
        algorithm: callable invoked as ``algorithm(train, test, *args)``
            that returns one prediction per row of ``test``.
        no_of_folds: number of folds to split ``data`` into.
        data: sequence of rows; the last element of each row is the label.
        *args: extra positional arguments forwarded to ``algorithm``.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cv_split(data, no_of_folds) # split the dataset into a given number of folds
    scores = list()

    for held_out, fold in enumerate(folds):
        # Train on every row outside the held-out fold. The fold is excluded
        # by position, so two folds with equal contents are not confused.
        train = [
            row
            for index, other in enumerate(folds)
            if index != held_out
            for row in other
        ]

        test = list()
        actual = list()
        for row in fold:
            row_copy = list(row)
            row_copy[-1] = None  # hide the label from the algorithm
            test.append(row_copy)
            actual.append(row[-1])  # keep the true label of every test row

        predicted = algorithm(train, test, *args)

        scores.append(accuracy(actual, predicted))
    return scores
    

In [15]:
# function to make a prediction, given coefficients

def predict(row, coefs):
    """Return the logistic-regression output for one row.

    Args:
        row: sequence whose last element is the label slot; that element
            is never read, so it may be ``None``. All preceding elements
            are the numeric features.
        coefs: ``coefs[0]`` is the intercept and ``coefs[i + 1]`` is the
            weight applied to ``row[i]``.

    Returns:
        The sigmoid of the linear combination, a float in [0.0, 1.0].
    """
    y_hat = coefs[0] # initialize y_hat

    for i in range(len(row)-1):
        y_hat += row[i] * coefs[i + 1]

    # Choose the sigmoid form whose exponent is never positive, so exp()
    # cannot raise OverflowError for scores of large magnitude.
    if y_hat >= 0:
        return 1.0 / (exp(-y_hat) + 1.0)
    scaled = exp(y_hat)
    return scaled / (scaled + 1.0)

In [16]:
# function to estimate the logistic regression coefficients using SGD

def SGD_coefs(train, rate, no_of_epochs):
    """Estimate logistic-regression coefficients with stochastic gradient descent.

    Args:
        train: non-empty sequence of rows; the last element of each row is
            the numeric label and the preceding elements are the features.
        rate: learning rate applied to every update.
        no_of_epochs: number of full passes over ``train``.

    Returns:
        A list of ``len(train[0])`` floats: the intercept followed by one
        weight per feature.

    Raises:
        ValueError: if ``train`` is empty.
    """
    if not train:
        raise ValueError("train must contain at least one row")

    # One intercept plus one weight per feature; the label column accounts
    # for the extra slot, so the list has as many entries as a row.
    coefs = [0.0] * len(train[0])

    for _ in range(no_of_epochs):
        for row in train:
            y_hat = predict(row, coefs)
            error = row[-1] - y_hat
            # The step shared by all coefficients is computed once per row.
            step = error * rate * y_hat * (1.0 - y_hat)
            coefs[0] += step

            for i in range(len(row)-1):
                coefs[i + 1] += step * row[i]
    return coefs

In [17]:
# function to train logistic regression using SGD and predict labels for the test rows

def train_logistic_regression(train, test, rate, no_of_epochs):
    """Fit coefficients on ``train`` and return one rounded prediction per ``test`` row.

    The coefficients come from ``SGD_coefs(train, rate, no_of_epochs)``.
    Each test row is passed to ``predict`` and the result is rounded with
    the built-in ``round``, which returns an integer when called with a
    single argument.
    """
    fitted = SGD_coefs(train, rate, no_of_epochs)
    # Round each predicted value to the nearest integer.
    return [round(predict(sample, fitted)) for sample in test]