# Chapter 09 - Logistic Regression

$ylin = b_0 + b_1 * x_1$

$yhat = \frac{1.0}{1.0 + e^(-ylin)}$

In [1]:
from random import seed
from math import exp

from Codes.ch01_load_and_convert_data import load_csv, str_column_to_float
from Codes.ch02_scale_data_functions import dataset_minmax, normalize_dataset
from Codes.ch03_resampling_methods import cross_validation_split
from Codes.ch06_algorithm_test_harnesses import evaluate_algorithm_kfold

### Making predictions

In [2]:
# Make a prediction with coefficients
def predict(row, coefficients):
    yhat = coefficients[0]
    for i in range(len(row)-1):
        yhat += coefficients[i+1] * row[i]
    return 1.0/(1.0 + exp(-yhat))

In [3]:
# Test predictions
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
coef = [-0.406605464, 0.852573316, -1.104746259]
for row in dataset:
    yhat = predict(row, coef)
    print("Expected=%.3f, Predicted=%.3f [%d]" % (row[-1], yhat, round(yhat)))

Expected=0.000, Predicted=0.299 [0]
Expected=0.000, Predicted=0.146 [0]
Expected=0.000, Predicted=0.085 [0]
Expected=0.000, Predicted=0.220 [0]
Expected=0.000, Predicted=0.247 [0]
Expected=1.000, Predicted=0.955 [1]
Expected=1.000, Predicted=0.862 [1]
Expected=1.000, Predicted=0.972 [1]
Expected=1.000, Predicted=0.999 [1]
Expected=1.000, Predicted=0.905 [1]


### Estimating Coefficients

$b_0(t+1) = b_0(t) + LR * (y(t) - yhat(t)) * yhat(t) * (1-yhat(y))$
$b_n(t+1) = b_n(t) + LR * (y(t) - yhat(t)) * yhat(t) * (1-yhat(y)) * x_n(t)$

In [4]:
# Estimate logistic regression coefficients using stochastic gradient descent
def coefficients_sgd(train, l_rate, n_epoch):
    coef = [0.0 for i in range(len(train[0]))]
    for epoch in range(n_epoch):
        sum_error = 0
        for row in train:
            yhat = predict(row, coef)
            error = row[-1] - yhat
            sum_error += error**2
            for i in range(len(row)-1):
                coef[i+1] = coef[i+1] + l_rate * error * yhat * (1.0 - yhat) * row[i]
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
    return coef  

In [5]:
# Calculate coefficients
dataset = [[2.7810836,2.550537003,0],
[1.465489372,2.362125076,0],
[3.396561688,4.400293529,0],
[1.38807019,1.850220317,0],
[3.06407232,3.005305973,0],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]]
l_rate = 0.3
n_epoch = 100
coef = coefficients_sgd(dataset, l_rate, n_epoch)
print(coef)

>epoch=0, lrate=0.300, error=2.228
>epoch=1, lrate=0.300, error=1.642
>epoch=2, lrate=0.300, error=1.161
>epoch=3, lrate=0.300, error=0.880
>epoch=4, lrate=0.300, error=0.678
>epoch=5, lrate=0.300, error=0.544
>epoch=6, lrate=0.300, error=0.458
>epoch=7, lrate=0.300, error=0.396
>epoch=8, lrate=0.300, error=0.349
>epoch=9, lrate=0.300, error=0.312
>epoch=10, lrate=0.300, error=0.282
>epoch=11, lrate=0.300, error=0.256
>epoch=12, lrate=0.300, error=0.235
>epoch=13, lrate=0.300, error=0.216
>epoch=14, lrate=0.300, error=0.201
>epoch=15, lrate=0.300, error=0.187
>epoch=16, lrate=0.300, error=0.175
>epoch=17, lrate=0.300, error=0.164
>epoch=18, lrate=0.300, error=0.154
>epoch=19, lrate=0.300, error=0.146
>epoch=20, lrate=0.300, error=0.138
>epoch=21, lrate=0.300, error=0.131
>epoch=22, lrate=0.300, error=0.125
>epoch=23, lrate=0.300, error=0.119
>epoch=24, lrate=0.300, error=0.114
>epoch=25, lrate=0.300, error=0.109
>epoch=26, lrate=0.300, error=0.105
>epoch=27, lrate=0.300, error=0.101
>e

### Pima Indians Diabetes Case Study

In [6]:
# Logistic Regression Algorithm with Stochastic Gradient Descent
def logistic_regression(train, test, l_rate, n_epoch):
    predictions = list()
    coef = coefficients_sgd(train, l_rate, n_epoch)
    for row in test:
        yhat = predict(row, coef)
        yhat = round(yhat)
        predictions.append(yhat)
    return predictions

In [10]:
# Test the logistic regression algorithm on the diabetes dataset
seed(1)

# load and prepare data
filename = './data/pima-indians-diabetes.csv'
dataset = load_csv(filename)
for i in range(len(dataset[0])):
    str_column_to_float(dataset, i)

# normalize
minmax = dataset_minmax(dataset)
normalize_dataset(dataset, minmax)

# evaluate algorithm
n_folds = 5
l_rate = 0.1
n_epoch = 100
scores = evaluate_algorithm_kfold(dataset, logistic_regression, n_folds, l_rate, n_epoch)
print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

>epoch=0, lrate=0.100, error=144.599
>epoch=1, lrate=0.100, error=141.031
>epoch=2, lrate=0.100, error=138.717
>epoch=3, lrate=0.100, error=136.844
>epoch=4, lrate=0.100, error=135.332
>epoch=5, lrate=0.100, error=134.101
>epoch=6, lrate=0.100, error=133.090
>epoch=7, lrate=0.100, error=132.252
>epoch=8, lrate=0.100, error=131.552
>epoch=9, lrate=0.100, error=130.962
>epoch=10, lrate=0.100, error=130.461
>epoch=11, lrate=0.100, error=130.034
>epoch=12, lrate=0.100, error=129.666
>epoch=13, lrate=0.100, error=129.349
>epoch=14, lrate=0.100, error=129.073
>epoch=15, lrate=0.100, error=128.832
>epoch=16, lrate=0.100, error=128.621
>epoch=17, lrate=0.100, error=128.435
>epoch=18, lrate=0.100, error=128.271
>epoch=19, lrate=0.100, error=128.125
>epoch=20, lrate=0.100, error=127.995
>epoch=21, lrate=0.100, error=127.880
>epoch=22, lrate=0.100, error=127.776
>epoch=23, lrate=0.100, error=127.683
>epoch=24, lrate=0.100, error=127.599
>epoch=25, lrate=0.100, error=127.524
>epoch=26, lrate=0.100

## Future Works

* Tune The Example. Tune the learning rate, number of epochs and even data preparation
method to get an improved score on the dataset.
* Batch Stochastic Gradient Descent. Change the stochastic gradient descent algorithm
to accumulate updates across each epoch and only update the coefficients in a batch at
the end of the epoch.
* Additional Classification Problems. Apply the technique to other binary (2 class)
classification problems on the UCI machine learning repository.