In [2]:
import os
import numpy as np

In [3]:
# Locate the data file in the directory the notebook is being run from.
path = os.path.join(os.getcwd(), "logistic_data.txt")
with open(path) as datafile:
    # Keep every raw text line (trailing newline included); parsing happens later.
    data = list(datafile)

The function `formatData()` parses each comma-separated line of the data into floats and prepends a constant 1 (the bias term), producing the rows of the data matrix.

In [4]:
def formatData(data, matrix):
    """Parse comma-separated text lines into numeric rows with a leading bias term.

    Parameters
    ----------
    data : iterable of str
        Raw text lines; every non-blank line holds comma-separated numbers.
    matrix : list
        List that receives the parsed rows. It is extended in place.

    Returns
    -------
    list
        The same ``matrix`` object, with one ``[1, v1, v2, ...]`` row appended
        for each non-blank line of ``data``.

    Raises
    ------
    ValueError
        If a field on a non-blank line cannot be converted with ``float``.
    """
    for line in data:
        line = line.strip()
        if not line:
            # A blank line (e.g. an empty last line in the file) holds no
            # sample; passing it on would end in float("") and raise.
            continue
        row = [1]  # constant feature that multiplies the intercept parameter
        row.extend(float(field) for field in line.split(","))
        matrix.append(row)
    return matrix

# Build the row-per-sample table from the raw lines, starting from an empty list.
matrix = formatData(data, [])

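Creating X, y, and $\theta$ from the data matrix. Here,
- X is the design matrix (one row per sample, including the leading column of ones)
- y is the column vector of target values
- $\theta$ is the row vector of parameters, initialised to zeros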

In [5]:
# Convert first, then read the width from .shape: this works whether `matrix`
# is still the list of rows or already an np.matrix, so the cell can be re-run.
# (len(matrix[0]) on an np.matrix is 1, because matrix[0] is a 1 x n matrix.)
matrix = np.matrix(matrix)
cols = matrix.shape[1]

# Every column except the last is a feature (the first one is the bias column);
# the last column holds the target values.
X = matrix[:, :cols - 1]
y = matrix[:, cols - 1:]

# One parameter per column of X, stored as a 1 x n row so that X * theta.T
# yields one value per sample.
theta = np.matrix(np.zeros(X.shape[1]))

Defining the sigmoid function.

In [6]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), applied element-wise.

    ``x`` may be a scalar or a NumPy array/matrix; the result has the shape
    that NumPy broadcasting gives for the input.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

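Defining the cost function for logistic regression, evaluated at a particular $\theta$:

$$J(\theta) = -\frac{1}{m}\sum_{i=1}^{m}\left[y^{(i)}\log h_\theta(x^{(i)}) + (1-y^{(i)})\log\left(1-h_\theta(x^{(i)})\right)\right]$$

where $h_\theta(x)$ is the sigmoid of $\theta^T x$ and $m$ is the number of samples.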

In [7]:
def computeCost(X, y, theta):
    """Return the mean logistic-regression (cross-entropy) cost at ``theta``.

    The inputs are expected to be ``np.matrix`` objects, because ``*`` is used
    as a matrix product: ``X`` is m x n, ``y`` is m x 1 and ``theta`` is 1 x n.

    Parameters
    ----------
    X : np.matrix
        Design matrix, one row per sample.
    y : np.matrix
        Target values (0 or 1), one row per sample.
    theta : np.matrix
        Parameters as a single row.

    Returns
    -------
    float
        Sum of the per-sample costs divided by the number of rows of ``X``.
    """
    z = X * theta.T
    # With h = sigmoid(z):
    #   -y*log(h) - (1 - y)*log(1 - h)  ==  log(1 + e^z) - y*z
    # The right-hand side never evaluates log(0), so a saturated prediction
    # (h rounding to exactly 0 or 1) no longer turns the cost into nan/inf.
    per_sample = np.logaddexp(0, z) - np.multiply(y, z)
    return np.sum(per_sample) / len(X)

In [8]:
# Cost at the initial theta, evaluated before any gradient-descent update.
computeCost(X, y, theta)

0.6931471805599453

Function to perform the gradient descent algorithm. Here,
- alpha is the learning rate of the algorithm
- iter_count is the number of iterations made by the algorithm.  

The function returns the final optimised $\theta$ and an array holding the cost after each iteration.

In [9]:
def gradientDescent(X, y, theta, alpha, iter_count):
    """Fit logistic-regression parameters with batch gradient descent.

    The inputs are expected to be ``np.matrix`` objects, because ``*`` is used
    as a matrix product: ``X`` is m x n, ``y`` is m x 1 and ``theta`` is 1 x n.

    Parameters
    ----------
    X : np.matrix
        Design matrix, one row per sample.
    y : np.matrix
        Target values, one row per sample.
    theta : np.matrix
        Starting parameters as a single row. The object passed in is not
        modified; every update builds a new matrix.
    alpha : float
        Learning rate.
    iter_count : int
        Number of update steps to perform.

    Returns
    -------
    theta : np.matrix
        Parameters after ``iter_count`` updates (the input object itself when
        ``iter_count`` is 0).
    cost : np.ndarray
        Array of length ``iter_count``; ``cost[i]`` is the cost computed right
        after update ``i``.
    """
    cost = np.zeros(iter_count)
    step = alpha / len(X)  # learning rate divided by the number of samples
    for i in range(iter_count):
        error = sigmoid(X * theta.T) - y
        # error.T * X is a 1 x n row whose j-th entry is sum(error * X[:, j]),
        # i.e. all per-parameter gradient sums at once. Building a fresh theta
        # also avoids sharing one buffer between the old and new parameters.
        theta = theta - step * (error.T * X)
        cost[i] = computeCost(X, y, theta)
    return theta, cost

Executing gradient descent.  
iter_count >= 20000 gives good results.  
A better approach is to use an optimiser from `scipy.optimize`.


In [9]:
# Settings for the run: learning rate and number of update steps.
alpha = 1e-3
iter_count = 200_000

# g receives the fitted parameters, cost the per-iteration cost values.
g, cost = gradientDescent(X, y, theta, alpha, iter_count)
print(g)

[[-7.45017822  0.06550395  0.05898701]]


In [10]:
# Cost at g, the parameters returned by gradientDescent.
computeCost(X, y, g)

0.31655433520912246

Defining a `predict()` function that predicts the class (0 or 1) for the given inputs.

In [11]:
def predict(X, theta):
    """Return a list with one 0/1 label per row of ``X``.

    The sigmoid of ``X * theta.T`` is computed for every row; a value of at
    least 0.5 gives label 1, anything lower gives label 0. ``X`` and ``theta``
    are expected to be ``np.matrix`` objects so that ``*`` is a matrix product.
    """
    labels = []
    for p in sigmoid(X * theta.T):
        if p >= 0.5:
            labels.append(1)
        else:
            labels.append(0)
    return labels

Using the `predict()` function to find out the final accuracy of our learning algorithm

In [1]:
g = np.matrix(g)  # make sure the parameters are a 1 x n matrix for predict()
predictions = predict(X, g)

# Flatten the label column so each prediction is compared with a plain scalar.
labels = np.asarray(y).ravel()
correct = [1 if predicted == actual else 0 for predicted, actual in zip(predictions, labels)]

# Share of matching labels as a percentage. This must be a division:
# the modulo operator would give a remainder (0 when every prediction is right).
accuracy = 100 * sum(correct) / len(correct)
print(f'accuracy = {accuracy}%')

NameError: name 'np' is not defined