# Logistic Regression implementation from scratch

In [1]:
import numpy as np

class LogisticRegression:
  def __init__(self, X, y, learning_rate, max_iterations, tolerance):
    self.X = X
    self.y = np.expand_dims(y, axis=1)       
    self.beta = np.random.rand(self.X.shape[1], 1) / 10.0
    self.bias = 0
    self.lr = learning_rate
    self.max_iter = max_iterations
    self.n_obs = X.shape[0]
    self.n_features = X.shape[1]
    self.tol = tolerance

  def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))      

  def yhat(self, beta, bias):
    yhat = self.sigmoid(np.matmul(self.X, beta) + bias)
    return yhat

  def loss(self, beta, bias):
    yhat = self.yhat(beta, bias)
    loss = -1.0 / self.n_obs * (np.sum(self.y * np.log(yhat) + (1.0 - y) * np.log(1.0 - yhat)))
    return loss

  def gradient_descent(self, beta, bias):
    yhat = self.yhat(beta, bias)
    dLossdBeta = 1.0 / self.n_obs * np.matmul(self.X.T, (yhat - self.y))
    dLossdBias = (1.0 / self.n_obs * np.sum(yhat - self.y))
    return dLossdBeta, dLossdBias

  def train(self):
    for i in range(self.max_iter):
      beta_old = self.beta
      bias_old = self.bias
      dLossdBeta, dLossdBias = self.gradient_descent(self.beta, self.bias)
      self.beta = self.beta - self.lr * dLossdBeta
      self.bias = self.bias - self.lr * dLossdBias
      diff = abs(self.beta - beta_old)
      diff_over_threshold = diff > self.tol * self.beta
      if not diff_over_threshold.any():
        loss = self.loss(self.beta, self.bias)
        print(f"Converged!!! Final loss: {loss}")
        break  
      if i % 10000 == 0:
        loss = self.loss(self.beta, self.bias)
        print(f"Loss at iteration {i} is {loss}")
    print("Bias = ", self.bias)
    print("Beta = ", np.squeeze(self.beta, axis=1))

  def predict(self, X):
    yhat = np.round_(self.sigmoid(np.matmul(X, self.beta) + self.bias))
    return yhat

## Test1 using Generated Data

In [2]:
from sklearn.datasets import make_blobs

In [3]:
X, y = make_blobs(n_samples=5000, centers=2)

#Split in test train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [4]:
lr = LogisticRegression(X_train, y_train, 0.001, 100000, 1e-4)

In [5]:
lr.train()

Loss at iteration 0 is 4017.409934738068
Loss at iteration 10000 is 9987.996560420366
Loss at iteration 20000 is 11016.663977314098
Loss at iteration 30000 is 11620.480629522288
Loss at iteration 40000 is 12049.967413850016
Loss at iteration 50000 is 12383.7666260662
Loss at iteration 60000 is 12656.954344243035
Loss at iteration 70000 is 12888.262601000411
Loss at iteration 80000 is 13088.884038388913
Loss at iteration 90000 is 13266.044694403141
Bias =  -0.07362037909603185
Beta =  [ 0.48200607 -0.95192122]


In [6]:
y_pred = lr.predict(X_test)
y_test1 = np.expand_dims(y_test, axis=1)  

print(f"Accuracy: {np.sum(y_test1==y_pred)/X_test.shape[0]}")

Accuracy: 1.0


#Test 2 

In [7]:
#Make a fake dataset
from sklearn import datasets
(X, y) = datasets.make_classification(n_samples=10000, n_features=7, n_informative=5, n_redundant=2)

In [8]:
#Split in test train split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [9]:
lr = LogisticRegression(X_train, y_train, 0.00001, 50000, 1e-4)

In [10]:
lr.train()

Loss at iteration 0 is 6742.95793669236
Loss at iteration 10000 is 6403.216906726441
Loss at iteration 20000 is 6408.2228026918665
Loss at iteration 30000 is 6515.635248604295
Loss at iteration 40000 is 6651.9667605796385
Bias =  -0.007143715517786395
Beta =  [ 0.10946463  0.16638967 -0.0250479  -0.05617942 -0.09759045  0.31229916
  0.04734305]


In [11]:
y_pred = lr.predict(X_test)
y_test1 = np.expand_dims(y_test, axis=1)  

print(f"Accuracy: {np.sum(y_test1==y_pred)/X_test.shape[0]}")

Accuracy: 0.8675
