# Logistic Regression with Numpy Implementation

## 1. Introduction

Logistic regression is one of the most fundamental machine learning models for binary classification. I will summarize its methodology and implement it from scratch using NumPy.

The problem we solve is **binary classification,** for example, the doctor would like to base on patients's features, including mean radius, mean texture, etc, to classify breat cancer into one of the following two case:

- "malignant":  𝑦=1 
- "benign":  𝑦=0 

which correspond to serious and gentle case respectively.

We will load the breast cancer data from scikit-learn as a toy dataset, and split the data into the training and test datasets.

## 2. Logistic Regression Model

[To be continued.]

## 3. Numpy Implementation of Logistic Regressio

In [1]:
# https://github.com/bowen0701/machine-learning/blob/master/logistic_regression.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random

import numpy as np

np.random.seed(71)


class LogisticRegression(object):
    """Numpy implementation of Logistic Regression."""
    def __init__(self, batch_size=10, lr=0.01, n_epochs=5):
        self._batch_size = batch_size
        self._lr = lr
        self._n_epochs = n_epochs

    def _sigmoid(self, z):
        def f(x):
            if x < 0:
                return np.exp(x) / (1 + np.exp(x))
            else:
                return 1 / (1 + np.exp(-x))
        return np.array(list(map(f, z)))

    def _logreg(self, X, w, b):
        return self._sigmoid(np.dot(X, w) + b)

    def _cross_entropy(self, y_hat, y, eps=1e-7):
        # To avoid overflow in log, add epsilon = 1E-7.
        return - np.mean(y * np.log(y_hat + eps) + (1 - y) * np.log(1 - y_hat + eps))

    def _weights_init(self):
        w = np.zeros(self._n_inputs).reshape(self._n_inputs, 1)
        b = np.zeros(1).reshape(1, 1)
        return w, b

    def _sgd(self, X, y, w, b):
        m = X.shape[0]

        y_hat = self._logreg(X, w, b) 
        dw = - 1 / m * np.matmul(X.T, y - y_hat)
        db = - np.mean(y - y_hat)
        
        for (param, grad) in zip([w, b], [dw, db]):
            param[:] = param - self._lr * grad

    def _data_iter(self):
        idx = list(range(self._n_examples))
        random.shuffle(idx)
        for i in range(0, self._n_examples, self._batch_size):
            idx_batch = np.array(
                idx[i:min(i + self._batch_size, self._n_examples)])
            yield (self._X_train.take(idx_batch, axis=0), 
                   self._y_train.take(idx_batch, axis=0))

    def fit(self, X_train, y_train):
        self._X_train = X_train
        self._y_train = y_train
        self._n_examples, self._n_inputs = X_train.shape

        logreg = self._logreg
        loss = self._cross_entropy
        w, b = self._weights_init()

        for epoch in range(self._n_epochs):
            for step, (X, y) in enumerate(self._data_iter()):
                y = y.reshape((y.shape[0], -1))
                self._sgd(X, y, w, b)
            train_loss = loss(logreg(X, w, b), y)
            if epoch % 10 == 0:
                print('epoch {0}: loss {1}'.format(epoch + 1, train_loss))

        self._logreg = logreg
        self._w, self._b = w, b
        return self

    def get_coeff(self):
        return self._b, self._w.reshape((-1,))

    def predict(self, X_test):
        return self._logreg(X_test, self._w, self._b).reshape((-1,))

## 4. Data Preparation and Preprocessing

In [2]:
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression as LogisticRegressionSklearn

# https://github.com/bowen0701/machine-learning/blob/master/numpy_metrics.py
from numpy_metrics import accuracy

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
# Read breast cancer data.
X, y = load_breast_cancer(return_X_y=True)

In [5]:
X.shape, y.shape

((569, 30), (569,))

In [6]:
X[:3]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
        2.250e-02, 4.571e-03, 2.357e

In [7]:
y[:3]

array([0, 0, 0])

In [8]:
# Split data into training and test datasets.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=71, shuffle=True, stratify=y)

In [9]:
print(X_train_raw.shape, y_train.shape)
print(X_test_raw.shape, y_test.shape)

(426, 30) (426,)
(143, 30) (143,)


In [10]:
# Feature engineering for standardizing features by min-max scaler.
min_max_scaler = MinMaxScaler()

X_train = min_max_scaler.fit_transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

## 4. Fitting Logistic Regression

In [11]:
# Fit our Logistic Regression.
clf = LogisticRegression(batch_size=100, lr=10, n_epochs=100)

clf.fit(X_train, y_train)

epoch 1: loss 2.432988683901155
epoch 11: loss 0.08269506633173168
epoch 21: loss 0.02205420752252182
epoch 31: loss 0.043176617239112924
epoch 41: loss 0.3093572198198679
epoch 51: loss 0.10770636028790158
epoch 61: loss 0.029891748379843333
epoch 71: loss 0.02073525084060601
epoch 81: loss 0.05118910235934664
epoch 91: loss 0.01949032998807695


<__main__.LogisticRegression at 0x7fc0fa8d4048>

In [12]:
# Get coefficient.
clf.get_coeff()

(array([[16.37297703]]),
 array([-1.22119274, -3.19738366, -1.32460176, -2.82844307, -1.02977342,
         1.05390748, -4.41262344, -6.73851879, -1.45374884,  3.14150419,
        -7.11112612, -0.52413638, -5.56553335, -4.46050678,  1.55378861,
         3.95417232,  1.93726362,  1.38320829,  2.75462555,  2.93060893,
        -5.2695596 , -4.95495259, -4.70659005, -5.21991334, -3.38171848,
        -0.77473968, -3.70639665, -5.12344415, -2.59490509, -0.86470708]))

In [13]:
# Predicted probabilities for training data.
p_pred_train = clf.predict(X_train)
p_pred_train[:3]

array([9.97464581e-01, 4.58526243e-12, 2.56514901e-04])

In [14]:
# Predicted labels for training data.
y_pred_train = (p_pred_train > 0.5) * 1
y_pred_train[:3]

array([1, 0, 0])

In [15]:
# Predicted label correctness for training data.
# y_pred_train == y_train

In [16]:
# Prediction accuracy for training data.
accuracy(y_train, y_pred_train)

0.9765258215962441

In [17]:
# Predicted label correctness for test data.
p_pred_test = clf.predict(X_test)
y_pred_test = (p_pred_test > 0.5) * 1

# y_pred_test == y_test

In [18]:
# Prediction accuracy for test data.
accuracy(y_test, y_pred_test)

0.958041958041958

## 5. Fitting Sklearn's Logistic Regression as Benchmark

In [19]:
# Fit sklearn's Logistic Regression.
clf2 = LogisticRegressionSklearn(C=1e4, solver='lbfgs', max_iter=500)

clf2.fit(X_train, y_train)

LogisticRegression(C=10000.0, max_iter=500)

In [20]:
# Get coefficients.
clf2.intercept_, clf2.coef_

(array([56.06250509]),
 array([[  53.5460616 ,  -27.2575739 ,   48.30697654,   10.5636878 ,
          -14.75837806,   98.5009966 ,  -52.51936527,  -52.16906591,
           -5.08742246,  -53.96348797,  -33.97198842,   -5.48905184,
          -19.38885928,  -43.89981909,   38.75665922,  -51.43678914,
           83.21007672,  -21.89925037,   14.96797392,   79.99757062,
          -59.04206865,   -3.91791317,  -63.58395555, -103.96747709,
           -7.9699581 ,   20.04904076,  -21.96650031,  -21.30939901,
          -21.55187209,  -11.69936363]]))

In [21]:
# Predicted labels for training data.
y_pred_train = clf2.predict(X_train)
y_pred_train[:3]

array([1, 0, 0])

In [22]:
# Predicted label correctness for training data.
# y_pred_train == y_train

In [23]:
# Prediction accuracy for training data.
accuracy(y_train, y_pred_train)

1.0

In [24]:
# Predicted label correctness for test data.
y_pred_test = clf2.predict(X_test) 
# y_pred_test == y_test

In [25]:
# # Prediction accuracy for test data.
accuracy(y_test, y_pred_test)

0.965034965034965