# Softmax Regression with Implementation

## Introduction

Logistic regression is one of the most fundamental machine learning models for binary classification. I will summarize its methodology and implement it in NumPy and PyTorch.

The problem we solve is **multiclass classification,** for example, we would like to enable computer vision to classify grayscale images into one of the 10 classes. 

We will load the MNIST data from scikit-learn as a toy dataset, and split the data into the training and test datasets.

## Softmax Regression Model

[To be continued.]

In [1]:
import random
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import tensorflow as tf

import sys
sys.path.append('../numpy/')
from metrics import accuracy

np.random.seed(71)

In [2]:
%load_ext autoreload
%autoreload 2

## MNIST Dataset and Preprocessing

In [3]:
import sklearn
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
# Read breast cancer data.
mnist_data = load_digits()
X, y = mnist_data.data, mnist_data.target

In [5]:
X.shape, y.shape

((1797, 64), (1797,))

In [6]:
print(mnist_data.feature_names)
X[:3]

['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']


array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
        15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
        12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
         0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
        10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.],
       [ 0.,  0.,  0., 12., 13.,  5.,  0.,  0.,  0.,  0.,  0., 11., 16.,
         9.,  0.,  0.,  0.,  0.,  3., 15., 16.,  6.,  0.,  0.,  0.,  7.,
        15., 16., 16.,  2.,  0.,  0.,  0.,  0.,  1., 16., 16.,  3.,  0.,
         0.,  0.,  0.,  1., 16., 16.,  6.,  0.,  0.,  0.,  0.,  1., 16.,
        16.,  6.,  0.,  0.,  0.,  0.,  0., 11., 16., 10.,  0.,  0.],
       [ 0.,  0.,  0.,  4., 15., 12.,  0.,  0.,  0.,  0.,  3., 16., 15.,
        14.,  0.,  0.,  0.,  0.,  8., 13.,  8., 16.,  0.,  0.,  0.,  0.,
         1.,  6., 15., 11.,  0.,  0.,  0.,  1.,  8., 13., 15.,  1.,  0.,
         0.,  0.,  9., 16., 16.,  5.,  0.,  0.,  0.,  0.,  

In [7]:
print(mnist_data.target_names)
y[:3]

[0 1 2 3 4 5 6 7 8 9]


array([0, 1, 2])

In [8]:
# Split data into training and test datasets.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=71, shuffle=True, stratify=y)

In [9]:
print(X_train_raw.shape, y_train.shape)
print(X_test_raw.shape, y_test.shape)

(1347, 64) (1347,)
(450, 64) (450,)


In [10]:
# Feature engineering for standardizing features by min-max scaler.
min_max_scaler = MinMaxScaler()

X_train = min_max_scaler.fit_transform(X_train_raw)
X_test = min_max_scaler.transform(X_test_raw)

In [11]:
# Convert arrays to float32.
X_train, X_test, y_train, y_test = (
    np.float32(X_train), np.float32(X_test), np.float32(y_train), np.float32(y_test))

In [12]:
X_train.dtype, y_train.dtype

(dtype('float32'), dtype('float32'))

## Numpy Implementation of Softmax Regression

In [13]:
class SoftmaxRegression(object):
    """Numpy implementation of Softmax Regression."""
    # TODO

    def __init__(self, batch_size=64, lr=0.01, n_epochs=1000):
        self.batch_size = batch_size
        self.lr = lr
        self.n_epochs = n_epochs

    def get_data(self, X_train, y_train, shuffle=True):
        """Get dataset and information."""
        self.X_train = X_train
        self.y_train = y_train

        # Get the numbers of examples and inputs.
        self.n_examples, self.n_inputs = self.X_train.shape

        if shuffle:
            idx = list(range(self.n_examples))
            random.shuffle(idx)
            self.X_train = self.X_train[idx]
            self.y_train = self.y_train[idx]

    def _create_weights(self):
        """Create model weights and bias."""
        self.w = np.zeros(self.n_inputs).reshape(self.n_inputs, 1)
        self.b = np.zeros(1).reshape(1, 1)

    def _logit(self, X):
        """Logit: unnormalized log probability."""
        return np.matmul(X, self.w) + self.b

    def _sigmoid(self, logit):
        """Sigmoid function by stabilization trick.

        sigmoid(z) = 1 / (1 + exp(-z)) 
                   = exp(z) / (1 + exp(z)) * exp(z_max) / exp(z_max)
                   = exp(z - z_max) / (exp(-z_max) + exp(z - z_max)),
        where z is the logit, and z_max = z - max(0, z).
        """
        logit_max = np.maximum(0, logit)
        logit_stable = logit - logit_max
        return np.exp(logit_stable) / (np.exp(-logit_max) + np.exp(logit_stable))
    
    def _model(self, X):
        """Logistic regression model."""
        logit = self._logit(X)
        return self._sigmoid(logit)

    def _loss(self, y, logit):
        """Cross entropy loss by stabilizaiton trick.

        cross_entropy_loss(y, z) 
          = - 1/n * \sum_{i=1}^n y_i * log p(y_i = 1|x_i) + (1 - y_i) * log p(y_i = 0|x_i)
          = - 1/n * \sum_{i=1}^n y_i * (z_i - log(1 + exp(z_i))) + (1 - y_i) * (-log(1 + exp(z_i))),
        where z is the logit, z_max = z - max(0, z),
          log p(y = 1|x)
            = log (1 / (1 + exp(-z))) 
            = log (exp(z) / (1 + exp(z)))
            = z - log(1 + exp(z))
        and 
          log(1 + exp(z)) := logsumexp(z)
            = log(exp(0) + exp(z))
            = log(exp(0) + exp(z) * exp(z_max) / exp(z_max))
            = z_max + log(exp(-z_max) + exp(z - z_max)).
        """
        logit_max = np.maximum(0, logit)
        logit_stable = logit - logit_max
        logsumexp_stable = logit_max + np.log(np.exp(-logit_max) + np.exp(logit_stable))
        self.cross_entropy = -(y * (logit - logsumexp_stable) + (1 - y) * (-logsumexp_stable))
        return np.mean(self.cross_entropy)

    def _optimize(self, X, y):
        """Optimize by stochastic gradient descent."""
        m = X.shape[0]

        y_ = self._model(X) 
        dw = 1 / m * np.matmul(X.T, y_ - y)
        db = np.mean(y_ - y)

        for (param, grad) in zip([self.w, self.b], [dw, db]):
            param[:] = param - self.lr * grad

    def _fetch_batch(self):
        """Fetch batch dataset."""
        idx = list(range(self.n_examples))
        for i in range(0, self.n_examples, self.batch_size):
            idx_batch = idx[i:min(i + self.batch_size, self.n_examples)]
            yield (self.X_train.take(idx_batch, axis=0), self.y_train.take(idx_batch, axis=0))

    def fit(self):
        """Fit model."""
        self._create_weights()

        for epoch in range(1, self.n_epochs + 1):
            total_loss = 0
            for X_train_b, y_train_b in self._fetch_batch():
                y_train_b = y_train_b.reshape((y_train_b.shape[0], -1))
                self._optimize(X_train_b, y_train_b)
                train_loss = self._loss(y_train_b, self._logit(X_train_b))
                total_loss += train_loss * X_train_b.shape[0]

            if epoch % 100 == 0:
                print('epoch {0}: training loss {1}'.format(epoch, total_loss / self.n_examples))

        return self

    def get_coeff(self):
        return self.b, self.w.reshape((-1,))

    def predict(self, X):
        return self._model(X).reshape((-1,))

## Fitting Softmax Regression in NumPy

In [None]:
# Fit our Softmax Regression.
softmax = SoftmaxRegression(batch_size=64, lr=1, n_epochs=100)

In [None]:
# Get datasets and build graph.
softmax.get_data(X_train, y_train, shuffle=True)

In [None]:
softmax.fit()

In [None]:
# Get coefficient.
softmax.get_coeff()

In [None]:
# Predicted probabilities for training data.
p_train_ = softmax.predict(X_train)
p_train_[:10]

In [None]:
# Predicted labels for training data.
y_train_ = (p_train_ > 0.5) * 1
y_train_[:3]

In [None]:
# Prediction accuracy for training data.
accuracy(y_train, y_train_)

In [None]:
# Predicted label correctness for test data.
p_test_ = softmax.predict(X_test)
print(p_test_[:10])
y_test_ = (p_test_ > 0.5) * 1

In [None]:
# Prediction accuracy for test data.
accuracy(y_test, y_test_)

## PyTorch Implementation of Softmax Regression

In [15]:
class SoftmaxRegressionTorch(nn.Module):
    """PyTorch implementation of Softmax Regression."""

    def __init__(self, n_targets=10, batch_size=64, lr=0.01, n_epochs=1000):
        super(SoftmaxRegressionTorch, self).__init__()
        self.n_targets = n_targets
        self.batch_size = batch_size
        self.lr = lr
        self.n_epochs = n_epochs

    def get_data(self, X_train, y_train, shuffle=True):
        """Get dataset and information."""
        self.X_train = X_train
        self.y_train = y_train

        # Get the numbers of examples and inputs.
        self.n_examples, self.n_inputs = self.X_train.shape

        if shuffle:
            idx = list(range(self.n_examples))
            random.shuffle(idx)
            self.X_train = self.X_train[idx]
            self.y_train = self.y_train[idx]

    def _create_model(self):
        """Create logistic regression model."""
        self.net = nn.Sequential(
            nn.Linear(self.n_inputs, self.n_targets),
            nn.Softmax(dim=1),
        )

    def forward(self, x):
        x = x.view(x.shape[0], -1)
        y = self.net(x)
        return y

    def _create_loss(self):
        """Create cross entropy loss."""
        self.criterion = nn.CrossEntropyLoss()

    def _create_optimizer(self):
        """Create optimizer by stochastic gradient descent."""
        self.optimizer = optim.SGD(self.parameters(), lr=self.lr)

    def build(self):
        """Build model, loss function and optimizer."""
        self._create_model()
        self._create_loss()
        self._create_optimizer()

    def _fetch_batch(self):
        """Fetch batch dataset."""
        idx = list(range(self.n_examples))
        for i in range(0, self.n_examples, self.batch_size):
            idx_batch = idx[i:min(i + self.batch_size, self.n_examples)]
            yield (self.X_train.take(idx_batch, axis=0), 
                   self.y_train.take(idx_batch, axis=0))

    def fit(self):
        """Fit model."""
        for epoch in range(1, self.n_epochs + 1):
            total_loss = 0
            for X_train_b, y_train_b in self._fetch_batch():
                # Convert to Tensor from NumPy array and reshape ys.
                X_train_b, y_train_b = (
                    torch.from_numpy(X_train_b), torch.from_numpy(y_train_b))

                y_pred_b = self.net(X_train_b)
                loss = self.criterion(y_pred_b, y_train_b.long())
                total_loss += loss * X_train_b.shape[0]

                # Zero grads, performs backward pass, and update weights.
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            if epoch % 100 == 0:
                print(f'Epoch {epoch}: training loss: {total_loss / self.n_examples}')

    def get_coeff(self):
        """Get model coefficients."""
        # Detach var which require grad.
        return (self.net[0].bias.detach().numpy(),
                self.net[0].weight.detach().numpy())

    def predict(self, X):
        """Predict for new data."""
        with torch.no_grad():
            X_ = torch.from_numpy(X)
            return self.net(X_)

## Fitting Softmax Regression in PyTorch

In [16]:
# Fit PyTorch Logistic Regression.
softmax_torch = SoftmaxRegressionTorch(n_targets=10, batch_size=64, lr=0.5, n_epochs=1000)

In [17]:
softmax_torch.get_data(X_train, y_train, shuffle=True)

In [18]:
softmax_torch.build()

In [19]:
softmax_torch.net

Sequential(
  (0): Linear(in_features=64, out_features=10, bias=True)
  (1): Softmax(dim=1)
)

In [20]:
softmax_torch.fit()

Epoch 100: training loss: 1.5401159524917603
Epoch 200: training loss: 1.5128363370895386
Epoch 300: training loss: 1.5020697116851807
Epoch 400: training loss: 1.4955798387527466
Epoch 500: training loss: 1.4912927150726318
Epoch 600: training loss: 1.4882277250289917
Epoch 700: training loss: 1.4859102964401245
Epoch 800: training loss: 1.4840892553329468
Epoch 900: training loss: 1.4826055765151978
Epoch 1000: training loss: 1.4813412427902222


In [21]:
# Get coefficient.
softmax_torch.get_coeff()

(array([ 0.31273487, -1.1591383 ,  0.2713904 ,  0.08872963,  0.6331543 ,
         0.34047732, -0.42384103,  0.5192086 ,  0.27209905, -0.5812637 ],
       dtype=float32),
 array([[ 6.85066730e-02, -2.07083911e-01,  1.93988141e-02,
          5.55133104e-01, -6.15285151e-03, -1.31471848e+00,
         -9.15930569e-01, -2.69238353e-01,  9.59302932e-02,
         -5.22491872e-01, -1.42384216e-01,  8.99561584e-01,
          1.04160869e+00,  1.10052407e+00, -8.48016918e-01,
         -8.05965662e-02, -7.17134103e-02,  2.38810450e-01,
          1.28496408e+00,  1.21454701e-01, -2.78667998e+00,
          2.05707026e+00,  3.89430434e-01, -1.39909640e-01,
          1.78014040e-02,  9.51899230e-01,  1.31865561e+00,
         -1.65240741e+00, -4.66167593e+00,  3.75972986e-01,
          1.13720012e+00,  5.54777943e-02,  9.29418206e-02,
          1.33213949e+00,  1.50297630e+00, -1.95885289e+00,
         -3.74068594e+00,  3.26903790e-01,  9.41402495e-01,
         -4.79919016e-02, -6.96029440e-02,  3.5649

In [22]:
# Predicted probabilities for training data.
p_train_ = softmax_torch.predict(X_train)

In [23]:
# Prediction accuracy for training data.
accuracy(p_train_.argmax(dim=1).numpy(), y_train)

0.991833704528582

In [24]:
# Predicted label correctness for test data.
p_test_ = softmax_torch.predict(X_test)

In [25]:
# Prediction accuracy for test data.
accuracy(p_test_.argmax(dim=1).numpy(), y_test)

0.9711111111111111

## Benchmark with Sklearn's Softmax Regression

In [26]:
# Fit sklearn's Softmax Regression.
from sklearn.linear_model import LogisticRegression as LogisticRegressionSklearn

softmax_sk = LogisticRegressionSklearn(C=1e4, multi_class='multinomial', max_iter=500)

softmax_sk.fit(X_train, y_train.reshape(y_train.shape[0], ))

LogisticRegression(C=10000.0, max_iter=500, multi_class='multinomial')

In [27]:
# Get coefficients.
softmax_sk.intercept_, softmax_sk.coef_

(array([  6.44249192, -11.78827616,   9.15377243,  -6.54285021,
         20.57724269,  -1.87399062,  -1.49799704,   4.82668839,
         -1.49597245, -17.80110895]),
 array([[ 0.00000000e+00,  1.82014829e-02, -1.02354488e-01,
         -1.09174791e+00,  6.17472267e-01, -2.09580231e+00,
         -2.86710590e+00, -8.13254984e-01,  1.12288811e-03,
         -2.24451729e+00, -6.45541061e-01,  4.30635635e+00,
          5.72461608e-01, -2.66108726e-01, -1.27924671e+00,
         -1.00903806e+00, -3.67088590e-01, -3.81501727e-01,
          2.15420862e+00,  4.97151804e-01, -9.82361490e+00,
          3.66875990e+00,  8.00124111e-01, -3.42277611e-01,
         -3.67357442e-01,  2.76777140e+00,  1.09800041e+00,
         -3.53039252e+00, -1.07179082e+01, -5.15154238e-01,
          2.34853409e+00,  4.09575676e-03,  0.00000000e+00,
          4.89460610e+00,  1.73994209e+00, -3.21061662e+00,
         -1.03196894e+01,  1.89809863e+00,  7.33390382e-01,
          0.00000000e+00,  3.42605581e-03, -1.58390012

In [28]:
# Predicted labels for training data.
y_train_ = softmax_sk.predict(X_train)
y_train_[:3]

array([4., 9., 1.], dtype=float32)

In [29]:
# Prediction accuracy for training data.
accuracy(y_train_, y_train)

1.0

In [30]:
# Predicted label correctness for test data.
y_test_ = softmax_sk.predict(X_test)

In [31]:
# # Prediction accuracy for test data.
accuracy(y_test_, y_test)

0.9622222222222222