In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import math
# from io import BytesIO
# from sklearn.datasets import load_iris
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler

import tensorflow as tf

In [2]:
random_seed = 42
learning_rate = 1e-1
num_epochs = 100
batch_size = 256

num_features = 784
num_classes = 10

In [3]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

X_train = np.reshape(X_train / 255, (-1, num_features)).astype(np.float32)
X_test = np.reshape(X_test / 255, (-1, num_features)).astype(np.float32)

In [4]:
def next_batch(X, y, batch_size):
    i, iters_per_epoch = 0, math.ceil(y.shape[0] / batch_size)
    while i < iters_per_epoch:
        i += 1
        lo, hi = (i - 1) * batch_size, i * batch_size
        yield X[lo:hi], y[lo:hi]

## Low-level implementation with manual gradients

In [10]:
class SoftmaxRegressionMannualGrad:
    def __init__(self, num_features, num_classes):
        self.num_features = num_features
        self.num_classes = num_classes
        
        self.W = np.random.randn(self.num_features, self.num_classes) * 2 / num_features # he initialization
        self.b = np.zeros((num_classes, ))
        
    
    def forward(self, x):
        logits = np.matmul(x, self.W) + self.b
        return logits
    
    def step(self, x, y, learning_rate=1e-1):
        logits = self.forward(x)
        proba = self._softmax(logits).astype(np.float32)
        one_hot = np.zeros((len(y), num_classes), dtype=np.float32)
        one_hot[range(len(y)), y] = 1.
        grad_w = np.matmul(x.T, proba - one_hot)
        grad_b = np.sum(proba - one_hot, axis=0)
        self.W -= learning_rate * grad_w
        self.b -= learning_rate * grad_b
    
    def evaluate(self, x, y):
        logits = self.forward(x)
        y_pred = np.argmax(logits, axis=1)
        return np.sum(y_pred == y) / len(y)
        
    def train(self, epochs, batch_size, learning_rate=1e-3):
        for e in range(1, epochs + 1):
            for batch_X, batch_y in next_batch(X_train, y_train, batch_size):
                self.step(batch_X, batch_y, learning_rate=learning_rate)
            if not e % 10:
                train_acc = self.evaluate(X_train[:512], y_train[:512])
                test_acc = self.evaluate(X_test[:512], y_test[:512])
                print('Epoch: %03d' % e, end="")
                print(' | Train ACC: %.3f' % train_acc, end="")
                print(' | Test ACC: %.3f' % test_acc)
    
    def _softmax(self, logits):
        logits -= np.max(logits, axis=1, keepdims=True)  # numerical stability
        return np.exp(logits) / np.sum(np.exp(logits), axis=1, keepdims=True)

In [11]:
softmax_regressor = SoftmaxRegressionMannualGrad(784, 10)
softmax_regressor.train(num_epochs, batch_size)

Epoch: 010 | Train ACC: 0.926 | Test ACC: 0.918
Epoch: 020 | Train ACC: 0.930 | Test ACC: 0.920
Epoch: 030 | Train ACC: 0.939 | Test ACC: 0.920
Epoch: 040 | Train ACC: 0.941 | Test ACC: 0.926
Epoch: 050 | Train ACC: 0.941 | Test ACC: 0.926
Epoch: 060 | Train ACC: 0.943 | Test ACC: 0.926
Epoch: 070 | Train ACC: 0.943 | Test ACC: 0.928
Epoch: 080 | Train ACC: 0.943 | Test ACC: 0.928
Epoch: 090 | Train ACC: 0.945 | Test ACC: 0.928
Epoch: 100 | Train ACC: 0.947 | Test ACC: 0.928


## Low-level implementation using tensorflow autograd

In [6]:
class SoftmaxRegressionTF(tf.keras.Model):
    def __init__(self, num_features, num_classes):
        super(SoftmaxRegressionTF, self).__init__()
        self.num_features = num_features
        self.num_classes = num_classes
        
        w_init = tf.initializers.VarianceScaling()
        self.W = tf.Variable(w_init(shape=(num_features, num_classes), 
                                    dtype=tf.float32), name='kernel')
        self.b = tf.Variable(tf.zeros((num_classes, ), dtype=tf.float32), name='bias')
    
    def call(self, x):
        logits = tf.matmul(x, self.W) + self.b
        return logits

In [7]:
NUM_EPOCH = 100
regressor = SoftmaxRegressionTF(28 * 28, 10)
optimazer = tf.optimizers.SGD(learning_rate=1e-1)

# def loss_function(y, proba):
#     return -y * tf.math.log(proba)

for e in range(NUM_EPOCH):
    for batch_X, batch_y in next_batch(X_train, y_train, batch_size):
        with tf.GradientTape() as tape:
            logits =regressor(batch_X)
            loss = tf.reduce_mean(tf.losses.sparse_categorical_crossentropy(batch_y, logits, from_logits=True))
        
        variables = regressor.variables
        gradients = tape.gradient(loss, variables)
        optimazer.apply_gradients(zip(gradients, variables))
        
    if not e % 10:
        train_y_pred = tf.argmax(logits, axis=1).numpy()
        train_acc = np.sum(train_y_pred == batch_y) / len(batch_y)

    #         test_y_pred = tf.where(LRTF(X_test)[:, 0] >= .5, 1, 0).numpy()
    #         test_acc = np.sum(test_y_pred == y_test) / len(y_test)

        print('Epoch: %03d' % (e+1), end="")
        print(' | Train ACC: %.3f' % train_acc, end="")
        print(' | Cost: %.3f' % loss.numpy())

Epoch: 001 | Train ACC: 0.906 | Cost: 0.536
Epoch: 011 | Train ACC: 0.938 | Cost: 0.322
Epoch: 021 | Train ACC: 0.938 | Cost: 0.296
Epoch: 031 | Train ACC: 0.948 | Cost: 0.282
Epoch: 041 | Train ACC: 0.958 | Cost: 0.271
Epoch: 051 | Train ACC: 0.958 | Cost: 0.263
Epoch: 061 | Train ACC: 0.958 | Cost: 0.256
Epoch: 071 | Train ACC: 0.958 | Cost: 0.250
Epoch: 081 | Train ACC: 0.958 | Cost: 0.244
Epoch: 091 | Train ACC: 0.958 | Cost: 0.240


In [44]:
y_pred = []
for batch_X, batch_y in next_batch(X_test, y_test, batch_size):
    logits =regressor(batch_X)
    y_pred.append(tf.argmax(logits, axis=1).numpy())
y_pred = np.concatenate(y_pred)
np.sum(y_pred == y_test) / len(y_test)

0.9256