In [2]:
## Load libraries
import numpy as np
import sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

In [3]:
# Show NumPy arrays with 2 digits of precision when they are printed
np.set_printoptions(precision=2)

In [4]:
import tensorflow as tf




In [5]:
# Display the version of the imported TensorFlow package
tf.__version__

'2.15.0'

In [1]:
# Generate artificial data with 64 samples, 5 features per sample
# and 3 output classes
np.random.seed(0) # fix the seed so that Restart & Run All reproduces the same data
num_samples = 64 # number of samples
num_features = 5 # number of features (a.k.a. dimensionality)
num_labels = 3 # number of output labels
# Data matrix (each column = single sample), integer entries drawn from 3..9
X = np.random.choice(np.arange(3, 10), size = (num_features, num_samples), replace = True)
# Class labels: one integer in 0..num_labels-1 per sample
y = np.random.choice(num_labels, size = num_samples, replace = True)
# One-hot encode class labels so that, like X, each column = single sample.
# Row k of the identity matrix is the one-hot vector of class k; indexing with
# the labels gives shape (num_samples, num_labels), hence the transpose.
# Using num_labels explicitly keeps 3 rows even if a class is absent from the draw.
y = np.eye(num_labels)[y].T

NameError: name 'np' is not defined

---

A generic layer class with forward and backward methods

----

In [7]:
class Layer:
  """Generic layer interface with a forward and a backward method.

  The base class holds no logic; the layer classes derived from it
  override both methods.
  """

  def __init__(self):
    # Nothing is cached until a forward pass stores something here
    self.input, self.output = None, None

  def forward(self, input):
    """Forward pass placeholder: does nothing and returns None."""
    return None

  def backward(self, output_gradient, learning_rate):
    """Backward pass placeholder: does nothing and returns None."""
    return None

---

The softmax classifier steps for a generic sample $\mathbf{x}$ with (one-hot encoded) true label $\mathbf{y}$ (3 possible categories) using a randomly initialized weights matrix (with the bias absorbed as its last column):

1. Calculate raw scores vector for a generic sample $\mathbf{x}$  (bias feature added): $$\mathbf{z} = \mathbf{Wx}.$$
2. Calculate softmax probabilities (that is, softmax-activate the raw scores) $$\mathbf{a} = \text{softmax}(\mathbf{z})\Rightarrow\begin{bmatrix}a_0\\a_1\\a_2\end{bmatrix}= \text{softmax}\left(\begin{bmatrix}z_0\\z_1\\z_2\end{bmatrix}\right)=\begin{bmatrix}\frac{e^{z_0}}{e^{z_0}+e^{z_1}+e^{z_2}}\\\frac{e^{z_1}}{e^{z_0}+e^{z_1}+e^{z_2}}\\\frac{e^{z_2}}{e^{z_0}+e^{z_1}+e^{z_2}}\end{bmatrix}$$
3. Softmax loss for this sample is (where output label $y$ is not yet one-hot encoded)
$$\begin{align*}L &=  -\log([\mathbf{a}]_y) \\&= -\log\left(\left[\text{softmax}(\mathbf{z})\right]_y\right)\\ &= -\log\left(\left[\text{softmax}(\mathbf{Wx})\right]_y\right).\end{align*}$$
4. Predicted probability vector that the sample belongs to each one of the output categories is given a new name $$\hat{\mathbf{y}} = \mathbf{a}.$$
5. One-hot encoding the output label $$\underbrace{y\rightarrow\mathbf{y}}_{\text{e.g.}\,2\,\rightarrow\begin{bmatrix}0\\0\\1\end{bmatrix}}$$ results in the following representation for the softmax loss for the sample which is also referred to as the categorical crossentropy (CCE) loss:
$$\begin{align*}L &= L\left(\mathbf{y},\hat{\mathbf{y}}\right)=\sum_{k=0}^2-y_k\log\left(\hat{y}_k\right)\end{align*}.$$
6. Calculate the gradient of the loss for the sample w.r.t. weights by following the computation graph from top to bottom (that is, backward):
$$\begin{align*} L\\{\color{yellow}\downarrow}\\ \hat{\mathbf{y}} &= \mathbf{a}\\{\color{yellow}\downarrow}\\\mathbf{z}\\{\color{yellow}\downarrow}\\\mathbf{W}\end{align*}$$
$$\begin{align*}\Rightarrow \nabla_\mathbf{W}(L) &= \nabla_\mathbf{W}(\mathbf{z}) \times\nabla_\mathbf{z}(\hat{\mathbf{y}})\times\nabla_{\hat{\mathbf{y}}}(L)\\&= \underbrace{\nabla_\mathbf{W}(\mathbf{z})}_\text{first term} \times\underbrace{\nabla_\mathbf{z}(\mathbf{a})}_\text{second to last term}\times\underbrace{\nabla_\hat{\mathbf{y}}(L)}_\text{last term}.\end{align*}$$
7. Now focus on the last term $\nabla_\hat{\mathbf{y}}(L)$:
$$\begin{align*}\nabla_\hat{\mathbf{y}}(L) &=\begin{bmatrix}\nabla_{\hat{y}_0}(L)\\\nabla_{\hat{y}_1}(L)\\\nabla_{\hat{y}_2}(L)\end{bmatrix} = \begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_1\\-y_2/\hat{y}_2\end{bmatrix}.\end{align*}$$
8. Now focus on the second to last term $\nabla_\mathbf{z}(\mathbf{a})$:
$$\begin{align*}\nabla_\mathbf{z}(\mathbf{a}) &= \nabla_\mathbf{z}\left(\begin{bmatrix}a_0\\a_1\\a_2\end{bmatrix}\right)\\ &= \begin{bmatrix}\nabla_\mathbf{z}(a_0)&\nabla_\mathbf{z}(a_1)&\nabla_\mathbf{z}(a_2)\end{bmatrix} \\&= \begin{bmatrix}\nabla_{z_0}(a_0)&\nabla_{z_0}(a_1)&\nabla_{z_0}(a_2)\\\nabla_{z_1}(a_0)&\nabla_{z_1}(a_1)&\nabla_{z_1}(a_2)\\\nabla_{z_2}(a_0)&\nabla_{z_2}(a_1)&\nabla_{z_2}(a_2)\end{bmatrix}\\&=\begin{bmatrix}a_0(1-a_0)&-a_1a_0&-a_2a_0\\-a_0a_1&a_1(1-a_1)&-a_2a_1\\-a_0a_2&-a_1a_2&a_2(1-a_2)\end{bmatrix}.\end{align*}$$
9. Now focus on the first term $\nabla_\mathbf{W}(\mathbf{z}) = \nabla_\mathbf{W}(\mathbf{Wx})$:

![](https://onedrive.live.com/embed?resid=37720F927B6DDC34%21103155&authkey=%21AMH79mXBdb_raAA&width=660)

The full gradient can be written as $\nabla_\mathbf{W}(L)=$

![](https://onedrive.live.com/embed?resid=37720F927B6DDC34%21103156&authkey=%21AIdyOQ3a-er-7-A&width=660)

$$\begin{align*}=\begin{bmatrix}a_0(1-a_0)&-a_1a_0&-a_2a_0\\-a_0a_1&a_1(1-a_1)&-a_2a_1\\-a_0a_2&-a_1a_2&a_2(1-a_2)\end{bmatrix}\times\begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_1\\-y_2/\hat{y}_2\end{bmatrix}\mathbf{x}^\mathrm{T}.\end{align*}$$


---

---

CCE loss and its gradient

$$\begin{align*}L &= L\left(\mathbf{y},\hat{\mathbf{y}}\right)=\sum_{k=0}^2-y_k\log\left(\hat{y}_k\right)\\\nabla_\hat{\mathbf{y}}(L) &=\begin{bmatrix}\nabla_{\hat{y}_0}(L)\\\nabla_{\hat{y}_1}(L)\\\nabla_{\hat{y}_2}(L)\end{bmatrix} = \begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_1\\-y_2/\hat{y}_2\end{bmatrix}.\end{align*}$$


---

In [None]:
## Define the loss function and its gradient
def cce(y, yhat, eps = 1e-12):
  """Categorical crossentropy (CCE) loss.

  For one sample the loss is sum_k -y_k * log(yhat_k). When several samples
  are passed as columns, the per-sample losses are averaged, which matches
  the 1/batch_size scaling the notebook applies to the gradient.

  Parameters:
    y: one-hot encoded true labels, either a single vector of length
       num_labels or an array with one column per sample.
    yhat: predicted probabilities with the same shape as y.
    eps: lower bound applied to yhat before taking the logarithm.

  Returns:
    The loss as a scalar (mean over samples).
  """
  y = np.asarray(y, dtype = float)
  # Clip so that a predicted probability of exactly 0 cannot produce log(0)
  yhat = np.clip(np.asarray(yhat, dtype = float), eps, 1.0)
  # Sum over the class axis (axis 0); a single vector collapses to one number
  per_sample_loss = np.sum(-y * np.log(yhat), axis = 0)
  return(np.mean(per_sample_loss))

def cce_gradient(y, yhat, eps = 1e-12):
  """Gradient of the CCE loss with respect to the predicted probabilities.

  Implements the elementwise formula -y_k / yhat_k for every sample. The
  result is not averaged over samples; any 1/batch_size scaling is left to
  the caller.

  Parameters:
    y: one-hot encoded true labels (a vector, or one column per sample).
    yhat: predicted probabilities with the same shape as y.
    eps: lower bound applied to yhat before dividing.

  Returns:
    Array with the same shape as y holding -y / yhat.
  """
  y = np.asarray(y, dtype = float)
  # Clip so that a predicted probability of exactly 0 cannot divide by zero
  yhat = np.clip(np.asarray(yhat, dtype = float), eps, 1.0)
  return(-y / yhat)

# TensorFlow in-built function for categorical crossentropy loss
#cce = tf.keras.losses.CategoricalCrossentropy()

---

Softmax activation layer class
$$\begin{align*}\text{forward:}\ \mathbf{a} &=\text{softmax}(\mathbf{z}),\\\text{backward:}\ \nabla_\mathbf{z}(L) &= \nabla_{\mathbf{z}}(\mathbf{a})\times\nabla_{\mathbf{a}}(L) = \nabla_{\mathbf{z}}(\mathbf{a})\times\nabla_{\hat{\mathbf{y}}}(L)\\&=\begin{bmatrix}a_0(1-a_0)&-a_1a_0&-a_2a_0\\-a_0a_1&a_1(1-a_1)&-a_2a_1\\-a_0a_2&-a_1a_2&a_2(1-a_2)\end{bmatrix}\begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_1\\-y_2/\hat{y}_2\end{bmatrix}.\end{align*}$$


---

In [None]:
## Softmax activation layer class
class Softmax(Layer):
  """Softmax activation layer.

  Works on 2-D arrays in which each column holds the raw scores of one
  sample (rows = classes), which is the layout produced by Dense.forward.
  """

  def forward(self, input):
    """Softmax-activate the raw scores column by column.

    The result is stored in self.output; nothing is returned.
    """
    self.input = input
    # Subtracting the column-wise maximum leaves the softmax unchanged but
    # keeps np.exp from overflowing for large scores
    shifted = input - np.max(input, axis = 0, keepdims = True)
    exps = np.exp(shifted)
    self.output = exps / np.sum(exps, axis = 0, keepdims = True)

  def backward(self, output_gradient, learning_rate = None):
    """Propagate the loss gradient from the activations to the raw scores.

    Parameters:
      output_gradient: gradient of the loss w.r.t. the softmax output, same
        shape as self.output (classes x samples).
      learning_rate: unused, the layer has no trainable parameters.

    Returns:
      Gradient of the loss w.r.t. the raw scores, same shape as self.output.
    """
    a = self.output
    num_classes = a.shape[0]
    # Per-sample softmax Jacobian: T[i, j, b] = a_i * (delta_ij - a_j) for
    # sample b, i.e. diag(a) - a a^T stacked along the last axis
    T = np.einsum('ib,ij->ijb', a, np.eye(num_classes)) - np.einsum('ib,jb->ijb', a, a)
    # Jacobian-vector product carried out separately for every sample b
    return(np.einsum('ijb,jb->ib', T, output_gradient))

---

Dense layer class

$$\begin{align*}\text{forward:}\ \mathbf{z}&=\mathbf{Wx}\\\text{backward:}\ \nabla_\mathbf{W}(L)&=\nabla_{\mathbf{W}}(\mathbf{z})\times\nabla_{\mathbf{z}}(L)\\&=\nabla_{\mathbf{z}}(L)\mathbf{x}^\mathrm{T}.\end{align*}$$

---

In [None]:
## Dense layer class
class Dense(Layer):
    """Dense (fully connected) layer with the bias absorbed into the weights.

    The weights matrix has shape (output_size, input_size + 1); its last
    column holds the biases and multiplies a row of ones appended to the
    input (bias trick).
    """

    def __init__(self, input_size, output_size):
        super().__init__() # start with self.input / self.output set to None
        self.weights = np.random.randn(output_size, input_size+1) # bias trick
        self.weights[:, -1] = 0.01 # Set all bias values to the same nonzero constant

    def forward(self, input):
        """Compute z = W x for every column (sample) of a 2-D input.

        The bias-augmented input is stored in self.input and the raw scores
        in self.output; nothing is returned.
        """
        self.input = np.vstack([input, np.ones((1, input.shape[1]))]) # bias trick
        self.output = np.dot(self.weights, self.input)

    def backward(self, output_gradient, learning_rate):
        """Update the weights by one gradient step and pass the gradient on.

        Parameters:
          output_gradient: gradient of the loss w.r.t. self.output
            (output_size x samples).
          learning_rate: step size of the gradient descent update.

        Returns:
          Gradient of the loss w.r.t. the input given to forward
          (input_size x samples).
        """
        # grad_W(L) = grad_z(L) x^T, summed over the samples of the batch
        weights_gradient = np.dot(output_gradient, self.input.T)
        # Computed with the weights from before the update. The bias column
        # is left out because it multiplies the appended row of ones, which
        # is not part of the input handed to forward.
        input_gradient = np.dot(self.weights[:, :-1].T, output_gradient)
        self.weights = self.weights + learning_rate * (-weights_gradient)
        return(input_gradient)

In [None]:
# One forward pass and one backward pass on the first batch of 16 samples
batch_size = 16
learning_rate = 1e-02 # learning rate
dlayer = Dense(num_features, num_labels) # define dense layer
print(dlayer.weights) # weights before the update
softmax = Softmax() # define softmax activation

# --- Forward prop: raw scores -> softmax probabilities -> loss
dlayer.forward(X[:, :batch_size])
softmax.forward(dlayer.output)
loss = cce(y[:, :batch_size], softmax.output)
print(loss)

# --- Backward prop: loss -> softmax -> dense layer (which updates its weights)
grad = cce_gradient(y[:, :batch_size], softmax.output)
grad = softmax.backward(grad)
grad = dlayer.backward((1/batch_size)*grad, learning_rate)
print('-----')
print(dlayer.weights) # weights after the update

---

Function to generate sample indices for batch processing according to batch size

---

In [None]:
## Function to generate sample indices for batch processing according to batch size
def generate_batch_indices(num_samples, batch_size):
  """Shuffle the sample indices 0..num_samples-1 and cut them into batches.

  Returns a list of index arrays. Every array has batch_size entries except
  possibly the last one, which holds whatever indices remain.
  """
  # Random ordering of all sample indices (drawn without replacement)
  shuffled = np.random.choice(num_samples, num_samples, replace = False)
  # Cut after every batch_size-th index
  split_points = np.arange(batch_size, shuffled.size, batch_size)
  return(np.split(shuffled, split_points))

---

Example generation of batch indices

---

In [None]:
## Example generation of batch indices
batch_size = 16 # batch size
# Number of batches needed to go through all samples once
num_iterations_per_epoch = int(np.ceil(num_samples/batch_size))
print('Number of iterations per epoch = %d\n'%(num_iterations_per_epoch))
epoch = 0
b = 0
for it in range(num_iterations_per_epoch):
  # A new epoch starts whenever the iteration counter is a multiple of the
  # number of iterations per epoch: reshuffle and restart the batch counter
  if it % num_iterations_per_epoch == 0:
    print('--------------------------------')
    epoch += 1
    print('Epoch %d:'%(epoch))
    batch_indices = generate_batch_indices(num_samples, batch_size)
    b = 0
    print('--------------------------------')
  print('In iteration %d, using samples' % (it+1))
  print(batch_indices[b])
  b += 1

---

Train the 0-layer neural network using batch training with batch size = 16

---

In [None]:
## Train the 0-layer neural network using batch training with batch size = 16

learning_rate = 1e-04 # learning rate
batch_size = 16 # batch size
nepochs = 100 # number of epochs

dlayer = Dense(num_features, num_labels) # define dense layer
softmax = Softmax() # define softmax activation layer

# Steps: in every epoch reshuffle the samples into batches; for each batch
# calculate the loss, the gradient of the loss, and update the weights.
# NOTE: y is indexed like X, i.e. it is expected to hold one one-hot encoded
# column per sample.

# Number of batches per epoch
num_iterations_per_epoch = int(np.ceil(num_samples/batch_size))
print('Number of iterations per epoch = %d\n'%(num_iterations_per_epoch))

loss = 0 # average batch loss of the most recent epoch
for epoch in range(nepochs): # exactly nepochs passes over the data
  # Fresh random batches at the start of every epoch
  batch_indices = generate_batch_indices(num_samples, batch_size)
  loss = 0 # restart the accumulation so the printed value is per epoch
  for sample_indices in batch_indices: # every batch is used, not only the first
    dlayer.forward(X[:, sample_indices]) # forward prop
    softmax.forward(dlayer.output) # Softmax activate
    loss += cce(y[:, sample_indices], softmax.output) # calculate loss for batch
    # Backward prop starts here
    grad = cce_gradient(y[:, sample_indices], softmax.output)
    grad = softmax.backward(output_gradient = grad)
    # Average the gradient over the samples actually in this batch (the last
    # batch can be smaller than batch_size)
    grad = dlayer.backward((1/len(sample_indices))*grad, learning_rate)
  loss = loss / len(batch_indices) # average of the per-batch loss values
  print('Epoch %d:, loss = %f'%(epoch+1, loss))