In [17]:
## Load libraries
import numpy as np
import sys
import matplotlib.pyplot as plt
import matplotlib.cm as cm
plt.style.use('dark_background')
%matplotlib inline

In [18]:
import tensorflow as tf

In [19]:
tf.__version__

'2.15.0'

In [20]:
# Generate artificial data with 5 samples, 4 features per sample
# and 3 output classes
num_samples = 5 # number of samples
num_features = 4 # number of features (a.k.a. dimensionality)
num_labels = 3 # number of output labels
# Data matrix (each column = single sample)
X = np.random.choice(np.arange(3, 10), size = (num_features, num_samples), replace = True)
# Class labels
y = np.random.choice([0, 1, 2], size = num_samples, replace = True)
print(X)
print('------')
print(y)
print('------')
# One-hot encode class labels
y = tf.keras.utils.to_categorical(y)
print(y)

[[3 4 7 7 3]
 [7 5 9 5 6]
 [7 3 3 3 6]
 [7 4 4 4 3]]
------
[0 1 0 0 1]
------
[[1. 0.]
 [0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]]


---

A generic layer class with forward and backward methods

----

In [21]:
class Layer:
  def __init__(self):
    self.input = None
    self.output = None

  def forward(self, input):
    pass

  def backward(self, output_gradient, learning_rate):
    pass

---

The softmax classifier steps for a generic sample $\mathbf{x}$ with (one-hot encoded) true label $\mathbf{y}$ (3 possible categories) using a randomly initialized weights matrix (with bias abosrbed as its last last column):

1. Calculate raw scores vector for a generic sample $\mathbf{x}$  (bias feature added): $$\mathbf{z} = \mathbf{Wx}.$$
2. Calculate softmax probabilities (that is, softmax-activate the raw scores) $$\mathbf{a} = \text{softmax}(\mathbf{z})\Rightarrow\begin{bmatrix}a_0\\a_1\\a_2\end{bmatrix}= \text{softmax}\left(\begin{bmatrix}z_0\\z_1\\z_2\end{bmatrix}\right)=\begin{bmatrix}\frac{e^{z_0}}{e^{z_0}+e^{z_1}+e^{z_2}}\\\frac{e^{z_1}}{e^{z_0}+e^{z_1}+e^{z_2}}\\\frac{e^{z_2}}{e^{z_0}+e^{z_1}+e^{z_2}}\end{bmatrix}$$
3. Softmax loss for this sample is (where output label $y$ is not yet one-hot encoded)
$$\begin{align*}L &=  -\log([a]_y) \\&= -\log\left(\left[\text{softmax}(\mathbf{z})\right]_y\right)\\ &= -\log\left(\left[\text{softmax}(\mathbf{Wx})\right]_y\right).\end{align*}$$
4. Predicted probability vector that the sample belongs to each one of the output categories is given a new name $$\hat{\mathbf{y}} = \mathbf{a}.$$
5. One-hot encoding the output label $$\underbrace{y\rightarrow\mathbf{y}}_{\text{e.g.}\,2\,\rightarrow\begin{bmatrix}0\\0\\1\end{bmatrix}}$$ results in the following representation for the softmax loss for the sample which is also referred to as the categorical crossentropy (CCE) loss:
$$\begin{align*}L &= L\left(\mathbf{y},\hat{\mathbf{y}}\right)=\sum_{k=0}^2-y_k\log\left(\hat{y}_k\right)\end{align*}.$$
6. Calculate the gradient of the loss for the sample w.r.t. weights by following the computation graph from top to bottom (that is, backward):
$$\begin{align*} L\\{\color{yellow}\downarrow}\\ \hat{\mathbf{y}} &= \mathbf{a}\\{\color{yellow}\downarrow}\\\mathbf{z}\\{\color{yellow}\downarrow}\\\mathbf{W}\end{align*}$$
$$\begin{align*}\Rightarrow \nabla_\mathbf{W}(L) &= \nabla_\mathbf{W}(\mathbf{z}) \times\nabla_\mathbf{z}(\mathbf{a})\times\nabla_\mathbf{a}(L)\\&= \underbrace{\nabla_\mathbf{W}(\mathbf{z})}_\text{first term} \times\underbrace{\nabla_\mathbf{z}(\mathbf{a})}_\text{second to last term}\times\underbrace{\nabla_\hat{\mathbf{y}}(L)}_\text{last term}.\end{align*}$$
7. Now focus on the last term $\nabla_\hat{\mathbf{y}}(L)$:
$$\begin{align*}\nabla_\hat{\mathbf{y}}(L) &=\begin{bmatrix}\nabla_{\hat{y}_0}(L)\\\nabla_{\hat{y}_1}(L)\\\nabla_{\hat{y}_2}(L)\end{bmatrix} = \begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_2\\-y_0/\hat{y}_2.\end{bmatrix}\end{align*}$$
8. Now focus on the second to last term $\nabla_\mathbf{z}(\mathbf{a})$:
$$\begin{align*}\nabla_\mathbf{z}(\mathbf{a}) &= \nabla_\mathbf{z}\left(\begin{bmatrix}a_0\\a_1\\a_2\end{bmatrix}\right)\\ &= \begin{bmatrix}\nabla_\mathbf{z}(a_0)&\nabla_\mathbf{z}(a_1)&\nabla_\mathbf{z}(a_2)\end{bmatrix} \\&= \begin{bmatrix}\nabla_{z_0}(a_0)&\nabla_{z_0}(a_1)&\nabla_{z_0}(a_2)\\\nabla_{z_1}(a_0)&\nabla_{z_1}(a_1)&\nabla_{z_1}(a_2)\\\nabla_{z_2}(a_0)&\nabla_{z_2}(a_1)&\nabla_{z_2}(a_2)\end{bmatrix}\\&=\begin{bmatrix}a_0(1-a_0)&-a_1a_0&-a_2a_0\\-a_0a_1&a_1(1-a_1)&-a_1a_1\\-a_0a_2&-a_1a_2&a_2(1-a_2)\end{bmatrix}.\end{align*}$$
9. On Monday, we will focus on the first term to complete the gradient calculation using the computation graph.


---

In [22]:
## Softmax activation class
class Softmax(Layer):
  def forward(self, input):
    self.output = np.array(tf.nn.softmax(input))

  def backward(self, output_gradient, learning_rate):
    return(np.dot((np.identity(np.size(self.output))-self.output.T) * self.output, output_gradient))

In [23]:
## Define the loss function and its gradient
def cce(y, yhat):
  return(-np.sum(y*np.log(yhat)))

def cce_gradient(y, yhat):
  return(-y/yhat)

# TensorFlow in-built function for categorical crossentropy loss
cce = tf.keras.losses.CategoricalCrossentropy()
cce

<keras.src.losses.CategoricalCrossentropy at 0x191e8dfa910>

In [24]:
## Train the 0-layer neural network using batch training with batch size = 1

# Steps: run over each sample, calculate loss, gradient of loss,
# and update weights.

# Step-1: add the bias feature to all the samples
# Step-2: initialize the entries of the weights matrix randomly
# Step-3: create softmax layer object softmax

# Step-4: run over each sample
for i in range(X.shape[1]):
  # Step-5: forward step
  # (a) Raw scores z = Wx = np.dot(W, x[:, i])
  # (b) Softmax activation: softmax.forward(z)
  # (c) Calculate cce loss for sample: cce(y[i, :], softmax.output)
  # (d) Print cce loss

  # Step-6: backward step
  # (a) Calculate the gradient of the sample loss w.r.t. input of the
  # softmax layer: softmax.backward(output_gradient = cce_gradient(y[i, :], softmax.output))
  # (d) Print gradient

SyntaxError: incomplete input (1102542339.py, line 21)

1. Calculate raw scores vector for a generic sample $\mathbf{x}$  (bias feature added): $$\mathbf{z} = \mathbf{Wx}.$$

In [25]:
x = tf.constant([[1, 2, 3], [4, 5, 6]])

In [26]:
W = tf.Variable(tf.random.normal([3, 2]))

In [27]:
import numpy as np
z = np.dot(W, x)
z

array([[-3.89800286, -6.52875423, -9.15950561],
       [-4.53273789, -5.65191492, -6.77109195],
       [-1.38968897, -2.18854482, -2.98740068]])

2. Calculate softmax probabilities (that is, softmax-activate the raw scores) $$\mathbf{a} = \text{softmax}(\mathbf{z})\Rightarrow\begin{bmatrix}a_0\\a_1\\a_2\end{bmatrix}= \text{softmax}\left(\begin{bmatrix}z_0\\z_1\\z_2\end{bmatrix}\right)=\begin{bmatrix}\frac{e^{z_0}}{e^{z_0}+e^{z_1}+e^{z_2}}\\\frac{e^{z_1}}{e^{z_0}+e^{z_1}+e^{z_2}}\\\frac{e^{z_2}}{e^{z_0}+e^{z_1}+e^{z_2}}\end{bmatrix}$$

In [28]:
z = tf.matmul(W, tf.cast(x, tf.float32))
z

<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[-3.8980029, -6.528754 , -9.159506 ],
       [-4.5327377, -5.651915 , -6.771092 ],
       [-1.389689 , -2.1885448, -2.9874005]], dtype=float32)>

In [29]:
softmax = lambda z: np.exp(z) / np.sum(np.exp(z))
a = softmax(z)
a

array([[4.5181617e-02, 3.2541757e-03, 2.3437974e-04],
       [2.3949694e-02, 7.8207320e-03, 2.5538483e-03],
       [5.5502009e-01, 2.4967213e-01, 1.1231335e-01]], dtype=float32)

3. Softmax loss for this sample is (where output label $y$ is not yet one-hot encoded)
$$\begin{align*}L &=  -\log([a]_y) \\&= -\log\left(\left[\text{softmax}(\mathbf{z})\right]_y\right)\\ &= -\log\left(\left[\text{softmax}(\mathbf{Wx})\right]_y\right).\end{align*}$$

In [30]:
def softmax_loss(y, a):
    return -np.log(a[y])

y = 1  
a = softmax(z)
loss = softmax_loss(y, a)
print(loss)


[3.7317996 4.850977  5.970154 ]


4. Predicted probability vector that the sample belongs to each one of the output categories is given a new name $$\hat{\mathbf{y}} = \mathbf{a}.$$

In [31]:
y_hat = a
y_hat

array([[4.5181617e-02, 3.2541757e-03, 2.3437974e-04],
       [2.3949694e-02, 7.8207320e-03, 2.5538483e-03],
       [5.5502009e-01, 2.4967213e-01, 1.1231335e-01]], dtype=float32)

5. One-hot encoding the output label $$\underbrace{y\rightarrow\mathbf{y}}_{\text{e.g.}\,2\,\rightarrow\begin{bmatrix}0\\0\\1\end{bmatrix}}$$ results in the following representation for the softmax loss for the sample which is also referred to as the categorical crossentropy (CCE) loss:
$$\begin{align*}L &= L\left(\mathbf{y},\hat{\mathbf{y}}\right)=\sum_{k=0}^2-y_k\log\left(\hat{y}_k\right)\end{align*}.$$

In [32]:
y_encoded = tf.keras.utils.to_categorical(y, num_labels)
y_encoded

array([0., 1., 0.], dtype=float32)

6. Calculate the gradient of the loss for the sample w.r.t. weights by following the computation graph from top to bottom (that is, backward):
$$\begin{align*} L\\{\color{yellow}\downarrow}\\ \hat{\mathbf{y}} &= \mathbf{a}\\{\color{yellow}\downarrow}\\\mathbf{z}\\{\color{yellow}\downarrow}\\\mathbf{W}\end{align*}$$
$$\begin{align*}\Rightarrow \nabla_\mathbf{W}(L) &= \nabla_\mathbf{W}(\mathbf{z}) \times\nabla_\mathbf{z}(\mathbf{a})\times\nabla_\mathbf{a}(L)\\&= \underbrace{\nabla_\mathbf{W}(\mathbf{z})}_\text{first term} \times\underbrace{\nabla_\mathbf{z}(\mathbf{a})}_\text{second to last term}\times\underbrace{\nabla_\hat{\mathbf{y}}(L)}_\text{last term}.\end{align*}$$

In [33]:
import numpy as np

def softmax(z):
    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z)

def softmax_loss(y, a):
    return -np.log(a[y])

def gradient_W(X, y, a):
    # Compute the gradient of the loss with respect to z
    grad_z = a
    grad_z[y] -= 1

    # Compute the gradient of z with respect to W
    grad_W = np.outer(grad_z, X)

    return grad_W

x_0 = np.random.rand()
x_1 = np.random.rand()
x_2 = np.random.rand()
x_3 = np.random.rand()

X = np.array([x_0, x_1, x_2, x_3])

print(X)
print('------')
grad_W = gradient_W(X, y, a)
print('------')
print(grad_W)

[0.54602814 0.92693996 0.66335014 0.63450074]
------
------
[[ 2.46704345e-02  4.18806464e-02  2.99712322e-02  2.86677695e-02]
 [ 1.77687149e-03  3.01642545e-03  2.15865789e-03  2.06477686e-03]
 [ 1.27977933e-04  2.17255946e-04  1.55475833e-04  1.48714117e-04]
 [-5.32950942e-01 -9.04740042e-01 -6.47463118e-01 -6.19304648e-01]
 [-5.41757807e-01 -9.19690617e-01 -6.58162264e-01 -6.29538483e-01]
 [-5.44633685e-01 -9.24572722e-01 -6.61656067e-01 -6.32880338e-01]
 [ 3.03056591e-01  5.14470304e-01  3.68172659e-01  3.52160660e-01]
 [ 1.36328009e-01  2.31431074e-01  1.65620043e-01  1.58417151e-01]
 [ 6.13262513e-02  1.04107734e-01  7.45030786e-02  7.12629052e-02]]


7. Now focus on the last term $\nabla_\hat{\mathbf{y}}(L)$:
$$\begin{align*}\nabla_\hat{\mathbf{y}}(L) &=\begin{bmatrix}\nabla_{\hat{y}_0}(L)\\\nabla_{\hat{y}_1}(L)\\\nabla_{\hat{y}_2}(L)\end{bmatrix} = \begin{bmatrix}-y_0/\hat{y}_0\\-y_1/\hat{y}_2\\-y_0/\hat{y}_2.\end{bmatrix}\end{align*}$$

In [34]:
def gradient_y(y, y_hat):
    grad_y = -y / y_hat
    return grad_y

grad_y = gradient_y(y_encoded, y_hat)
grad_y


array([[  -0.       , -307.2975   ,   -0.       ],
       [   0.       ,    1.0078824,    0.       ],
       [  -0.       ,   -4.005253 ,   -0.       ]], dtype=float32)

8. Now focus on the second to last term $\nabla_\mathbf{z}(\mathbf{a})$:
$$\begin{align*}\nabla_\mathbf{z}(\mathbf{a}) &= \nabla_\mathbf{z}\left(\begin{bmatrix}a_0\\a_1\\a_2\end{bmatrix}\right)\\ &= \begin{bmatrix}\nabla_\mathbf{z}(a_0)&\nabla_\mathbf{z}(a_1)&\nabla_\mathbf{z}(a_2)\end{bmatrix} \\&= \begin{bmatrix}\nabla_{z_0}(a_0)&\nabla_{z_0}(a_1)&\nabla_{z_0}(a_2)\\\nabla_{z_1}(a_0)&\nabla_{z_1}(a_1)&\nabla_{z_1}(a_2)\\\nabla_{z_2}(a_0)&\nabla_{z_2}(a_1)&\nabla_{z_2}(a_2)\end{bmatrix}\\&=\begin{bmatrix}a_0(1-a_0)&-a_1a_0&-a_2a_0\\-a_0a_1&a_1(1-a_1)&-a_1a_1\\-a_0a_2&-a_1a_2&a_2(1-a_2)\end{bmatrix}.\end{align*}$$

In [35]:
import numpy as np

def gradient_z(a):
    diag_a = np.diag(a)
    return diag_a - np.outer(a, a)

a_0 = np.random.rand()
a_1 = np.random.rand()  
a_2 = np.random.rand()

a = np.array([a_0, a_1, a_2])
grad_z = gradient_z(a)
grad_z


array([[ 0.24334719, -0.12014334, -0.1270957 ],
       [-0.12014334,  0.16390844, -0.04514758],
       [-0.1270957 , -0.04514758,  0.17078079]])

In [36]:
## Step-1: add the bias feature to all the samples

X = np.reshape(X, (X.shape[0], 1))
X_bias = np.hstack((np.ones((X.shape[0], 1)), X))


In [37]:
import numpy as np

# Step-2: initialize the entries of the weights matrix randomly
initialize_weights = lambda num_features, num_labels: np.random.randn(num_labels, num_features)

num_features = 4
num_labels = 3
W = initialize_weights(num_features, num_labels)
print(W)


[[ 0.91021523 -1.00386618 -0.49760738 -0.97989465]
 [ 0.10668154  0.53053453 -1.27946603  1.16003424]
 [-0.47184029  0.76798295 -1.36321658 -1.08615807]]


In [38]:
# Step-3: create softmax layer object softmax
class SoftmaxLayer:
    def __init__(self):
        pass
    
    def forward(self, z):
        exp_z = np.exp(z)
        self.output = exp_z / np.sum(exp_z)
    
    def backward(self, y):
        return self.output - y

softmax = SoftmaxLayer()


In [40]:
def cross_entropy_loss(y_true, y_pred):
  return -np.sum(y_true * np.log(y_pred))

# Start the loop over each sample
for i in range(X.shape[1]):
  # Calculate the raw scores by multiplying the weights with the sample
  z = np.dot(W, X[:, i])
  
  # Apply the softmax activation function
  softmax.forward(z)
  
  # Calculate the cross-entropy loss for the sample
  def cross_entropy_loss(y_true, y_pred):
    return -np.sum(y_true * np.log(y_pred))
  # Print the loss for the sample
  print(f"Loss for sample {i}: {loss}")

  # Calculate the gradient of the loss with respect to the input of the softmax layer
  grad_z = softmax.backward(y)
  
  # Print the gradient for the sample
  print(f"Gradient for sample {i}: {grad_z}")

Loss for sample 0: 4.33989799221477
Gradient for sample 0: [-0.88190171 -0.26915383 -0.84894446]
