<h2>Chapter - 9</h2>

<h4>Derivation of Categorical Cross-Entropy loss</h4>

<img src="derivationLoss.png" width="70%">

<h5>Here L<sub>i</sub> denotes the loss value of the i-th sample, i is the index of a sample in the set, j is the label/output index, y denotes the target
values and y-hat denotes the predicted values.</h5>
<h5>The derivative of this loss function with respect to its inputs (predicted values at the i-th sample,
since we are interested in a gradient with respect to the predicted values) equals the negative
ground-truth vector, divided by the vector of the predicted values (which is also the output vector
of the softmax function).</h5>

In [11]:
# Our Full Code

import numpy as np
import nnfs
from nnfs.datasets import spiral_data

nnfs.init()

# Dense Layer
class Layer_Dense:
    """Fully connected layer: output = inputs . weights + biases."""

    def __init__(self, n_inputs, n_neurons):
        """Create small random weights and zero biases.

        Args:
            n_inputs: number of features in each input sample.
            n_neurons: number of neurons (outputs) in this layer.
        """
        weight_shape = (n_inputs, n_neurons)
        # Scale the standard-normal draws down so the initial weights are small
        self.weights = 0.01 * np.random.randn(*weight_shape)
        # One bias per neuron, kept as a row so it broadcasts over samples
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Compute the layer output for `inputs` and store it in `self.output`."""
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

# ReLU activation function
class Activation_ReLU:
    """Rectified linear unit activation."""

    def forward(self, inputs):
        """Store max(0, x) for every element of `inputs` in `self.output`."""
        rectified = np.maximum(0, inputs)
        self.output = rectified

# Softmax Activation Function
class Activation_Softmax:
    """Softmax activation applied row-wise (one row per sample)."""

    def forward(self, inputs):
        """Turn each row of `inputs` into a probability distribution.

        The result is stored in `self.output`.
        """
        # Subtract each row's maximum before exponentiating so the largest
        # exponent is 0 and np.exp cannot overflow
        row_max = np.max(inputs, axis=1, keepdims=True)
        exponentials = np.exp(inputs - row_max)
        # Divide by the row totals so every row sums to 1
        row_totals = np.sum(exponentials, axis=1, keepdims=True)
        self.output = exponentials / row_totals

# Common Loss
class Loss:
    """Base class for losses; subclasses supply `forward(output, y)`."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses.

        Args:
            output: the model's predictions.
            y: the ground-truth targets.
        """
        # forward() belongs to the concrete loss (e.g. cross-entropy) and
        # yields one loss value per sample; the data loss is their average
        return np.mean(self.forward(output, y))


# Cross Entropy Loss:
class Loss_Categorical_Cross_Entropy(Loss):
    """Categorical cross-entropy loss."""

    def forward(self, y_pred, y_true):
        """Return the negative log of the confidence at the true class, per sample.

        Args:
            y_pred: predicted values, one row per sample.
            y_true: targets, either sparse class indices (1-D) or
                one-hot encoded rows.
        """
        # Keep predictions strictly inside (0, 1) so the log stays finite
        clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        if len(y_true.shape) != 1:
            # One-hot targets: the mask keeps only the true-class confidence
            confidences = np.sum(clipped * y_true, axis=1)
        else:
            # Sparse targets: pick column y_true[i] out of row i
            confidences = clipped[np.arange(len(clipped)), y_true]

        return -np.log(confidences)


X, y = spiral_data(samples=100, classes=3)

# Initialization
dense1 = Layer_Dense(2, 3)
activation1 = Activation_ReLU()

dense2 = Layer_Dense(3, 3)
activation2 = Activation_Softmax()

loss_function = Loss_Categorical_Cross_Entropy()

# Forward pass
dense1.forward(X)
activation1.forward(dense1.output)

dense2.forward(activation1.output)
activation2.forward(dense2.output)

print(activation2.output[:5])

loss = loss_function.calculate(activation2.output, y)
print("Avg Loss: ", loss)


# Accuracy
# argmax over the class axis gives the predicted class index of each sample
predictions = np.argmax(activation2.output, axis=1)

# One-hot targets are 2-D (samples, classes), so the class axis is axis=1.
# A 2-D array has no axis 2; asking for it raises an AxisError.
if len(y.shape) == 2:
    y = np.argmax(y, axis=1)

# The comparison yields booleans; in the mean True counts as 1 and False as 0
accuracy = np.mean(predictions == y)

print("Accuracy: ", accuracy)

[[0.33333334 0.33333334 0.33333334]
 [0.3333332  0.3333332  0.33333364]
 [0.3333329  0.33333293 0.3333342 ]
 [0.3333326  0.33333263 0.33333477]
 [0.33333233 0.3333324  0.33333528]]
Avg Loss:  1.0986104
Accuracy:  0.34


In [12]:
# Now we'll update our code for the loss function.
# Everything defined previously stays unchanged:

# ... Loss base class and other code.

# Cross Entropy Loss:
class Loss_Categorical_Cross_Entropy(Loss):
    """Categorical cross-entropy loss with forward and backward passes."""

    def forward(self, y_pred, y_true):
        """Return the negative log of the confidence at the true class, per sample.

        Args:
            y_pred: predicted values, one row per sample.
            y_true: targets, either sparse class indices (1-D) or
                one-hot encoded rows.
        """
        # Keep predictions strictly inside (0, 1) so the log stays finite
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # check if y_true is sparse or one-hot-coded
        if len(y_true.shape) == 1:
            correct_confidence = y_pred_clipped[range(
                len(y_pred_clipped)), y_true]
        else:
            correct_confidence = np.sum(y_pred_clipped * y_true, axis=1)

        # Losses
        neg_log = -np.log(correct_confidence)
        return neg_log

    # Adding backward pass
    def backward(self, dvalues, y_true):
        """Compute the gradient of the loss and store it in `self.dinputs`.

        Args:
            dvalues: the values the loss is differentiated with respect to
                (the predicted values), one row per sample.
            y_true: targets, either sparse class indices (1-D) or
                one-hot encoded rows.
        """
        # Number of samples in a batch
        samples = len(dvalues)
        # Number of labels, taken from the width of the first sample
        labels = len(dvalues[0])

        # Converting sparse to one-hot-vector
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Clip with the same bounds forward() uses. A prediction that
        # underflowed to exactly 0 would otherwise give -0 / 0 = nan
        # (non-target classes) or -1 / 0 = -inf (target class).
        dvalues_clipped = np.clip(dvalues, 1e-7, 1 - 1e-7)

        # Calculating gradients: negative ground truth over the predictions
        self.dinputs = -y_true / dvalues_clipped
        # Normalizing by the batch size so the gradient magnitude does not
        # depend on the number of samples
        self.dinputs = self.dinputs / samples


<h4>Derivation of Softmax Activation Function</h4>

<img src="softmaxFunction.png" width="70%">


After derivations

<img src="softmaxFinal.png" width="70%">


In [13]:
# Now we can write the code : The interesting part!

import numpy as np
# A single softmax output, reshaped into a column vector
softmax_output = np.array([0.7, 0.1, 0.2]).reshape(-1, 1)
print(softmax_output)

# Start with the left-hand term of the equation above

# Kronecker delta => 1 where both indices are equal, else 0 (the identity matrix)
print(np.identity(len(softmax_output)))

# Broadcasting the column against the identity leaves the outputs on the diagonal
print(softmax_output * np.identity(len(softmax_output)))


[[0.7]
 [0.1]
 [0.2]]
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
[[0.7 0.  0. ]
 [0.  0.1 0. ]
 [0.  0.  0.2]]


In [16]:
# The same diagonal matrix can be built directly from the flattened vector
# (np.diag on the 1-D values is equivalent to np.diagflat(softmax_output))

print(np.diag(softmax_output.ravel()))
# This is the first part (the left-hand term) of the equation above.

[[0.7 0.  0. ]
 [0.  0.1 0. ]
 [0.  0.  0.2]]


In [17]:
# Now the right-hand term: the column vector times its own transpose
print(softmax_output @ softmax_output.T)

[[0.49 0.07 0.14]
 [0.07 0.01 0.02]
 [0.14 0.02 0.04]]


In [18]:
# Final step: subtract the second matrix from the first to get the Jacobian matrix
print(np.diagflat(softmax_output) - softmax_output @ softmax_output.T)
# The result is the derivative of the softmax function


[[ 0.20999999 -0.07       -0.14      ]
 [-0.07        0.09       -0.02      ]
 [-0.14       -0.02        0.16      ]]


<img src="softmaxDerivativeDesc.png" width="60%">

In [19]:
# Now we'll add the backward pass to the Activation_Softmax class in our full code

# Softmax Activation Function
class Activation_Softmax:
    """Softmax activation with forward and backward passes."""

    # Forward Function
    def forward(self, inputs):
        """Turn each row of `inputs` into a probability distribution.

        The result is stored in `self.output`.
        """
        # Subtracting the row maximum keeps np.exp from overflowing
        expo_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        norm_values = expo_values / np.sum(expo_values, axis=1, keepdims=True)
        self.output = norm_values

    # Backward Function
    def backward(self, dvalues):
        """Compute the gradient with respect to the inputs into `self.dinputs`.

        Must be called after `forward`, because it reads `self.output`.

        Args:
            dvalues: gradient arriving from the next step, one row per sample.
        """
        dvalues = np.asarray(dvalues)

        # For one sample with output s and incoming gradient d, the Jacobian
        # is diagflat(s) - s.s^T, so
        #     jacobian . d = s * d - s * (s . d) = s * (d - s . d)
        # Evaluating that for all rows at once avoids building a Jacobian per
        # sample in a Python loop. It also keeps the result floating point
        # when dvalues holds integers (filling an empty_like(dvalues) buffer
        # would truncate it).
        weighted_sum = np.sum(self.output * dvalues, axis=1, keepdims=True)
        self.dinputs = self.output * (dvalues - weighted_sum)


<h3>The Final Step: Common Categorical Cross-Entropy loss and
Softmax activation derivative </h3>

<img src="commonLossAndActi.png"/>

In [None]:
# Code implementation