In [1]:
# Loss functions
# Now that we know how to perform a forward pass, how do we determine how wrong our network is?

# Calculating network error with loss function.
# Our goal is to train/teach/optimize our model over time. 

# In order to do this, we need to tweak the parameters. But how do we decide what 
# parameters to tweak and how much to modify them?

# To do this, we need to calculate how much error the model has.
# The function that performs this calculation is called a loss function, also referred to as a cost function.

# Ideally, we want the loss (the result of the loss function) to be 0.

# We don't use the bare argmax of the prediction to measure error, because argmax only tells us
# which class was picked; what we actually want to measure is how confident the model is.

In [2]:
# Categorical cross-entropy loss

# In linear regression, we use mean squared error loss.
# Since we are performing classification in this problem, we would like to use a
# different measure for calculating the error of our network using a probability distribution.

# For this we will use the cross-entropy loss function.
# Cross-entropy loss compares a ground-truth probability distribution to a predicted probability distribution.

# The cross-entropy loss function is the most common loss function used with the softmax activation function.

"""
    L = - sum[j=1, j=num_classes](y*log(y_pred))
    # since the y for all other classes is zero
    then L = -y*log(y_pred)
    # but what then happens to class zero?
"""

# L[i] denotes the loss of a single sample i.
# Why is it called cross-entropy and not log loss, which is another type of loss function?
# Log loss is what is applied to the output of a binary classifier in a logistic regression model.
# In our case, we are dealing with multiple classes and a probability distribution across those
# classes for each sample.

# The target probabilities are one-hot encoded, so each target is a vector with one entry per class.
# It is called one-hot encoding because only one value is "hot" (on, i.e. 1) and the
# rest are "cold" (off, i.e. 0).

'\n    L = - sum[j=1, j=num_classes](y*log(y_pred))\n    # since the y for all other classes is zero\n    then L = -y*log(y_pred)\n    # but what then happens to class zero?\n'

In [3]:
# Trying it out in code
import numpy as np

# Predictions: the confidence values for one sample, one entry per class (3 classes)
softmax_output = np.array([0.7, 0.1, 0.2])
# Ground truth: one-hot vector marking index 0 as the correct class
target_output = np.array([1, 0, 0])


In [4]:
# Categorical cross-entropy for this one sample: the one-hot target zeroes out every
# term except the log of the confidence at the correct class, leaving -log(0.7)
- np.sum(np.log(softmax_output) * target_output)

0.35667494393873245

In [6]:
import math

# Natural log of a few confidence values, one per line: 0 at a confidence of 1,
# and increasingly negative as the confidence gets smaller
print(*map(math.log, (1, 0.95, 0.2, 0.1)), sep="\n")


0.0
-0.05129329438755058
-1.6094379124341003
-2.3025850929940455


In [7]:
# It can be observed that the logs of these values are all negative, except for the log of 1, which is 0.
# That is why we put a negative sign in front of the cross-entropy loss function: it flips the sign,
# so that the loss gets higher the lower our prediction confidence is for the target class.
# Once our prediction confidence for the target class is 1, which means that our model is fully confident,
# the loss value is zero.


In [None]:
# A log written without any subscript (base) here means the logarithm with base e (Euler's number).
# It is also known as the natural log; it is what np.log and single-argument math.log compute.

In [11]:
# Using the cross entropy loss function on a batch of data
# Each row holds the confidence values of one sample across the 3 classes
softmax_outputs = np.array([[0.7, 0.1, 0.2],
                            [0.1, 0.5, 0.4],
                            [0.02, 0.9, 0.08]])

# Sparse targets: the index of the correct class for each row above
class_targets = [0, 1, 1]

In [12]:
# Pair every row index with its target class index to pick out the confidence
# assigned to the correct class of each sample
print(softmax_outputs[range(len(softmax_outputs)), class_targets])

[0.7 0.5 0.9]


In [14]:
# Per-sample loss: negative log of the confidence at each sample's correct class
loss_list = -np.log(softmax_outputs[range(len(softmax_outputs)), class_targets])

In [15]:
# Finally, we reduce the per-sample losses of the batch to a single value by averaging them
# We will use the numpy mean function to do this
np.mean(loss_list)

0.38506088005216804

In [16]:
# A one-element tuple such as (4,) has length 1. The loss class below relies on this:
# len(targets.shape) == 1 identifies class-index targets, anything else is treated as one-hot
len((4,))

1

In [17]:
# This shows that we can make this work for both one-hot encoded targets and bare class-index targets.
# To avoid taking the log of zero, we will clip our predictions so that the smallest possible value is a very small positive number.

In [18]:
# A column of three values with shape (3, 1); np.mean without an axis argument
# reduces it to a single scalar
sample = np.array([1, 2, 3]).reshape(-1, 1)
np.mean(sample)

2.0

In [19]:
class Loss:
    """Base class for losses: reduces per-sample losses to one value for the batch."""

    def calculate(self, predictions, targets):
        """Return the mean of the losses produced by ``self.forward``.

        ``forward`` is not defined on this class; a subclass has to supply it.
        """
        per_sample = self.forward(predictions, targets)
        return np.mean(per_sample)

In [23]:
class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy computed per sample from predicted class probabilities."""

    def forward(self, predictions, targets):
        """Compute the negative log of the correct-class confidence for every sample.

        predictions: per-sample class confidences (rows are samples); values are
            clipped to [1e-7, 1 - 1e-7] before the log is taken.
        targets: when ``targets.shape`` has length 1 the entries are used as
            class indices; otherwise they are multiplied element-wise with the
            negative logs (one-hot rows).

        The result is stored on ``self.output`` and returned. With class-index
        targets it has one entry per sample; with one-hot targets the summed
        axis is kept (``keepdims=True``).
        """
        # Clipping keeps log() away from an input of exactly 0
        clipped = np.clip(predictions, 1e-7, 1 - 1e-7)
        neg_log = -np.log(clipped)

        if len(targets.shape) != 1:
            # One-hot rows: keep only the correct class of each row, then sum per row
            self.output = np.sum(neg_log * targets, axis=1, keepdims=True)
            return self.output

        # Class indices: pick column targets[i] from row i
        row_ids = range(len(neg_log))
        self.output = neg_log[row_ids, targets]
        return self.output

In [22]:
data = [[1, 2, 3], [4, 5, 6]]
# axis=1 takes the max within each row; keepdims=True keeps the reduced axis
# with length 1, so the result has one row per input row
np.max(data, axis=1, keepdims=True)

array([[3],
       [6]])

In [27]:
# class_targets is a plain list, so it is wrapped in an array here: forward() reads targets.shape
loss_function = Loss_CategoricalCrossEntropy()
loss_function.calculate(softmax_outputs, np.array(class_targets))

0.38506088005216804

In [28]:
# Reimplementing everything so far

class Layer_Dense:
    """Fully connected layer computing ``dot(X, weights) + biases``."""

    def __init__(self, n_inputs, n_outputs):
        """Create random weights of shape (n_inputs, n_outputs) and zero biases of shape (1, n_outputs)."""
        # Standard-normal draws scaled by 0.01 so that the initial weights are small
        self.weights = np.random.randn(n_inputs, n_outputs) * 0.01
        self.biases = np.zeros(shape=(1, n_outputs))

    def forward(self, X):
        """Compute the layer output for X, store it on ``self.output`` and return it."""
        weighted_inputs = np.dot(X, self.weights)
        self.output = weighted_inputs + self.biases
        return self.output

class Activation_ReLU:
    """Rectified linear unit activation: negative inputs become 0, the rest pass through."""

    def __init__(self):
        pass

    def forward(self, X):
        """Apply ReLU element-wise to X, store the result on ``self.output`` and return it."""
        # Element-wise maximum against 0 replaces every negative entry with 0
        self.output = np.maximum(0, X)
        # Return the result so this forward() behaves like the other layers' forward()
        return self.output

class Activation_Softmax:
    """Softmax activation: turns each row of scores into a probability distribution."""

    def forward(self, X):
        """Compute the row-wise softmax of X, store it on ``self.output`` and return it."""
        # Subtract each row's max first so the largest exponent is exp(0) = 1;
        # this protects against exploding exponent values
        row_max = np.max(X, axis=1, keepdims=True)
        exp_values = np.exp(X - row_max)
        # Exponentiation removes negative values; dividing by the row sum
        # turns each row into a probability distribution
        row_totals = np.sum(exp_values, axis=1, keepdims=True)
        self.output = exp_values / row_totals
        return self.output
        
        
class Loss:
    """Base class for losses: subclasses implement ``forward``; ``calculate`` averages its result."""

    def __init__(self):
        pass

    def forward(self, predictions, targets):
        """Per-sample loss hook; this base version does nothing and returns None."""
        pass

    def calculate(self, predictions, targets):
        """Return the mean of the per-sample losses produced by ``self.forward``."""
        losses = self.forward(predictions, targets)
        loss = np.mean(losses)
        return loss


class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy computed per sample from predicted class probabilities."""

    def forward(self, predictions, targets):
        """Compute the negative log of the correct-class confidence for every sample.

        predictions: per-sample class confidences (rows are samples); values are
            clipped to [1e-7, 1 - 1e-7] before the log is taken.
        targets: 1-D class indices, or 2-D one-hot rows. Lists are accepted as
            well as arrays.

        Returns the per-sample losses, which are also stored on ``self.output``.
        With class-index targets the result has one entry per sample; with
        one-hot targets the summed axis is kept (``keepdims=True``).
        """
        # Accept plain lists too: the branch below needs the number of dimensions
        targets = np.asarray(targets)

        # Clip first so that we never calculate the log of zero
        predictions = np.clip(predictions, 1e-7, 1 - 1e-7)
        negative_logs = -np.log(predictions)

        if targets.ndim == 1:
            # Class indices: pick column targets[i] from row i
            self.output = negative_logs[range(len(negative_logs)), targets]
        else:
            # One-hot rows: element-wise product keeps only the correct class
            # of each row, then the row is summed
            self.output = np.sum(negative_logs * targets, axis=1, keepdims=True)

        # calculate() averages this return value, so it must be returned here
        return self.output

In [39]:
# Re-display the batch of confidence values that the accuracy calculation uses
softmax_outputs

array([[0.7 , 0.1 , 0.2 ],
       [0.1 , 0.5 , 0.4 ],
       [0.02, 0.9 , 0.08]])

In [40]:
# Calculating accuracy
# The predicted class of each sample is the index of its highest confidence
predictions = np.argmax(softmax_outputs, axis=1)
# One-hot targets are 2-D; reduce them to class indices so they can be compared
# with the predictions. The number of dimensions is what has to be checked:
# .shape is a tuple and never equals the integer 2.
target_labels = np.asarray(class_targets)
if target_labels.ndim == 2:
    target_labels = np.argmax(target_labels, axis=1)
# Accuracy is the fraction of samples whose predicted class matches the target
np.mean(predictions == target_labels)

1.0

In [None]:
# Now we have learnt how to do a forward pass through our model and how to measure
# its performance. Next, we will focus on how to optimize it.