In [None]:
# Loss functions: abstract base class plus mean-squared-error and softmax cross-entropy losses.
class Loss:
    """Base class for losses computed from a prediction/target pair.

    Subclasses provide two hooks: ``_output`` (the loss value) and
    ``_input_grad`` (the gradient handed back by ``backward``).
    """

    def __init__(self):
        """Set up nothing; ``forward`` is what stores the inputs."""

    def forward(self, prediction: ndarray, target: ndarray) -> float:
        """Store both arrays on the instance and return ``self._output()``.

        ``prediction`` and ``target`` must have identical shapes; this is
        checked with an ``assert`` before anything is stored.
        """
        assert prediction.shape == target.shape

        self.prediction, self.target = prediction, target

        # The concrete loss value comes from the subclass hook.
        return self._output()

    def backward(self) -> ndarray:
        """Compute, store and return the result of ``self._input_grad()``.

        The gradient is kept on ``self.input_grad`` and is asserted to have
        the same shape as the stored prediction.
        """
        grad = self._input_grad()
        # Stored before the shape check, so it is set even if the check fails.
        self.input_grad = grad

        assert self.prediction.shape == grad.shape

        return grad

    def _output(self) -> float:
        """Hook: return the loss value. Must be overridden."""
        raise NotImplementedError()

    def _input_grad(self) -> ndarray:
        """Hook: return the gradient for ``backward``. Must be overridden."""
        raise NotImplementedError()

class MeanSquaredError(Loss):
    """Squared-error loss: all squared residuals summed, divided by ``shape[0]``."""

    def __init__(self):
        super().__init__()

    def _output(self) -> float:
        """Return sum((prediction - target)^2) / prediction.shape[0]."""
        residual = self.prediction - self.target
        n_rows = self.prediction.shape[0]
        return np.sum(np.power(residual, 2)) / n_rows

    def _input_grad(self) -> ndarray:
        """Return the derivative of ``_output`` with respect to the prediction."""
        residual = self.prediction - self.target
        n_rows = self.prediction.shape[0]
        return 2.0 * residual / n_rows

class SoftmaxCrossEntropy(Loss):
    """Softmax over axis 1 followed by a clipped cross-entropy loss.

    The prediction holds raw scores; ``_output`` turns each row into
    probabilities with ``softmax`` and caches the clipped result on
    ``self.softmax_preds`` for reuse by ``_input_grad``.
    """

    def __init__(self, eps: float = 1e-9):
        """Create the loss.

        Args:
            eps: Clipping margin. Softmax outputs are clipped to
                ``[eps, 1 - eps]`` before any logarithm is taken.
        """
        super().__init__()
        self.eps = eps

    def _output(self) -> float:
        """Return the summed cross-entropy divided by ``prediction.shape[0]``.

        The prediction must be at least 2-D because the softmax is taken
        along axis 1.
        """
        # Softmax applied to each row (observation).
        softmax_preds = self.softmax(self.prediction, axis=1)

        # Clip the probabilities to [eps, 1 - eps]; this is intended to keep
        # both log() calls below away from log(0), which would produce
        # infinite loss values.
        self.softmax_preds = np.clip(softmax_preds, self.eps, 1 - self.eps)

        # Element-wise loss: the target term plus the (1 - target) term.
        softmax_cross_entropy_loss = (
            -1.0 * self.target * np.log(self.softmax_preds)
            - (1.0 - self.target) * np.log(1 - self.softmax_preds)
        )

        # Sum over every element, averaged over the first dimension.
        return np.sum(softmax_cross_entropy_loss) / self.prediction.shape[0]

    def _input_grad(self) -> ndarray:
        """Return ``(softmax_preds - target) / prediction.shape[0]``.

        Reads ``self.softmax_preds``, which is only set by ``_output``, so the
        loss value has to be computed before the gradient is requested.
        """
        # NOTE(review): (p - t) is the gradient of the -target * log(p) term
        # with respect to the raw scores when each target row sums to 1; the
        # (1 - target) * log(1 - p) term used in _output is not reflected
        # here. Confirm this simplification is intended.
        return (self.softmax_preds - self.target) / self.prediction.shape[0]

    def softmax(self, x, axis=None):
        """Return the softmax of ``x`` along ``axis``.

        Args:
            x: Array-like of scores.
            axis: Axis to normalise over; ``None`` normalises over all
                elements.

        Returns:
            Array of the same shape as ``x`` whose entries sum to 1 along
            ``axis``.
        """
        scores = np.asanyarray(x)
        if scores.size:
            # Subtracting the per-axis maximum does not change the softmax
            # mathematically, but it keeps exp() from overflowing to inf
            # (which would turn the result into nan via inf / inf).
            scores = scores - np.max(scores, axis=axis, keepdims=True)
        exps = np.exp(scores)
        # keepdims so the normaliser broadcasts against exps when dividing.
        return exps / np.sum(exps, axis=axis, keepdims=True)

