# In [None]:
from numpy import ndarray
from typing import List
import numpy as np

#Operations
class Operation:
    """Base class for a single forward/backward step.

    Subclasses provide ``_output`` (forward computation) and ``_input_grad``
    (gradient with respect to the input). This class stores the arrays seen
    during the forward pass and checks gradient shapes on the way back.
    """

    def __init__(self):
        pass

    def forward(self,
                input_: ndarray,
                inference: bool=False) -> ndarray:
        """Store ``input_``, compute the output via ``_output`` and return it.

        The ``inference`` flag is passed through unchanged to ``_output``.
        The result is also kept on ``self.output`` for use in ``backward``.
        """
        self.input_ = input_
        result = self._output(inference)
        self.output = result
        return result

    def backward(self, output_grad: ndarray) -> ndarray:
        """Turn the gradient w.r.t. the output into one w.r.t. the input.

        Asserts that ``output_grad`` matches the stored output's shape and
        that the computed gradient matches the stored input's shape. The
        gradient is kept on ``self.input_grad`` and returned.
        """
        # Incoming gradient must line up with what forward() produced.
        assert self.output.shape == output_grad.shape

        grad = self._input_grad(output_grad)
        self.input_grad = grad

        # Outgoing gradient must line up with what forward() received.
        assert self.input_.shape == grad.shape

        return grad

    def _output(self, inference: bool) -> ndarray:
        """Forward computation; must be overridden by subclasses."""
        raise NotImplementedError()

    def _input_grad(self, output_grad: ndarray) -> ndarray:
        """Input-gradient computation; must be overridden by subclasses."""
        raise NotImplementedError()

class ParamOperation(Operation):
    """An Operation that additionally holds a parameter array in ``param``.

    Subclasses implement ``_param_grad`` on top of the usual ``_output`` and
    ``_input_grad`` hooks.
    """

    def __init__(self, param: ndarray):
        super().__init__()
        # Kept on the instance so subclasses can read it in their hooks.
        self.param = param

    def backward(self, output_grad: ndarray) -> ndarray:
        """Compute and store both the input gradient and the param gradient.

        Asserts that ``output_grad`` matches the stored output's shape, that
        the input gradient matches the input's shape, and that the parameter
        gradient matches the parameter's shape. Only the input gradient is
        returned; the parameter gradient is left on ``self.param_grad``.
        """
        assert self.output.shape == output_grad.shape

        # Input gradient is stored before the parameter gradient is computed.
        grad_wrt_input = self._input_grad(output_grad)
        self.input_grad = grad_wrt_input
        grad_wrt_param = self._param_grad(output_grad)
        self.param_grad = grad_wrt_param

        assert self.input_.shape == grad_wrt_input.shape
        assert self.param.shape == grad_wrt_param.shape

        return grad_wrt_input

    def _param_grad(self, output_grad: ndarray) -> ndarray:
        """Parameter-gradient computation; must be overridden by subclasses."""
        raise NotImplementedError()

class WeightMultiply(ParamOperation):
    """Matrix-multiplies the stored input by the weight array ``W``."""

    def __init__(self, W: ndarray):
        # The weights become self.param on the parent class.
        super().__init__(W)

    def _output(self, inference: bool) -> ndarray:
        """Return ``input_ @ param``; ``inference`` is not used here."""
        return np.matmul(self.input_, self.param)

    def _input_grad(self, output_grad: ndarray) -> ndarray:
        """Return ``output_grad`` matrix-multiplied by the transposed weights."""
        weights_t = self.param.transpose()
        return np.matmul(output_grad, weights_t)

    def _param_grad(self, output_grad: ndarray)  -> ndarray:
        """Return the transposed input matrix-multiplied by ``output_grad``."""
        input_t = self.input_.transpose()
        return np.matmul(input_t, output_grad)

class BiasAdd(ParamOperation):
    """Adds the bias array ``B`` (first dimension of size 1) to the input."""

    def __init__(self, B: ndarray):
        # The bias must have a leading dimension of exactly 1.
        assert B.shape[0] == 1
        super().__init__(B)

    def _output(self, inference: bool) -> ndarray:
        """Return ``input_ + param``; ``inference`` is not used here."""
        return self.input_ + self.param

    def _input_grad(self, output_grad: ndarray) -> ndarray:
        """Return ``output_grad`` scaled by an all-ones array shaped like the input."""
        unit_like_input = np.ones_like(self.input_)
        return unit_like_input * output_grad

    def _param_grad(self, output_grad: ndarray) -> ndarray:
        """Sum the gradient over axis 0 and return it with a leading dimension of 1."""
        unit_like_param = np.ones_like(self.param)
        per_row_grad = unit_like_param * output_grad
        width = per_row_grad.shape[1]
        column_totals = np.sum(per_row_grad, axis=0)
        return column_totals.reshape(1, width)


class Linear(Operation):
    """Identity operation: passes values and gradients through untouched."""

    def __init__(self) -> None:
        super().__init__()

    def _output(self, inference: bool) -> ndarray:
        """Return the stored input as-is; ``inference`` is not used here."""
        passthrough = self.input_
        return passthrough

    def _input_grad(self, output_grad: ndarray) -> ndarray:
        """Return ``output_grad`` as-is."""
        passthrough_grad = output_grad
        return passthrough_grad


class Sigmoid(Operation):
    """Element-wise logistic function ``1 / (1 + exp(-x))``."""

    def __init__(self) -> None:
        super().__init__()

    def _output(self, inference: bool) -> ndarray:
        """Apply the logistic function to the stored input.

        ``inference`` is not used here.
        """
        denominator = 1.0 + np.exp(-1.0 * self.input_)
        return 1.0 / denominator

    def _input_grad(self, output_grad: ndarray) -> ndarray:
        """Scale ``output_grad`` by ``output * (1 - output)``."""
        # The derivative is expressed through the cached forward output.
        local_slope = self.output * (1.0 - self.output)
        return local_slope * output_grad


class Tanh(Operation):
    """Element-wise hyperbolic tangent."""

    def __init__(self) -> None:
        super().__init__()

    def _output(self, inference: bool) -> ndarray:
        """Apply ``np.tanh`` to the stored input; ``inference`` is not used here."""
        activated = np.tanh(self.input_)
        return activated

    def _input_grad(self, output_grad: ndarray) -> ndarray:
        """Scale ``output_grad`` by ``1 - output**2``."""
        # The derivative is expressed through the cached forward output.
        squared = self.output * self.output
        return output_grad * (1 - squared)


#we have to define Dropout again, so it refers to the new Operation class
class Dropout(Operation):
    """Dropout: randomly zeroes input elements during training.

    In training mode each element is kept with probability ``keep_prob``
    (multiplied by a freshly drawn 0/1 mask). In inference mode no mask is
    drawn; the input is scaled by ``keep_prob`` instead.
    """

    def __init__(self,
                 keep_prob: float = 0.8):
        super().__init__()
        self.keep_prob = keep_prob

    def _output(self, inference: bool) -> ndarray:
        """Apply dropout (training) or ``keep_prob`` scaling (inference).

        The mode is remembered so that ``_input_grad`` differentiates the
        forward pass that actually ran.
        """
        self._inference = inference
        if inference:
            # No mask at inference: scale the input by keep_prob.
            return self.input_ * self.keep_prob

        # binomial(1, p) yields an array of 0s and 1s, with 1 drawn with
        # probability keep_prob.
        self.mask = np.random.binomial(1, self.keep_prob,
                                       size=self.input_.shape)
        return self.input_ * self.mask

    def _input_grad(self, output_grad: ndarray) -> ndarray:
        """Return the gradient w.r.t. the input for the latest forward pass.

        After a training pass the gradient flows only through the kept
        elements, so it is ``output_grad * mask``. After an inference pass the
        forward was a plain scaling, so it is ``output_grad * keep_prob``.
        Previously the mask was used unconditionally, which raised
        AttributeError when no training pass had run and otherwise reused a
        stale mask unrelated to the current output.
        """
        # getattr keeps working for instances whose mask was set without
        # going through _output().
        if getattr(self, "_inference", False):
            return output_grad * self.keep_prob
        return output_grad * self.mask
