In [45]:
import numpy as np

In [48]:
from abc import ABC, abstractmethod
import numpy as np

class Activations(ABC):
    """Interface every activation function in this notebook implements.

    Concrete subclasses must provide both ``__call__`` and ``compute_grad``;
    the class itself cannot be instantiated.
    """

    @abstractmethod
    def __call__(self, x):
        """Apply the activation to ``x`` and return the result."""

    @abstractmethod
    def compute_grad(self):
        """Return the derivative of the activation (takes no arguments)."""

class ReLU(Activations):
    """Rectified linear unit: negative entries are clamped to zero."""

    def __call__(self, x):
        """Return ``np.maximum(0, x)`` and cache it for ``compute_grad``."""
        rectified = np.maximum(0, x)
        self.activated = rectified
        return rectified

    def compute_grad(self):
        """Return 1 where the cached activation is positive, 0 elsewhere.

        Reads ``self.activated``, so ``__call__`` must have run first.
        """
        is_active = self.activated > 0
        return np.where(is_active, 1, 0)

In [49]:
class Layer():
    """Fully connected layer computing ``activation(W.T @ x + b)``.

    ``W`` has shape ``(input_size, output_size)`` and ``b`` has shape
    ``(output_size, 1)``; both are drawn with ``np.random.random``.
    """

    def __init__(self, input_size, output_size, activation):
        # W is drawn before b, so the global RNG is consumed in a fixed order.
        weight_shape = (input_size, output_size)
        bias_shape = (output_size, 1)
        self.W = np.random.random(weight_shape)
        self.b = np.random.random(bias_shape)
        self.activation = activation
        # Forward-pass cache; stays at integer 0 until the layer is called.
        self.x = 0
        self.z = 0
        self.a = 0
        # Backward-pass slots; nothing inside this class writes to them.
        self.delta = 0
        self.grad_local = 0

    def __call__(self, x):
        """Run the forward pass on ``x``, caching ``x``, ``z`` and ``a``."""
        self.x = x
        weighted_sum = self.W.T @ x
        self.z = weighted_sum + self.b
        activated = self.activation(self.z)
        self.a = activated
        return activated

In [50]:
import numpy as np
from abc import ABC, abstractmethod

class Loss(ABC):
    """Base class for losses that also drive backpropagation through a model.

    Subclasses implement ``__call__`` (the loss value) and ``dloss`` (the value
    that seeds the last layer's delta). ``backward`` walks ``model.layers``
    from last to first and stores ``delta`` and ``grad_local`` on each layer.
    """

    def __init__(self, model):
        # ``backward`` and ``__repr__`` read ``model.layers``.
        self.model = model

    @abstractmethod
    def __call__(self, input, output):
        """Compute the loss value for the given pair of arrays."""

    @abstractmethod
    def dloss(self):
        """Return the loss derivative that ``compute_delta`` uses for the last layer."""

    def compute_delta(self, layers, l):
        """Return the delta of layer ``l``.

        Last layer: ``dloss()`` times that layer's activation gradient.
        Any other layer: ``W`` of layer ``l + 1`` times the delta of layer
        ``l + 1``, multiplied element-wise by layer ``l``'s activation gradient.
        The delta of layer ``l + 1`` must therefore already be stored.
        """
        if l == len(layers) - 1:
            return self.dloss() * layers[l].activation.compute_grad()
        else:
            return (layers[l + 1].W @ layers[l + 1].delta) * layers[l].activation.compute_grad()

    def compute_grad_local(self, layers, l):
        """Return ``delta @ input.T`` for layer ``l``.

        The input is ``layers[0].x`` for the first layer and the previous
        layer's activation ``a`` for every other layer.
        """
        if l == 0:
            return layers[l].delta @ layers[l].x.T
        else:
            return layers[l].delta @ layers[l - 1].a.T

    def backward(self):
        """Fill ``delta`` and ``grad_local`` on every layer of the model."""
        layers = self.model.layers
        L = len(layers) - 1

        # Walk from the last layer to the first: each delta depends on the
        # delta of the layer after it.
        for l in range(L, -1, -1):
            layers[l].delta = self.compute_delta(layers, l)
            layers[l].grad_local = self.compute_grad_local(layers, l)

    def __repr__(self) -> str:
        # np.shape also accepts the integer 0 placeholders a Layer holds before
        # backward() has run; a plain ``.shape`` raises AttributeError on them.
        info = [
            f'layer {l} delta: {np.shape(layer.delta)} local {np.shape(layer.grad_local)}\n'
            for l, layer in enumerate(self.model.layers)
        ]
        return f"{self.__class__.__name__}(\n{''.join(info)})"

In [51]:
class MSE(Loss):
    """Squared-error loss averaged over the second (batch) axis.

    NOTE(review): the sign of ``dloss`` assumes ``output`` is the model's
    prediction and ``input`` the reference it is compared against - confirm.
    """

    def __call__(self, input, output):
        """Return ``sum((output - input) ** 2) / m`` and cache the error.

        ``m`` is the size of ``input``'s second axis, so ``input`` must be 2-D.
        """
        self.error = output - input
        _, self.m = input.shape
        # Element-wise square. The previous ``np.sum(error @ error.T)`` also
        # added the products between *different* rows of the error, which is
        # not a squared error.
        return np.sum(self.error ** 2) / self.m

    def dloss(self):
        """Return the element-wise derivative of the loss w.r.t. ``output``.

        The result has the shape of the cached error, so every output entry
        gets its own gradient instead of one scalar shared by all of them.
        Requires a prior ``__call__``, which sets ``self.error`` and ``self.m``.
        """
        return 2 * self.error / self.m

In [59]:
class NeuralNetwork():
    """Sequential container that feeds an input through ``layers`` in order."""

    def __init__(self, layers):
        self.layers = layers

    def __call__(self, x):
        """Pass ``x`` through every layer and return the final result."""
        signal = x
        for layer in self.layers:
            signal = layer(signal)
        return signal

    def __repr__(self):
        """List each layer's weight shape, one layer per line."""
        body = ''
        for index, layer in enumerate(self.layers):
            body += f'   Layer {index}: {layer.W.shape} \n'
        return f"{self.__class__.__name__}(\n{body})"

    def get_layers(self):
        """Return the stored layer list itself (not a copy)."""
        return self.layers
    

# Seed NumPy's global RNG so the weights and data below are identical on every
# Restart & Run All.
np.random.seed(0)

layers = [Layer(784, 4, ReLU()),
          Layer(4, 5, ReLU()),
          Layer(5, 6, ReLU()),
          Layer(6, 7, ReLU()),
          Layer(7, 2, ReLU())]

# Alternative autoencoder-shaped stack (784 -> ... -> 2 -> ... -> 784):
# layers = [Layer(784, 128,  ReLU()),
#           Layer(128, 64, ReLU()),
#           Layer(64, 2,   ReLU()),
#           Layer(2, 64,   ReLU()),
#           Layer(64, 128, ReLU()),
#           Layer(128, 784,  ReLU())]

nn = NeuralNetwork(layers)

# (nfeatures, batch)
input = np.random.random((784, 1))
# The target has to match the network's output, whose row count is the last
# layer's output_size (W is stored as (input_size, output_size)).
output = np.random.random((layers[-1].W.shape[1], 1))

l2 = MSE(nn)
prediction = nn(input)
# Evaluate the loss once: MSE.dloss() reads the error cached by this call, so
# l2.backward() cannot run without it.
# NOTE(review): argument order (reference, prediction) is assumed - confirm.
l2(output, prediction)
prediction

array([[8630.76760173],
       [8980.3885862 ]])

In [53]:
# Backpropagate through nn: stores delta and grad_local on every layer.
# Requires a prior l2(...) call, because MSE.dloss() reads the cached self.error.
l2.backward()

In [54]:
def _print_layer_shapes(index, layer):
    """Print the forward-pass and gradient array shapes of one layer."""
    weights, inputs = layer.W, layer.x
    print(f'l= {index}')
    print(f'x shape {inputs.shape}')
    print(f'W shape {weights.shape}')
    product_shape = (weights.T @ inputs).shape
    print(f'W.T @, x.shape {(weights.T.shape, inputs.shape)} = {product_shape}')
    print(f'B shape {layer.b.shape}')
    print(f'a shape {layer.a.shape}')
    print(f'z shape {layer.z.shape}')
    print('GRADS')
    print(f'local {layer.grad_local.shape}')
    # Blank line after each layer's report.
    print(f'delta {layer.delta.shape}', end='\n\n')


for l, layer in enumerate(nn.layers):
    _print_layer_shapes(l, layer)


l= 0
x shape (784, 4)
W shape (784, 128)
W.T @, x.shape ((128, 784), (784, 4)) = (128, 4)
B shape (128, 1)
a shape (128, 4)
z shape (128, 4)
GRADS
local (128, 784)
delta (128, 4)

l= 1
x shape (128, 4)
W shape (128, 64)
W.T @, x.shape ((64, 128), (128, 4)) = (64, 4)
B shape (64, 1)
a shape (64, 4)
z shape (64, 4)
GRADS
local (64, 128)
delta (64, 4)

l= 2
x shape (64, 4)
W shape (64, 2)
W.T @, x.shape ((2, 64), (64, 4)) = (2, 4)
B shape (2, 1)
a shape (2, 4)
z shape (2, 4)
GRADS
local (2, 64)
delta (2, 4)

l= 3
x shape (2, 4)
W shape (2, 64)
W.T @, x.shape ((64, 2), (2, 4)) = (64, 4)
B shape (64, 1)
a shape (64, 4)
z shape (64, 4)
GRADS
local (64, 2)
delta (64, 4)

l= 4
x shape (64, 4)
W shape (64, 128)
W.T @, x.shape ((128, 64), (64, 4)) = (128, 4)
B shape (128, 1)
a shape (128, 4)
z shape (128, 4)
GRADS
local (128, 64)
delta (128, 4)

l= 5
x shape (128, 4)
W shape (128, 784)
W.T @, x.shape ((784, 128), (128, 4)) = (784, 4)
B shape (784, 1)
a shape (784, 4)
z shape (784, 4)
GRADS
local