In [1]:
import numpy as np

In [2]:
class Layer:
    def __init__(self):
        self.input = None
        self.output = None
        
    def forward(self, input):
        # return output
        pass
    
    def backward(self, learning_rate, output_gradient):
        # update parameters and return input gradient
        pass

In [3]:
class Dense(Layer):
    def __init__(self, input_size, output_size):
        self.w = np.random.randn(output_size, input_size)
        self.b = np.random.randn(output_size, 1)
    
    def forward(self, input):
        self.input = input
        return np.dot(self.w, self.input) + self.b
    
    def backward(self, learning_rate, output_gradient):
        # update parameters and return input gradient
        dw = np.dot(output_gradient, self.input.T)
        self.w -= learning_rate*dw
        self.b -= learning_rate * output_gradient
        return np.dot(self.w.T, output_gradient)

In [4]:
class Activation(Layer):
    def __init__(self, activation, Activation_derivative):
        self.activation = activation
        self.Activation_derivative = Activation_derivative
        
    def forward(self, input):
        self.input = input
        return self.activation(self.input)
    
    def backward(self, learning_rate, output_gradient):
        # return input gradient
        return output_gradient * self.Activation_derivative(self.input)

In [5]:
class Tanh(Activation):
    def __init__(self):
        def tanh(x):
            return np.tanh(x)
        
        def tanh_derivative(x):
            return np.tanh(x)**2
        
        super().__init__(tanh, tanh_derivative)

In [6]:
def MSE(predicted, original):
    return np.mean( (predicted - original)**2 )

def MSE_derivative(predicted, original):
    return 2*(predicted - original) / len(original)

### XOR Problem

In [7]:
X = np.reshape( np.array([ [0, 0], [1, 0], [0, 1], [1, 1] ]) , (4,2,1) )
Y = np.reshape( np.array([[0], [1], [1], [0]]) , (4, 1, 1) ) 

In [8]:
network = [
    Dense(2, 3),
    Tanh(),
    Dense(3, 1),
    Tanh()
]

In [9]:
epochs = 10000
learning_rate = .1

In [14]:
for e in range(epochs):
    error = 0
    for x, y in zip(X, Y):
        output = x
        for layer in network:
            output = layer.forward(output)
            
        error = MSE(y, output)
        
        grad = MSE_derivative(y, output)
        for layer in reversed(network):
            grad = layer.backward(learning_rate=learning_rate, output_gradient=grad)
    
    error /= len(X)
    
    if (e+1) % 500 == 0:
        print(f'epoch=> {e+1}/{epochs} ---> error={error}')

epoch=> 500/10000 ---> error=0.25
epoch=> 1000/10000 ---> error=0.25
epoch=> 1500/10000 ---> error=0.25
epoch=> 2000/10000 ---> error=0.25
epoch=> 2500/10000 ---> error=0.25
epoch=> 3000/10000 ---> error=0.25
epoch=> 3500/10000 ---> error=0.25
epoch=> 4000/10000 ---> error=0.25
epoch=> 4500/10000 ---> error=0.25
epoch=> 5000/10000 ---> error=0.25
epoch=> 5500/10000 ---> error=0.25
epoch=> 6000/10000 ---> error=0.25
epoch=> 6500/10000 ---> error=0.25
epoch=> 7000/10000 ---> error=0.25
epoch=> 7500/10000 ---> error=0.25
epoch=> 8000/10000 ---> error=0.25
epoch=> 8500/10000 ---> error=0.25
epoch=> 9000/10000 ---> error=0.25
epoch=> 9500/10000 ---> error=0.25
epoch=> 10000/10000 ---> error=0.25
