In [119]:
import numpy as np
from typing import List
from collections.abc import Callable
import operator as op

import math
from collections import deque, defaultdict

In [120]:
class Function:
    '''Bundle a callable together with the callable that evaluates its derivative.

    Attributes:
        func: the function itself.
        deriv: the derivative of ``func``.
    '''

    def __init__(self, func: Callable, deriv: Callable):
        # Both callables are stored untouched; nothing is evaluated here.
        self.func, self.deriv = func, deriv

def softmax(x: np.array):
    '''Softmax of `x`, normalised over every element of the array.

    The maximum is subtracted before exponentiating so that the largest
    exponent is exp(0) = 1 and large inputs cannot overflow.
    '''
    shifted_exp = np.exp(x - np.max(x))
    return shifted_exp / shifted_exp.sum()

def sigmoid(x: np.ndarray):
    '''Element-wise logistic function 1 / (1 + exp(-x)).

    Evaluated in a numerically stable, fully vectorised form: only
    exp(-|x|) is ever computed, which lies in (0, 1], so large negative
    inputs cannot overflow (a plain ``math.exp(-x)`` raises OverflowError
    for x below roughly -709). Empty arrays are supported as well.

    Args:
        x: array-like of real numbers (any shape); converted to float.

    Returns:
        ndarray of floats with the same shape as `x`.
    '''
    x = np.asarray(x, dtype=float)
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same value, no overflow.
    return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

def dsigmoid(x: np.ndarray):
    '''Element-wise derivative of the logistic function: s(x) * (1 - s(x)).

    The sigmoid is evaluated once and reused instead of being computed
    twice for the same input.
    '''
    s = sigmoid(x)
    return s * (1 - s)

def SMCE_loss(y: np.ndarray, y_pred: np.ndarray):
    '''Softmax cross-entropy loss: -sum(y * log(softmax(y_pred))).

    The log-softmax is computed directly with the log-sum-exp trick rather
    than as log(softmax(...)). Taking the log of a softmax output that has
    underflowed to 0 gives -inf, and 0 * -inf then turns the loss into NaN
    even when that class carries no target mass.

    Args:
        y: target weights per class (e.g. a one-hot vector).
        y_pred: raw logits, same shape as `y`.

    Returns:
        The scalar loss.
    '''
    shifted = y_pred - np.max(y_pred)
    # log softmax_i = shifted_i - log(sum_j exp(shifted_j)); the sum is >= 1, so the log is finite.
    log_softmax = shifted - np.log(np.sum(np.exp(shifted)))
    return -np.sum(y * log_softmax)

def dSMCE_loss(y: np.array, y_pred: np.array):
    '''Derivative of the softmax cross-entropy loss: softmax(y_pred) - y.'''
    probabilities = softmax(y_pred)
    return probabilities - y

def softmax_cross_entropy(y: np.array):
    '''Build the softmax cross-entropy `Function` for a fixed target `y`.

    The returned object's `func` maps logits to the loss and its `deriv`
    maps logits to the loss derivative, both with `y` already bound.
    '''
    def loss_for_target(y_pred):
        return SMCE_loss(y, y_pred)

    def grad_for_target(y_pred):
        return dSMCE_loss(y, y_pred)

    return Function(loss_for_target, grad_for_target)

In [121]:
class Layer:
    '''Fully connected layer computing ``activation(x @ W + b)`` for one sample.

    Attributes:
        W: (n, m) weight matrix, initialised uniformly in [0, 1).
        b: (m,) bias vector, initialised to zeros.
        activation: object exposing `func` (the activation) and `deriv`
            (its element-wise derivative).
        x, z, s: values cached by the latest `_forward` call -- the input,
            the pre-activation and the activation derivative at `z`.
    '''

    def __init__(self, n: int, m: int, activation: "Function"):
        self.W = np.random.rand(n, m)
        self.b = np.zeros(m)
        self.activation = activation

        self.x = None
        self.z = None
        self.s = None

    @property
    def adjoint(self) -> np.ndarray:
        '''Transpose of the current weights.

        Derived from `W` on every access, so it cannot go stale when `W`
        is rebound to a new array (a transpose cached once in `__init__`
        would keep pointing at the old weights).
        '''
        return self.W.T

    def _forward(self, x: np.ndarray) -> np.ndarray:
        '''Return f(xW + b) and cache what `_backward` needs.'''
        self.x = x
        self.z = x @ self.W + self.b
        # cache df/dz
        self.s = self.activation.deriv(self.z)
        return self.activation.func(self.z)

    def _backward(self, upstream: np.ndarray) -> tuple:
        '''Backpropagate `upstream` (dL/df, length m) through this layer.

        Returns:
            (dx, dW, db): gradients of the loss with respect to the layer
            input, the weights and the bias.

        Raises:
            RuntimeError: if `_forward` has not been called yet.
        '''
        if self.x is None or self.s is None:
            raise RuntimeError('Layer._backward() called before Layer._forward()')
        # dL/dz = dL/df hadamard df/dz; (1xm) H (1xm)
        delta = upstream * self.s
        # dL/dx = dL/dz @ dz/dx = delta @ W^T
        dx = delta @ self.adjoint
        # dL/db = dL/dz @ dz/db = dL/dz @ I_m
        db = delta
        # dL/dW[i, j] = x[i] * delta[j]  (outer product of input and delta)
        dW = np.outer(self.x, delta)
        return dx, dW, db

In [122]:
class Model:
    '''Sequential stack of layers trained one sample at a time with plain SGD.

    Attributes:
        layers: the layers, applied in order by `forward`.
        gradient: `[dW, db]` pairs from the latest `backward`, stored from
            the LAST layer to the first.
        logits: output of the latest `forward` call.
        loss: loss object (with `func` and `deriv`) set by `train`.
    '''

    def __init__(self, layers: List["Layer"] = None):
        self.layers = [] if layers is None else layers
        self.gradient = []
        self.logits = None
        self.loss = None

    def forward(self, x: np.ndarray) -> np.ndarray:
        '''forward pass: feed `x` through every layer and remember the result'''
        h = x
        for layer in self.layers:
            h = layer._forward(h)
        self.logits = h
        return self.logits

    def backward(self):
        '''calc gradient via backpropagation

        Raises:
            RuntimeError: if no forward pass has run or no loss has been
                set yet (both are done by `train`).
        '''
        if self.loss is None or self.logits is None:
            raise RuntimeError(
                'Model.backward() needs a forward pass and a loss; call train() first'
            )
        self.gradient = []
        # dL/df (1xn gradient)
        upstream = self.loss.deriv(self.logits)
        for layer in reversed(self.layers):
            dx, dW, db = layer._backward(upstream)
            self.gradient.append([dW, db])
            # this layer's input gradient is the upstream gradient of the layer before it
            upstream = dx

    def step(self, lr: float):
        '''Apply one gradient-descent update; lr = learning rate'''
        # `gradient` is ordered last layer first, hence the reversed layers.
        for layer, (dW, db) in zip(reversed(self.layers), self.gradient):
            # Updated in place so existing references/views of W and b see the new values.
            layer.W -= dW * lr
            layer.b -= db * lr

    def train(self, X: np.ndarray, Y: np.ndarray, loss_func: Callable, lr: float = 0.01, verbose: bool = False):
        '''Run one forward/backward/update cycle on a single sample.

        Args:
            X: just a single sample.
            Y: the target for that sample.
            loss_func: factory called as `loss_func(Y)`; it must return an
                object with `func(prediction)` and `deriv(prediction)`.
            lr: learning rate passed to `step`.
            verbose: print the sample, prediction, softmax and gradient.

        Returns:
            The loss of the prediction made BEFORE the weights were updated.
        '''
        Y_pred = self.forward(X)
        self.loss = loss_func(Y)
        loss = self.loss.func(Y_pred)
        self.backward()
        self.step(lr)

        if verbose:
            print(f'X: {X}, Y: {Y}')
            print(f'Predicted Y: {Y_pred}')
            print(f'Softmax: {softmax(Y_pred)}')
            print(f'Gradient: {self.gradient}')

        return loss

In [124]:
# Scaled-identity activation (x/2 with derivative 1/2); not used further in this cell.
sa = Function(lambda x: x/2, lambda x: 1/2)
sigmoidf = Function(sigmoid, dsigmoid)
# NOTE(review): the output layer is also a sigmoid, so the logits fed to the
# softmax lie in (0, 1); with two classes the loss therefore cannot drop below
# log(1 + e^-1) ~= 0.313 -- confirm this is intended.
# NOTE(review): Layer draws its weights from NumPy's global RNG and no seed is
# set here, so every run starts from different weights.
layers = [Layer(2, 2, sigmoidf), Layer(2, 2, sigmoidf)]
model = Model(layers)


# XOR inputs; the target class for [a, b] is a ^ b (one-hot encoded below).
training = [[1,0], [0,1], [1,1], [0,0]]
# train
steps = 100000
lr = 0.01
# Report about ten times per run; max(1, ...) avoids a modulo-by-zero when steps < 10.
report_every = max(1, steps // 10)
for i in range(steps):
    avg_loss = 0
    for x in training:
        ans = np.array([0,0])
        ans[x[0] ^ x[1]] = 1
        # Average over however many samples there are instead of a hard-coded 4.
        avg_loss += model.train(np.array(x), ans, loss_func = softmax_cross_entropy, lr = lr) / len(training)
    if i % report_every == 0:
        print(f'iter {i}:')
        print(f'Loss: {avg_loss}')
        for layer in model.layers:
            print(layer.W)

iter 0:
Loss: 0.6935463524195231
[[0.6598274  0.89516147]
 [0.30639388 0.19657315]]
[[0.56537038 0.57399178]
 [0.94923346 0.72692567]]
iter 10000:
Loss: 0.6928576635889179
[[0.75162921 0.89251676]
 [0.49005319 0.14736567]]
[[0.36830138 0.81732841]
 [0.86498721 0.85585391]]
iter 20000:
Loss: 0.6835823595581908
[[1.50283865 0.98883192]
 [1.4822224  0.24803416]]
[[-0.25186493  1.55870496]
 [ 0.94503983  0.8575983 ]]
iter 30000:
Loss: 0.5864175345268844
[[4.33393123 1.42715722]
 [4.31085657 0.48266008]]
[[-2.8344788   4.15742743]
 [ 2.0718235   0.16556694]]
iter 40000:
Loss: 0.44646543245386616
[[5.99237493 2.70297   ]
 [5.91934214 2.68447719]]
[[-5.17279716  5.3646836 ]
 [ 5.50640819 -2.28932059]]
iter 50000:
Loss: 0.35131080971853357
[[6.66581385 4.20535851]
 [6.62887016 4.202488  ]]
[[-6.79978046  6.64814086]
 [ 7.40003969 -6.10723835]]
iter 60000:
Loss: 0.3343122137889377
[[6.99516204 4.78452821]
 [6.96899139 4.7820549 ]]
[[-7.67995715  7.50831778]
 [ 8.30960128 -7.57709693]]
iter 7000