# Foundations
---

In [1]:
import numpy as np

## Matrix multiplication from scratch

In [6]:
a = np.random.randn(32, 784)
b = np.random.randn(784, 100)

In [12]:
def matmul(a, b):
    ar, ac = a.shape
    br, bc = b.shape
    assert(ac == br)
    c = np.zeros((ar, bc))
    for i in range(ar):
        for j in range(bc):
            for k in range(ac):
                c[i, j] += a[i, k] * b[k, j]
    return c

In [13]:
c = matmul(a, b)
c.shape

(32, 100)

In [15]:
%timeit matmul(a, b)

1.48 s ± 50.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
%timeit a@b

67.7 µs ± 3.12 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


**Matrix multiplication with element-wise multiplication**

In [21]:
def matmul(a, b):
    ar, ac = a.shape
    br, bc = b.shape
    assert(ac == br)
    c = np.zeros((ar, bc))
    for i in range(ar):
        for j in range(bc):
            c[i, j] = (a[i] * b[:, i]).sum()
    return c

In [22]:
%timeit matmul(a, b)

16.4 ms ± 190 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


**Matrix multiplication with broadcasting**

In [32]:
def matmul(a,b):
    ar, ac = a.shape
    br, bc = b.shape
    assert(ac == br)
    c = np.zeros((ar, bc))
    for i in range(ar):
        c[i] = (a[i][:, None] * b).sum(0)
    return c

In [33]:
%timeit matmul(a, b)

3.2 ms ± 138 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Linear layers

In [63]:
def lin(x, w, b):
    return x@w + b

In [45]:
def relu(x):
    return x.clip(0)

In [77]:
x = np.random.randn(5, 28*28)
y = np.random.randn(5)

w1 = np.random.randn(28*28, 100) / np.sqrt(2/(28*28))
b1 = np.zeros(100)
w2 = np.random.randn(100, 1) / np.sqrt(2/100)
b2 = np.zeros(1)

In [78]:
def model(x):
    l1 = lin(x, w1, b)
    l2 = relu(l1)
    l3 = lin(l2, w2, b2)
    return l3

In [79]:
def mse(pred, targ):
    res = pred.squeeze(-1) - targ
    return np.power(res, 2).mean()

In [80]:
pred = model(x)

In [81]:
loss = mse(pred, y)
loss

5346.872755009423

## Define backward functions for each of the functions involved in the forward pass

In [92]:
def mse_grad(pred, targ):
    pred.grad = 2. * (pred.squeeze(-1) - targ)[:, None] / pred.shape[0]

In [93]:
def relu_grad(inp, out):
    inp.grad = out.grad * (inp > 0).astype(float)

In [96]:
def lin_grad(x, w, b, out):
    x.grad = out.grad @ w.transpose()
    w.grad = x.transpose() @ out.grad()
    b.grad = out.grad().sum(0)

In [117]:
def forward_and_backward(x, y):
    # forward
    l1 = lin(x, w1, b1)
    l2 = relu(l1)
    l3 = lin(l2, w2, b2)
    
    loss = mse(l3, y)
    
    # backward
    mse_grad(l3, y)
    lin_grad(l2, w2, b2, l3)
    relu_grad(l1, l2)
    lin_grad(x, w1, b1)

## Combine forward and backward functionalities into a class

In [114]:
class ReLu():
    def __call__(self, inp):
        self.inp = inp
        self.out = inp.clip(0)
        return self.out
    
    def backward(self):
        self.inp.grad = (self.inp > 0).astype(float) * self.out.grad

In [115]:
class Mse():
    def __call__(self, pred, targ):
        self.pred = pred
        self.targ = targ
        res = self.pred.squeeze(-1) - self.targ
        return np.power(res, 2).mean()
    
    def backward(self):
        self.pred.grad = 2. * (self.pred.squeeze(-1) - self.targ)[:, None] / self.pred.shape[0]

In [116]:
class Lin():
    def __init__(self, w, b):
        self.w = w
        self.b = b
    
    def __call__(self, x):
        self.inp = x
        self.out = self.inp @ self.w + self.b
        return self.out
    
    def backward(self):
        self.inp.grad = self.out.grad @ self.w.transpose()
        self.w.grad = self.inp.transpose() @ self.out.grad()
        self.b.grad = self.out.grad().sum(0)

In [123]:
class Model():
    def __init__(self, w1, w2, b1, b2):
        self.layers = [Lin(w1, b1), ReLu(), Lin(w2, b2)]
        self.loss = Mse()
        
    def __call__(self, x, y):
        for i in self.layers:
            x = i(x)
        x = self.loss(x, y)
        return x
    
    def backward(self, x):
        self.loss.backward()
        for i in reversed(self.layers):
            i.backward()

In [124]:
model = Model(w1, w2, b1, b2)

In [126]:
model(x, y)

4930.029257068936

In [127]:
model.backward()

TypeError: backward() missing 1 required positional argument: 'x'

## Refactor

In [128]:
class LayerFunction():
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out
    
    def forward(self): raise Exception('not implemented')
    def bwd(self): raise Exception('not implemented')
    def backward(self): self.bwd(self.out, *self.args)

In [131]:
class ReLu(LayerFunction):
    def forward(self, inp): return inp.clip(0)
    def bwd(self, out, inp): inp.grad = (inp > 0).astype(float) * self.out.grad

In [130]:
class Mse(LayerFunction):
    def forward(self, pred, targ): 
        res = pred.squeeze(-1) - self.targ
        return np.power(res, 2).mean()
    
    def bwd(self, pred, targ): 
        pred.grad = 2. * (pred.squeeze(-1) - targ)[:, None] / pred.shape[0]

In [132]:
class Lin(LayerFunction):
    def __init__(self, w, b): 
        self.w = w
        self.b = b
        
    def forward(self, x):
        return x@self.w + self.b
    
    def bwd(self, out, x):
        x.grad = out.grad @ self.w.transpose()
        self.w.grad = x.transpose() @ out.grad()
        self.b.grad = out.grad().sum(0)