In [None]:
# Using torch as the underlying tensor library so that we can easily check if
# our gradients match the correct gradients.

!pip3 install torch
import torch
import numpy as np

See `autodiff.ipynb` for a general-purpose autodiff engine implementation.

This notebook is a study reference for an interview question that asks you to implement backprop for a feed-forward network. Here, we skip the autodiff engine and implement only what is needed to pass such an interview.

In [None]:
# Define the operations needed to construct a feed-forward network and its loss.
# Each operation returns its output and a `vjp` (vector-Jacobian product) function which, given the
# gradient w.r.t. that output, computes the backward pass through the same op.

def affine(x, w, b):
  """Affine (fully connected) layer: `y = x @ w + b`.

  Args:
    x: 2-D input; in this notebook the layout is (batch, in_features).
    w: 2-D weights, (in_features, out_features).
    b: bias that is broadcast-added to `x @ w`, stored either as
      (out_features,) or as a row vector (1, out_features).

  Returns:
    A pair `(y, vjp)`. `vjp(v)` takes the gradient of the loss w.r.t. `y`
    (same shape as `y`) and returns `(gx, gw, gb)`, the gradients w.r.t.
    `x`, `w` and `b`.
  """
  y = x @ w + b

  def vjp(v):
    # w.r.t. x: y = x @ w  =>  dL/dx = v @ w^T
    gx = v @ w.T

    # w.r.t. w: dL/dw = x^T @ v
    gw = x.T @ v

    # w.r.t. b: b was broadcast over the batch axis, so its gradient is the
    # sum of v over that axis. When b is stored as a row vector with the same
    # rank as v, keep the reduced axis so that gb has exactly b's shape
    # (otherwise a (1, out) bias would get an (out,) gradient).
    gb = v.sum(axis=0, keepdims=np.ndim(b) == np.ndim(v))

    return gx, gw, gb

  return y, vjp


def sigmoid(x):
  """Element-wise logistic sigmoid, `y = 1 / (1 + exp(-x))`.

  Args:
    x: NumPy array (or scalar) of pre-activations.

  Returns:
    A pair `(y, vjp)`. `vjp(v)` takes the gradient w.r.t. `y` and returns the
    gradient w.r.t. `x`, i.e. `v * sigmoid'(x)`.
  """
  # For very negative x, exp(-x) overflows to inf and 1/(1+inf) evaluates to
  # exactly 0.0, which is the correct limit, so the overflow warning is noise.
  with np.errstate(over='ignore'):
    y = 1/(1+np.exp(-x))

  def vjp(v):
    # sigmoid'(x) = e^x / (1 + e^x)^2 is an even function of x, so evaluate it
    # at -|x|. The exponential then lies in (0, 1] and can never overflow;
    # the direct form gives inf/inf = nan for large positive x.
    e = np.exp(-np.abs(x))
    dy = e/(1 + e)**2
    return v*dy

  return y, vjp


def relu(x):
  """Element-wise rectified linear unit, `y = max(x, 0)`.

  Args:
    x: NumPy array of pre-activations.

  Returns:
    A pair `(y, vjp)`. `vjp(v)` takes the gradient w.r.t. `y` and returns the
    gradient w.r.t. `x`.
  """
  y = np.clip(x, 0, None)

  def vjp(v):
    # Pass the gradient through wherever x >= 0, i.e. the subgradient at
    # x == 0 is taken to be 1. This is chosen to agree with the
    # `torch.clamp(x, min=0)` reference in `relu_torch`.
    # NOTE(review): `torch.relu` itself is understood to use 0 at x == 0;
    # confirm before swapping the reference implementation.
    passthrough = (x >= 0).astype(float)
    return v * passthrough

  return y, vjp


def square_error_loss(x, *, y):
  """Per-example squared error, `L = sum((x - y)**2)` over the last axis.

  Args:
    x: predictions, with features on the last axis.
    y: targets (keyword-only), broadcastable against `x`.

  Returns:
    A pair `(L, vjp)`. `L` has the shape of `x` with the last axis summed
    out. `vjp(v)` takes the gradient w.r.t. `L` (a scalar, or an array with
    `L`'s shape) and returns the gradient w.r.t. `x`.
  """
  diff = x - y
  L = (diff**2).sum(axis=-1)

  def vjp(v):
    # v has L's shape, i.e. the feature axis is missing. Re-add it as a
    # trailing axis so each example's upstream gradient scales its own row of
    # `diff`; a bare `v * diff` would broadcast v along the feature axis.
    return np.expand_dims(v, -1) * 2 * diff

  return L, vjp


def tensor_sum(x):
  """Reduce a tensor to a single value by summing all of its elements.

  Returns:
    A pair `(total, vjp)`. `vjp(v)` takes the gradient w.r.t. the sum and
    returns the gradient w.r.t. `x`.
  """
  total = x.sum()

  def vjp(v):
    # Every element contributes to the sum with weight 1, so the upstream
    # gradient is simply broadcast to x's shape.
    ones = np.ones_like(x)
    return v * ones

  return total, vjp

# Feed-forward network

In [None]:
# build a simple NN with toy data
# Architecture: affine -> sigmoid -> affine -> relu -> affine -> squared error, summed over the batch.

# data - shape == (batch, features)
# x: 2 examples with 3 input features; y: 2 examples with 2 target values.
x = np.array([[1,2,3], [4,5,6]], dtype=float)
y = np.array([[1, 0], [0, 1]], dtype=float)

# params
# w* are (in_features, out_features); b* are stored as (1, out_features) row vectors.
w1 = np.array([[1, 1], [-1, 1], [-2, 2]], dtype=float)
b1 = np.array([[0, 1]], dtype=float)
w2 = np.array([[.2, .5], [.5, -.5]], dtype=float)
b2 = np.array([[-1, .5]], dtype=float)
w3 = np.array([[.7, -.5], [-.2, .3]], dtype=float)
b3 = np.array([[.3, -.2]], dtype=float)
params = [w1, b1, w2, b2, w3, b3]

# build NN
# Forward pass. s* are pre-activations, h* are activations; each op also hands back
# the `*_vjp` closure that the backprop cell calls in reverse order.
h0 = x
s1, s1_vjp = affine(h0, w1, b1)
h1, h1_vjp = sigmoid(s1)
s2, s2_vjp = affine(h1, w2, b2)
h2, h2_vjp = relu(s2)
s3, s3_vjp = affine(h2, w3, b3)
Le, Le_vjp = square_error_loss(s3, y=y)  # per-example loss
L, L_vjp = tensor_sum(Le)  # total loss over the batch
L  # view the output

In [None]:
# perform backprop
# Walk the forward graph in reverse, feeding each op's vjp the gradient of L w.r.t. that op's output.
# `g` always holds the gradient w.r.t. the activation currently being propagated.
g = L_vjp(1)  # seed: dL/dL = 1
g = Le_vjp(g)
g, g_w3, g_b3 = s3_vjp(g)
g = h2_vjp(g)
g, g_w2, g_b2 = s2_vjp(g)
g = h1_vjp(g)
g, g_w1, g_b1 = s1_vjp(g)  # after this line g is the gradient w.r.t. the input h0

# Same order as `params` so the two lists can be zipped together.
my_grads = [g_w1, g_b1, g_w2, g_b2, g_w3, g_b3]
my_grads  # view our grads

# Test

In [None]:
def sigmoid_torch(x):
  """Torch counterpart of `sigmoid`: the forward pass only.

  Returns `(y, None)` so call sites have the same `(output, vjp)` shape as the
  NumPy ops; no vjp is supplied here.
  """
  y = 1 / (1 + torch.exp(-x))
  return y, None


def relu_torch(x):
  """Torch counterpart of `relu`: the forward pass only, via `clamp(min=0)`.

  Returns `(y, None)` so call sites have the same `(output, vjp)` shape as the
  NumPy ops; no vjp is supplied here.
  """
  # Alternative forward with the same values: torch.maximum(x, torch.zeros_like(x))
  y = x.clamp(min=0)
  return y, None

In [None]:
# Torch copies of the parameters, with gradient tracking enabled so autograd can differentiate L w.r.t. them.
w1t = torch.tensor(w1, requires_grad=True)
b1t = torch.tensor(b1, requires_grad=True)
w2t = torch.tensor(w2, requires_grad=True)
b2t = torch.tensor(b2, requires_grad=True)
w3t = torch.tensor(w3, requires_grad=True)
b3t = torch.tensor(b3, requires_grad=True)
torch_params = [w1t, b1t, w2t, b2t, w3t, b3t]  # same order as `params` / `my_grads`

# Same forward pass as the NumPy network, now on torch tensors; the returned vjps are discarded.
# NOTE: this rebinds h0, s1, h1, s2, h2, s3, Le and L to torch tensors, replacing the NumPy values of the same names.
h0 = torch.tensor(x)
s1, _ = affine(h0, w1t, b1t)
h1, _ = sigmoid_torch(s1)
s2, _ = affine(h1, w2t, b2t)
h2, _ = relu_torch(s2)
s3, _ = affine(h2, w3t, b3t)
Le, _ = square_error_loss(s3, y=torch.tensor(y))
L, _ = tensor_sum(Le)
L  # view the output

In [None]:
# Compare to gradients calculated by torch.
# This is for debugging purposes.

def get_torch_grads(target, params):
  """Backpropagate `target` with torch autograd and return d(target)/d(param).

  Args:
    target: torch tensor to differentiate; seeded with a gradient of ones.
    params: iterable of leaf tensors created with `requires_grad=True`.

  Returns:
    A list with one entry per parameter, in the same order: a copy of its
    `.grad`, or None if the parameter has no gradient.
  """
  # zero out previous cum gradients
  for p in params:
    if p.grad is not None:
      p.grad.zero_()
  # update cum gradients; retain_graph lets this be called again on the same graph
  target.backward(torch.ones_like(target), retain_graph=True)
  # Return copies rather than the live `.grad` tensors: those are zeroed and
  # accumulated into in place by the next call, which would silently rewrite
  # results returned earlier.
  return [None if p.grad is None else p.grad.clone() for p in params]


# compare to torch grads
torch_grads = get_torch_grads(L, torch_params)
matches = [torch.allclose(torch.tensor(my_g), tc_g) for my_g, tc_g in zip(my_grads, torch_grads)]
print('matches:', matches)
# If all are True then we've succeeded.
# Fail loudly as well, so that "Restart & Run All" surfaces a gradient mismatch instead of just printing it.
assert all(matches), 'hand-written backprop gradients do not match torch autograd'