# Backpropagation

![Two Layer Network Diagram](TwoLayerNetworkDiagram.png)

In [None]:
import torch

## Create fake input and output

These are just randomly generated inputs and outputs for the sake of computation.

In [None]:
# Gradient-descent step size (often written alpha or α)
learning_rate = 0.1

# How many training examples to generate
N = 100

# Layer widths taken from the diagram: inputs, hidden neurons, outputs
nx, n1, ny = 3, 3, 2

# Inputs: an (N, nx) matrix of standard-normal samples, one row per example
X = torch.randn((N, nx))

# Targets: an (N, ny) matrix of 0/1 labels stored as floats; each of the ny
# outputs is an independent binary classification
Y = torch.randint(0, 2, (N, ny)).to(torch.float)

## Manual version

### Create a simple model based on the diagram

In [None]:
def linear(A, W, b):
    """Apply the affine map of a fully connected layer.

    Computes ``A @ W.T + b``: ``A`` is matrix-multiplied by the transpose
    of ``W`` and ``b`` is then added to the product.
    """
    weighted_sum = torch.matmul(A, W.T)
    return weighted_sum + b


def sigmoid(Z):
    """Return the element-wise logistic function ``1 / (1 + exp(-Z))``."""
    denominator = 1 + torch.exp(-Z)
    return 1 / denominator


class Manual2Layer:
    """A two-layer sigmoid network with hand-written backpropagation.

    Typical use is ``Yhat = model(X)``, then ``model.bce_loss(Yhat, Y)``,
    then ``model.backward()`` and ``model.update()``. ``backward`` relies on
    the tensors cached by the two preceding calls.
    """

    def __init__(self, nx, n1, ny, learning_rate):
        """Create randomly initialized parameters.

        Args:
            nx: Number of inputs per example.
            n1: Number of hidden neurons.
            ny: Number of outputs per example.
            learning_rate: Step size used by ``update``.
        """
        # n0 and n2 are aliases for nx and ny
        self.n0 = nx
        self.n1 = n1
        self.n2 = ny

        self.learning_rate = learning_rate

        # Layer 1 parameters
        self.W1 = torch.randn(n1, nx)
        self.b1 = torch.randn(n1)

        # Layer 2 parameters
        self.W2 = torch.randn(ny, n1)
        self.b2 = torch.randn(ny)

    def __call__(self, X):
        """Forward propagation; returns the network output for ``X``.

        The input and both activations are cached on the instance because
        ``backward`` needs them.
        """
        # A0 is just an alias for the input, X. It is stored so backward()
        # differentiates with respect to the batch that was actually fed in
        # rather than a module-level variable.
        self.A0 = X

        # Forward propagation
        Z1 = linear(self.A0, self.W1, self.b1)
        self.A1 = sigmoid(Z1)

        Z2 = linear(self.A1, self.W2, self.b2)
        self.A2 = sigmoid(Z2)

        # A2 is just an alias for the output, Yhat
        Yhat = self.A2

        return Yhat

    def bce_loss(self, Yhat, Y):
        """Score predictions ``Yhat`` against targets ``Y``.

        Returns ``mean(Y * log(Yhat) + (1 - Y) * log(1 - Yhat))`` and caches
        both arguments for ``backward``.

        NOTE(review): there is no leading minus sign, so this value is the
        negative of the conventional binary cross-entropy. ``update``
        subtracts the gradient of this quantity, which pushes it further
        below zero. Automatic2Layer negates PyTorch's BCE to match, so the
        two models agree with each other -- confirm the sign convention is
        the intended one before changing either.
        """
        self.Yhat = Yhat
        self.Y = Y
        return torch.mean(Y * torch.log(Yhat) + (1 - Y) * torch.log(1 - Yhat))

    def backward(self):
        """Compute the gradients for all parameters.

        Must be called after ``__call__`` and ``bce_loss``. Stores the
        results in ``dW1``, ``db1``, ``dW2`` and ``db2``.
        """
        # torch.mean in bce_loss averages over every element of Y, so the
        # gradient is divided by the number of outputs here and by the number
        # of examples below. Both are read from the cached batch (assumed to
        # be 2-D: examples x outputs) instead of being hard-coded.
        num_examples, num_outputs = self.Y.shape

        # Compute gradients for W^[2] and b^[2]
        dL_dY = (self.Y / self.Yhat - (1 - self.Y) / (1 - self.Yhat)) / num_outputs
        dY_dZ2 = self.Yhat * (1 - self.Yhat)

        dZ2 = dL_dY * dY_dZ2

        self.dW2 = (1 / num_examples) * dZ2.T @ self.A1
        self.db2 = dZ2.mean(dim=0)

        # Compute gradients for W^[1] and b^[1]
        dZ1 = dZ2 @ self.W2 * (self.A1 * (1 - self.A1))

        self.dW1 = (1 / num_examples) * dZ1.T @ self.A0
        self.db1 = dZ1.mean(dim=0)

    def update(self):
        """Update the parameter values in place using the stored gradients."""
        self.W1 -= self.learning_rate * self.dW1
        self.b1 -= self.learning_rate * self.db1
        self.W2 -= self.learning_rate * self.dW2
        self.b2 -= self.learning_rate * self.db2

### Test the model

In [None]:
# Instantiate the hand-written network with the sizes chosen above
model = Manual2Layer(nx=nx, n1=n1, ny=ny, learning_rate=learning_rate)

# Run the untrained model on the inputs and score the model's initial output
Yhat = model(X)
bce_loss = model.bce_loss(Yhat=Yhat, Y=Y)
print(f"Manual loss before training: {bce_loss.item():0.4f}")

### Update parameters

In [None]:
# One training step: backpropagate, then apply the stored gradients
model.backward()
model.update()

# Evaluate again with the updated parameters to see how the loss moved
Yhat = model(X)
bce_loss = model.bce_loss(Yhat=Yhat, Y=Y)
print(
    f"Manual loss after one step: {bce_loss.item():0.4f}",
    "The loss should be lower. (You can execute this cell multiple times.)",
    sep="\n",
)

## Automatic version (using PyTorch)

### Forward and backward propagation using PyTorch

Copy the parameters from the manual model above, but configure them to use automatic differentiation.

In [None]:
class Automatic2Layer:
    """Two-layer sigmoid network whose gradients come from PyTorch autograd.

    This model copies parameters from the manual version for the sake of
    comparisons.
    """

    def __init__(self, manual_model):
        """Copy ``learning_rate``, ``W1``, ``b1``, ``W2`` and ``b2``.

        Each tensor is cloned and detached from *manual_model*, then marked
        with ``requires_grad_(True)`` so autograd tracks it.
        """
        self.learning_rate = manual_model.learning_rate

        # Layer 1 parameters
        self.W1 = manual_model.W1.clone().detach().requires_grad_(True)
        self.b1 = manual_model.b1.clone().detach().requires_grad_(True)

        # Layer 2 parameters
        self.W2 = manual_model.W2.clone().detach().requires_grad_(True)
        self.b2 = manual_model.b2.clone().detach().requires_grad_(True)

    def _parameters(self):
        """Return the trainable tensors in a fixed order."""
        return (self.W1, self.b1, self.W2, self.b2)

    def __call__(self, X):
        """Forward propagation using PyTorch's built-in linear and sigmoid."""
        A0 = X
        Z1 = torch.nn.functional.linear(A0, self.W1, self.b1)
        A1 = torch.sigmoid(Z1)

        Z2 = torch.nn.functional.linear(A1, self.W2, self.b2)
        A2 = torch.sigmoid(Z2)
        Yhat = A2

        return Yhat

    def bce_loss(self, Yhat, Y):
        """Score predictions ``Yhat`` against targets ``Y``.

        Stores the result in ``self.loss`` (used by ``backward``) and
        returns it.

        NOTE(review): the result of ``binary_cross_entropy`` is negated, so
        this is the negative of the usual BCE. That matches the sign of
        Manual2Layer.bce_loss, which the gradient comparison depends on --
        confirm the sign convention before changing either class.
        """
        self.loss = -torch.nn.functional.binary_cross_entropy(Yhat, Y)
        return self.loss

    def backward(self):
        """Compute the gradients for all parameters via autograd.

        Must be called after ``bce_loss``; results land in each
        parameter's ``.grad``.
        """
        # Autograd adds into .grad on every backward pass. Clear stale values
        # first so each call yields the gradient of the current loss only,
        # just as the manual model overwrites its gradients.
        for param in self._parameters():
            param.grad = None
        self.loss.backward()

    def update(self):
        """Update the parameter values in place using the stored gradients."""
        # In-place changes to leaf tensors that require grad are rejected by
        # autograd unless gradient tracking is switched off.
        with torch.no_grad():
            for param in self._parameters():
                param.sub_(self.learning_rate * param.grad)

In [None]:
# Create the new model and compute its output
model2 = Automatic2Layer(model)
Yhat = model2(X)

# Keep the loss returned by the automatic model so the printed value really
# comes from model2 rather than from the manual model's earlier result
bce_loss2 = model2.bce_loss(Yhat, Y)
print(f"Automatic loss before training: {bce_loss2.item():0.4f}")
print("Compare this loss to the manually computed version above")

# Compute the gradients only; update() is not called here, so the parameters
# stay equal to the ones copied from the manual model
model2.backward()

## Compare computed gradients

We shouldn't compare floating-point numbers using "==" since results can differ slightly based on the order of operations; `torch.allclose` checks that they agree within a small tolerance instead.

You might get different results if you've run the cells above out-of-order.

In [None]:
# The automatic model copied its parameters AFTER the manual model had been
# updated, so the manual gradients must be recomputed before comparing.
model.backward()

# Pair each hand-computed gradient with the autograd result for the same
# parameter (layer 2 first, then layer 1) and require them to agree
gradient_pairs = (
    (model.dW2, model2.W2.grad),
    (model.db2, model2.b2.grad),
    (model.dW1, model2.W1.grad),
    (model.db1, model2.b1.grad),
)
for manual_grad, automatic_grad in gradient_pairs:
    assert torch.allclose(manual_grad, automatic_grad)

## Suggestions

Try

- Adding additional layers
- Changing the loss function
- Changing the activation function(s)