In [2]:
import torch
import numpy as np
import dlc_practical_prologue as prologue

In [3]:
def sigma(x):
    """Activation function: element-wise hyperbolic tangent of tensor ``x``."""
    activated = x.tanh()
    return activated

def dsigma(x):
    """Derivative of the tanh activation, evaluated element-wise at ``x``.

    Uses the identity tanh'(x) = 1 / cosh(x)^2.

    The previous formulation, (1 - tanh(x))^2 * exp(2x), is algebraically
    equal but multiplies a factor that rounds to 0 by one that overflows to
    inf once x is large and positive, which yields NaN. Here an overflowing
    cosh(x)^2 simply becomes inf and the quotient is 0, the correct limit.

    Args:
        x: tensor of pre-activation values.

    Returns:
        Tensor of the same shape as ``x`` holding the derivative values.
    """
    return 1.0 / torch.cosh(x).pow(2)

In [6]:
def loss(v, t):
    """Sum of squared differences between prediction ``v`` and target ``t``.

    Returns a 0-dimensional tensor (the sum runs over every element).
    """
    residual = t - v
    return torch.sum(torch.square(residual))

def dloss(v, t):
    """Gradient of the squared-error ``loss`` with respect to the prediction ``v``.

    Element-wise this is ``2 * (v - t)``; the result has the broadcast shape
    of ``v`` and ``t``.
    """
    residual = v - t
    return 2 * residual

In [34]:
def forward_pass(w1, b1, w2, b2, x):
    """Run the two-layer network forward and keep every intermediate value.

    Each layer applies an affine map (``w @ input + b``) followed by the
    ``sigma`` activation. In this notebook ``x`` is passed either as a single
    column vector or as a matrix whose columns are samples.

    Args:
        w1, b1: weights and bias of the first (hidden) layer.
        w2, b2: weights and bias of the second (output) layer.
        x: network input.

    Returns:
        Tuple ``(x0, s1, x1, s2, x2)``: the input itself, the hidden
        pre-activation and activation, then the output pre-activation and
        activation.
    """
    hidden_pre = torch.matmul(w1, x) + b1
    hidden_act = sigma(hidden_pre)
    output_pre = torch.matmul(w2, hidden_act) + b2
    output_act = sigma(output_pre)
    return x, hidden_pre, hidden_act, output_pre, output_act

def backward_pass(w1, b1, w2, b2,
                  t,
                  x0, s1, x1, s2, x2,
                  dl_dw1, dl_db1, dl_dw2, dl_db2):
    """Back-propagate the squared-error loss and accumulate the gradients.

    The four ``dl_d*`` tensors are updated IN PLACE (gradients are added to
    whatever they already hold); nothing is returned. The deltas are reshaped
    to a single column, so the function handles one sample per call.

    Args:
        w1, b1, w2, b2: network parameters. Only ``w2`` is read here; the
            others are accepted but unused.
        t: target for this sample.
        x0, s1, x1, s2, x2: values produced by ``forward_pass``. ``x2`` is
            accepted but unused; the output activation is recomputed from
            ``s2``.
        dl_dw1, dl_db1, dl_dw2, dl_db2: gradient accumulators, modified in
            place.
    """
    # Output layer: dloss/ds2 = 2 * (sigma(s2) - t) * sigma'(s2)
    output_error = sigma(s2) - t
    delta_out = 2 * (output_error * dsigma(s2)).view(-1, 1)
    dl_dw2.add_(torch.mm(delta_out, x1.view(1, -1)))
    dl_db2.add_(delta_out)

    # Hidden layer: push the output delta back through w2, then through sigma'
    back_signal = torch.matmul(w2.t(), delta_out)
    delta_hidden = (back_signal * dsigma(s1)).view(-1, 1)
    dl_dw1.add_(torch.mm(delta_hidden, x0.view(1, -1)))
    dl_db1.add_(delta_hidden)

In [46]:
# ---- Data ----
train_input, train_target, test_input, test_target = prologue.load_data(False, True, True)
# Scale the targets in place so they stay strictly inside the range tanh can output
train_target.mul_(0.9)
test_target.mul_(0.9)

# ---- Parameters ----
# Fixed seed so the random initial weights are reproducible
np.random.seed(1)
EPS = 1e-6  # standard deviation of the initial weights
INPUT_SIZE = 784
HIDDEN_SIZE = 50
OUTPUT_SIZE = 10
# NOTE: the four draws below must stay in this order to reproduce the same initialization
w1 = torch.Tensor(np.random.normal(loc=0, scale=EPS, size=(HIDDEN_SIZE, INPUT_SIZE)))
b1 = torch.Tensor(np.random.normal(loc=0, scale=EPS, size=(HIDDEN_SIZE, 1)))
w2 = torch.Tensor(np.random.normal(loc=0, scale=EPS, size=(OUTPUT_SIZE, HIDDEN_SIZE)))
b2 = torch.Tensor(np.random.normal(loc=0, scale=EPS, size=(OUTPUT_SIZE, 1)))

# ---- Gradient accumulators (one per parameter, same shape) ----
dl_dw1 = torch.zeros(HIDDEN_SIZE, INPUT_SIZE)
dl_db1 = torch.zeros(HIDDEN_SIZE, 1)
dl_dw2 = torch.zeros(OUTPUT_SIZE, HIDDEN_SIZE)
dl_db2 = torch.zeros(OUTPUT_SIZE, 1)

# Parameters paired position-wise with their accumulators
parameters = (w1, b1, w2, b2)
gradients = (dl_dw1, dl_db1, dl_dw2, dl_db2)

# ---- Training: full-batch gradient descent ----
STEPS = 1000
PRINT_STEPS = 100
# Step size is 0.1 divided by the number of training samples
gamma = 0.1 / train_input.shape[0]
for step in range(STEPS):
    # Start every step from empty accumulators
    for grad in gradients:
        grad.zero_()

    # Accumulate the gradient over every training sample, one column vector at a time
    for x, t in zip(train_input, train_target):
        x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, x.view(-1, 1))
        backward_pass(w1, b1, w2, b2, t.view(-1, 1),
                      x0, s1, x1, s2, x2,
                      dl_dw1, dl_db1, dl_dw2, dl_db2)

    # Gradient step, applied in place to each parameter
    for param, grad in zip(parameters, gradients):
        param.sub_(gamma * grad)

    # Only report every PRINT_STEPS steps
    if step % PRINT_STEPS != 0:
        continue

    # Samples are fed as columns, so predictions are the arg-max over rows (dim 0),
    # while the true class is the arg-max along dim 1 of the target matrix.
    y_tr = forward_pass(w1, b1, w2, b2, train_input.t())[-1]
    pred_tr = y_tr.max(0)[1]
    real_tr = train_target.max(1)[1]
    error_tr = 100 * (pred_tr != real_tr).sum().item() / real_tr.shape[0]

    y_te = forward_pass(w1, b1, w2, b2, test_input.t())[-1]
    pred_te = y_te.max(0)[1]
    real_te = test_target.max(1)[1]
    error_te = 100 * (pred_te != real_te).sum().item() / real_te.shape[0]

    print(f'Step {step} -> Training error: {error_tr}. Test error: {error_te}')

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples
Step 0 -> Training error: 88.3. Test error: 90.1
Step 100 -> Training error: 17.9. Test error: 27.3
Step 200 -> Training error: 6.4. Test error: 17.4
Step 300 -> Training error: 3.0. Test error: 15.3
Step 400 -> Training error: 2.7. Test error: 16.9
Step 500 -> Training error: 1.2. Test error: 16.1
Step 600 -> Training error: 0.7. Test error: 16.4
Step 700 -> Training error: 0.6. Test error: 17.0
Step 800 -> Training error: 0.2. Test error: 17.0
Step 900 -> Training error: 0.1. Test error: 16.7
