# A fully-connected ReLU network
A network with only one hidden layer, implemented in PyTorch

## project dependencies

In [22]:
# Code in file tensor/two_layer_net_tensor.py
import torch
import numpy as np

In [5]:
# All tensors in this notebook are created on this device (CPU here).
device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda")

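## Set up parameters and hyperparameters
### torch API used
1. `torch.randn` — create a tensor filled with random values drawn from a standard normal distribution
2. `h_relu.mm(w2)` — matrix multiplication
3. `h.clamp(min=0)` — clamp every element to be at least 0 (this is the ReLU)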

In [6]:
# Problem sizes: N = batch size, D_in = input dimension,
# H = hidden dimension, D_out = output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch and random targets.
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Random starting weights for the two layers.
w1 = torch.randn((D_in, H), device=device)
w2 = torch.randn((H, D_out), device=device)

# Step size for the gradient-descent update.
learning_rate = 1e-6

In [13]:
def forward(x_in, y_out=None):
    """Run the two-layer network forward and compute the squared-error loss.

    Args:
        x_in: Input batch; it is multiplied by the module-level weight ``w1``.
        y_out: Targets the prediction is compared against. When omitted, the
            module-level ``y`` is used instead.

    Returns:
        Tuple ``(loss, y_pred, h_relu, h)``: the summed squared error, the
        prediction, the hidden values after the ReLU, and the hidden values
        before the ReLU. The last three are the inputs ``backward`` expects.
    """
    # Use the arguments that were actually passed in. The previous version
    # ignored both parameters and always read the global x and y.
    target = y if y_out is None else y_out

    h = x_in.mm(w1)
    h_relu = h.clamp(min=0)  # ReLU
    y_pred = h_relu.mm(w2)

    # loss is a scalar tensor; loss.item() returns it as a Python number.
    loss = (y_pred - target).pow(2).sum()

    return loss, y_pred, h_relu, h

def backward(y_pred, h_relu, h):
    """Compute the gradients of the squared-error loss w.r.t. w1 and w2 by hand.

    Args:
        y_pred: Network output from the forward pass.
        h_relu: Hidden values after the ReLU.
        h: Hidden values before the ReLU.

    Returns:
        Tuple ``(grad_w1, grad_w2)``.

    Reads the module-level ``x``, ``y`` and ``w2``.
    """
    # Gradient of sum((y_pred - y)^2) with respect to y_pred.
    d_pred = (y_pred - y) * 2.0
    d_w2 = h_relu.t().mm(d_pred)

    # Go back through the second layer, then through the ReLU: positions
    # whose pre-activation was negative receive zero gradient.
    d_hidden = d_pred.mm(w2.t()).masked_fill(h < 0, 0)
    d_w1 = x.t().mm(d_hidden)

    return d_w1, d_w2

# Plain gradient descent for 500 steps, printing the loss at every step.
for t in range(500):
    loss, y_pred, h_relu, h = forward(x, y)
    print(t, loss.item())

    grad_w1, grad_w2 = backward(y_pred, h_relu, h)

    # Step against the gradient, modifying the weight tensors in place.
    w1.sub_(learning_rate * grad_w1)
    w2.sub_(learning_rate * grad_w2)

0 1.871203608061478e-06
1 1.8785796100928565e-06
2 1.8772682324197376e-06
3 1.8719372292252956e-06
4 1.8654804989637341e-06
5 1.8617549812915968e-06
6 1.867563696578145e-06
7 1.8672508304007351e-06
8 1.8645096133695915e-06
9 1.8575565263745375e-06
10 1.8467956124368357e-06
11 1.8485744703866658e-06
12 1.8425350845063804e-06
13 1.8384705526841572e-06
14 1.8395709275864647e-06
15 1.8307841855857987e-06
16 1.8296416328666965e-06
17 1.8339870848649298e-06
18 1.8323153199162334e-06
19 1.8313990040041972e-06
20 1.847232852014713e-06
21 1.8512205315346364e-06
22 1.8411442397336941e-06
23 1.827448386393371e-06
24 1.806582986318972e-06
25 1.796234982975875e-06
26 1.8024869632426999e-06
27 1.797962454475055e-06
28 1.795892785594333e-06
29 1.7877433720059344e-06
30 1.7825217355493805e-06
31 1.7944264527613996e-06
32 1.7984000351134455e-06
33 1.8046764580503805e-06
34 1.79800611022074e-06
35 1.796085143723758e-06
36 1.784595269782585e-06
37 1.7817997104430106e-06
38 1.7780847656467813e-06
39 1.782

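## autograd
1. Set `requires_grad=True` on a tensor so that autograd tracks operations on it and computes its gradient
2. Wrap code in `torch.no_grad()` to stop autograd from tracking it, e.g. during the parameter update
3. Zero the gradients after running the backward pass and applying the update, with `w1.grad.zero_()`; otherwise the next backward pass adds to them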

In [14]:
# Code in file autograd/two_layer_net_autograd.py
import torch

device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda")

# Problem sizes: N = batch size, D_in = input dimension,
# H = hidden dimension, D_out = output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch and random targets. These are plain data, so they are
# created without requires_grad.
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Random starting weights. requires_grad=True asks autograd to compute
# gradients for these tensors during the backward pass.
w1 = torch.randn((D_in, H), device=device, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, requires_grad=True)

# Step size for the gradient-descent update.
learning_rate = 1e-6
for t in range(500):
    # Forward pass. w1 and w2 have requires_grad=True, so PyTorch records
    # these operations in a computational graph and can derive the gradients
    # itself; no intermediate values need to be kept for a manual backward.
    y_pred = torch.clamp(x @ w1, min=0) @ w2

    # Summed squared error. loss is a scalar tensor; loss.item() returns it
    # as a Python number.
    loss = ((y_pred - y) ** 2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass. Afterwards w1.grad and w2.grad hold
    # the gradient of the loss with respect to w1 and w2.
    loss.backward()

    # The weight update only mutates w1 and w2 in place and must not be
    # recorded in the graph, so it runs inside torch.no_grad().
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Reset the gradients by hand now that they have been applied.
        w1.grad.zero_()
        w2.grad.zero_()

0 32561048.0
1 31525604.0
2 34617940.0
3 35333936.0
4 30086292.0
5 19945222.0
6 10826222.0
7 5306579.5
8 2736589.5
9 1614679.75
10 1104386.75
11 839129.3125
12 677808.3125
13 565815.75
14 480842.0
15 413018.375
16 357377.0625
17 310928.6875
18 271725.34375
19 238426.9375
20 210112.03125
21 185766.515625
22 164732.21875
23 146483.203125
24 130587.6484375
25 116675.125
26 104462.3203125
27 93709.734375
28 84218.234375
29 75822.984375
30 68375.84375
31 61767.72265625
32 55880.515625
33 50621.15625
34 45913.484375
35 41694.5703125
36 37905.8671875
37 34497.3515625
38 31424.947265625
39 28654.037109375
40 26151.349609375
41 23888.080078125
42 21839.908203125
43 19982.6953125
44 18298.2734375
45 16768.30859375
46 15378.0322265625
47 14113.125
48 12962.287109375
49 11912.8828125
50 10955.646484375
51 10081.625
52 9283.0908203125
53 8552.578125
54 7884.755859375
55 7273.1787109375
56 6712.4970703125
57 6198.2685546875
58 5726.85009765625
59 5294.14697265625
60 4896.4404296875
61 4530.675292968

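## Define our own backward
1. Define a class that extends `torch.autograd.Function`
2. Override `forward(ctx, x)`; it must save whatever backward needs (here the input `x`) on `ctx` with `ctx.save_for_backward`
3. Override `backward(ctx, grad_output)`; retrieve `x` from `ctx.saved_tensors`
4. Call the function through `apply` (e.g. `MyReLU.apply(...)`) in the forward pass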

In [17]:
# Code in file autograd/two_layer_net_custom_function.py
import torch

class MyReLU(torch.autograd.Function):
    """ReLU written as a custom autograd Function.

    A custom autograd Function is a subclass of torch.autograd.Function that
    supplies static ``forward`` and ``backward`` methods operating on Tensors.
    """

    @staticmethod
    def forward(ctx, x):
        """Return ``x`` with every negative entry replaced by zero.

        ``ctx`` is the context object for this call; the input is saved on it
        so that ``backward`` can retrieve it later.
        """
        ctx.save_for_backward(x)
        return torch.clamp(x, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient of the loss with respect to the forward input.

        ``grad_output`` is the gradient of the loss with respect to the value
        returned by ``forward``. It is passed through unchanged, except that
        positions where the saved input was negative are set to zero.
        """
        (saved_input,) = ctx.saved_tensors
        return grad_output.masked_fill(saved_input < 0, 0)


device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda")

# Problem sizes: N = batch size, D_in = input dimension,
# H = hidden dimension, D_out = output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch and random targets.
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Random starting weights, flagged so that autograd computes their gradients.
w1 = torch.randn((D_in, H), device=device).requires_grad_(True)
w2 = torch.randn((H, D_out), device=device).requires_grad_(True)

# Step size for the gradient-descent update.
learning_rate = 1e-6
for t in range(500):
    # Forward pass; the hidden layer goes through our own ReLU, which is
    # invoked via MyReLU.apply.
    y_pred = MyReLU.apply(x @ w1) @ w2

    # Summed squared error, printed at every step.
    loss = ((y_pred - y) ** 2).sum()
    print(t, loss.item())

    # Autograd runs the backward pass.
    loss.backward()

    with torch.no_grad():
        # Gradient-descent step, applied to the weights in place.
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # Reset the gradients by hand now that they have been applied.
        w1.grad.zero_()
        w2.grad.zero_()

0 27072992.0
1 22983770.0
2 24267158.0
3 27426404.0
4 29172010.0
5 26555398.0
6 19761596.0
7 12093381.0
8 6518920.5
9 3433390.0
10 1940608.5
11 1238647.0
12 891377.5625
13 699607.4375
14 578775.25
15 493089.5
16 427031.25
17 373475.125
18 328727.0
19 290725.90625
20 258092.359375
21 229853.984375
22 205287.875
23 183812.28125
24 164977.640625
25 148386.5625
26 133740.640625
27 120780.703125
28 109268.328125
29 99035.3359375
30 89902.03125
31 81733.96875
32 74412.078125
33 67832.625
34 61912.0390625
35 56574.34765625
36 51755.1796875
37 47399.67578125
38 43452.2421875
39 39870.359375
40 36617.29296875
41 33659.37890625
42 30968.599609375
43 28513.65234375
44 26273.7265625
45 24227.650390625
46 22357.7578125
47 20646.72265625
48 19079.12890625
49 17641.958984375
50 16324.041015625
51 15113.80859375
52 14001.2822265625
53 12978.232421875
54 12036.0595703125
55 11168.1103515625
56 10367.81640625
57 9629.708984375
58 8948.5966796875
59 8319.46875
60 7737.97900390625
61 7200.3232421875
62 67

## Use the `nn` package to simplify the forward pass


In [27]:
# Code in file nn/two_layer_net_nn.py
import torch

device = torch.device("cpu")
# To run on a GPU instead, use: device = torch.device("cuda")

# Problem sizes: N = batch size, D_in = input dimension,
# H = hidden dimension, D_out = output dimension.
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input batch and random targets.
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Describe the model as a sequence of layers. nn.Sequential is a Module that
# holds other Modules and applies them one after another. Each Linear Module
# applies a linear function to its input and owns the Tensors for its weight
# and bias.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)
# Move the constructed model to the chosen device.
model = model.to(device)

# The nn package also ships common loss functions; Mean Squared Error (MSE)
# is used here. reduction='sum' yields the *sum* of squared errors rather than
# the mean, which keeps this example consistent with the hand-written losses
# above. In practice the mean is more common and is selected with
# reduction='mean'.
loss_fn = torch.nn.MSELoss(reduction="sum")

learning_rate = 1e-4
for t in range(500):
    # Forward pass: run the input batch through the model.
    y_pred = model(x)

    # Compute the loss. loss.item() converts the tensor to a Python number,
    # which only works for a tensor holding a single element.
    loss = loss_fn(y_pred, y)
    print(t, loss.item(), type(loss))

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute the gradient of the loss with respect to every
    # learnable parameter of the model. The parameters of each Module are
    # stored in Tensors with requires_grad=True, so this one call covers all
    # of them.
    loss.backward()

    # Gradient-descent step. The update must not be recorded by autograd, so
    # it runs under torch.no_grad(). Inside that context the parameter can be
    # modified in place directly; going through the legacy `.data` attribute
    # is unnecessary and hides the modification from autograd's tracking.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad


0 665.5538940429688 <class 'torch.Tensor'>
rate tensor(0)
1 617.1970825195312 <class 'torch.Tensor'>
rate tensor(0)
2 575.0623779296875 <class 'torch.Tensor'>
rate tensor(0)
3 537.6188354492188 <class 'torch.Tensor'>
rate tensor(0)
4 504.3430480957031 <class 'torch.Tensor'>
rate tensor(0)
5 474.08880615234375 <class 'torch.Tensor'>
rate tensor(0)
6 446.5764465332031 <class 'torch.Tensor'>
rate tensor(0)
7 421.4475402832031 <class 'torch.Tensor'>
rate tensor(0)
8 398.20391845703125 <class 'torch.Tensor'>
rate tensor(0)
9 376.4761962890625 <class 'torch.Tensor'>
rate tensor(0)
10 356.02093505859375 <class 'torch.Tensor'>
rate tensor(0)
11 336.80340576171875 <class 'torch.Tensor'>
rate tensor(0)
12 318.67193603515625 <class 'torch.Tensor'>
rate tensor(0)
13 301.52606201171875 <class 'torch.Tensor'>
rate tensor(0)
14 285.2677917480469 <class 'torch.Tensor'>
rate tensor(0)
15 269.74267578125 <class 'torch.Tensor'>
rate tensor(0)
16 254.95773315429688 <class 'torch.Tensor'>
rate tensor(0)
17