In [None]:
%matplotlib inline


PyTorch: Control Flow + Weight Sharing
--------------------------------------

To showcase the power of PyTorch dynamic graphs, we will implement a very strange
model: a fully-connected ReLU network that on each forward pass randomly chooses
a number between 1 and 4 (inclusive) and has that many hidden layers. The first
hidden layer always comes from the input layer; the remaining 0 to 3 innermost
hidden layers are computed by reusing the same weights multiple times.



In [1]:
import random
import torch


class DynamicNet(torch.nn.Module):
    """Fully-connected ReLU network whose depth is re-drawn on every forward pass."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear layers that forward() wires together.

        D_in, H and D_out are the input, hidden and output feature sizes.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Map x to a prediction through a randomly chosen number of hidden layers.

        The input layer always produces the first hidden representation. After
        that, a count of 0, 1, 2, or 3 is drawn with random.randint and the single
        middle_linear Module is applied that many times, each followed by a ReLU
        (written here as a clamp at zero).

        Because the computation graph is rebuilt on every call, ordinary Python
        control flow such as this loop can decide the shape of the network, and
        the same Module may safely appear several times in one graph (in Lua
        Torch each Module could be used only once).
        """
        hidden = torch.clamp(self.input_linear(x), min=0)

        # Draw how many extra shared-weight layers this particular pass uses.
        extra_layers = random.randint(0, 3)
        applied = 0
        while applied < extra_layers:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            applied += 1

        return self.output_linear(hidden)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' adds up the squared errors over all elements; it is the
# supported spelling of the deprecated size_average=False argument and gives
# the same loss value.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):
    # Forward pass: Compute predicted y by passing x to the model
    y_pred = model(x)

    # Compute and print loss
    loss = criterion(y_pred, y)
    print(t, loss.item())

    # Zero gradients, perform a backward pass, and update the weights.
    # Gradients are cleared first so this step only uses the current loss.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 674.745849609375
1 672.3202514648438
2 671.0899047851562
3 711.3137817382812
4 659.3146362304688
5 654.3379516601562
6 661.365234375
7 643.7911376953125
8 638.5135498046875
9 632.905029296875
10 650.254150390625
11 622.3875122070312
12 617.0642700195312
13 467.0130310058594
14 647.5877075195312
15 618.9353637695312
16 597.5647583007812
17 592.8633422851562
18 355.07818603515625
19 637.1373901367188
20 571.5284423828125
21 518.5073852539062
22 550.5350952148438
23 235.6942901611328
24 521.4791870117188
25 424.08636474609375
26 482.06683349609375
27 159.62496948242188
28 338.0915222167969
29 306.3524169921875
30 120.80313110351562
31 493.157470703125
32 333.17919921875
33 425.815185546875
34 381.61761474609375
35 135.83888244628906
36 323.771484375
37 308.4855041503906
38 246.97373962402344
39 107.70095825195312
40 86.30836486816406
41 163.20713806152344
42 321.5209655761719
43 68.96023559570312
44 396.7673034667969
45 583.14892578125
46 120.2828140258789
47 194.2571258544922
48 364.28