**Dynamic Model** - A fully connected ReLU network that, on each forward pass, chooses a random number between 1 and 4 and uses that many hidden layers, reusing the same weights multiple times to compute the innermost hidden layers.

In [1]:
# Imports
import random

import torch
from torch.autograd import Variable

In [2]:
class DynamicNet(torch.nn.Module):
    """Fully connected ReLU network whose depth is re-drawn on every call.

    The network owns three linear layers. The middle one is applied a
    random number of times (0 to 3) per forward pass, so its weights are
    shared between all of the inner hidden layers.
    """

    def __init__(self, D_in, H, D_out):
        """
        Build the three nn.Linear layers that the forward pass relies on.

        D_in:  number of features of each input sample
        H:     number of features of the hidden representation
        D_out: number of features of each output sample
        """
        super(DynamicNet, self).__init__()
        linear = torch.nn.Linear
        # Creation order is kept (input, middle, output): it fixes both the
        # parameter ordering and the order of the random weight initialisation.
        self.input_linear = linear(D_in, H)
        self.middle_linear = linear(H, H)
        self.output_linear = linear(H, D_out)

    def forward(self, x):
        """
        Map x to a prediction through a randomly deep stack of hidden layers.

        The input layer is always applied. Then middle_linear is reused
        0, 1, 2 or 3 times (drawn with random.randint), each application
        followed by a ReLU implemented as clamp(min=0). Because the graph is
        rebuilt on every call, ordinary Python control flow decides the depth.
        """
        pre_activation = self.input_linear(x)
        hidden = pre_activation.clamp(min=0)

        # Exactly one draw from the `random` module per forward pass.
        remaining = random.randint(0, 3)
        while remaining > 0:
            pre_activation = self.middle_linear(hidden)
            hidden = pre_activation.clamp(min=0)
            remaining -= 1

        return self.output_linear(hidden)

In [3]:
# Problem dimensions used by the rest of the notebook.
N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

In [4]:
# Create random Tensors to hold input and output and wrap them in Variables.
# x: N rows of D_in values drawn by torch.randn (the network input).
# y: N rows of D_out values drawn by torch.randn (the regression target);
#    requires_grad=False states explicitly that no gradient is needed for it.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

In [5]:
# Build the network under training: D_in inputs, H hidden units, D_out outputs.
model = DynamicNet(D_in=D_in, H=H, D_out=D_out)

In [6]:
# MSE Loss (size_average=False -> does not divide sum by n), i.e. the loss is
# the plain sum of squared errors over the batch.
# NOTE(review): newer PyTorch releases deprecate size_average in favour of
# reduction='sum' -- confirm against the installed version before changing.
loss_fn = torch.nn.MSELoss(size_average=False)

In [7]:
# SGD with momentum over every parameter of the model.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

# `range` and the single-string print below behave the same on Python 2 and
# Python 3, so this cell does not depend on the interpreter version.
for i in range(500):
    # Forward pass: every call may go through a different number of hidden
    # layers, so the computation graph is rebuilt each iteration.
    y_pred = model(x)

    # Compute loss
    loss = loss_fn(y_pred, y)
    if i % 50 == 0:
        # PyTorch >= 0.4 returns a 0-dim loss that cannot be indexed and
        # exposes .item(); older releases only support loss.data[0].
        loss_value = loss.item() if hasattr(loss, "item") else loss.data[0]
        print("%d %s" % (i, loss_value))

    # Reset the accumulated gradients before running the backward pass;
    # backward() adds to whatever is already stored on the parameters.
    optimizer.zero_grad()

    # Backward pass: d(loss)/d(model parameters)
    loss.backward()

    # Update parameters
    optimizer.step()

0 631.687194824
50 83.5273666382
100 65.1574249268
150 6.78415155411
200 1.04509472847
250 0.942779004574
300 1.56572329998
350 3.27318453789
400 0.570410430431
450 10.405828476
