# PyTorch Tutorials

In [45]:
import random
import numpy
import torch
import torchvision
import torch.nn as nn
import numpy as np
import torchvision.transforms as transforms

In [51]:
# Problem dimensions shared by every section below.
N = 64        # number of samples in the batch
D_in = 1000   # input features per sample
H = 100       # hidden-layer width
D_out = 10    # output features per sample

# Tensors are created as 32-bit floats on the CPU by default.
dtype = torch.float
device = torch.device('cpu')
# To run on an NVIDIA GPU instead, use the 'cuda' device type
# (PyTorch has no device type called 'gpu'):
# device = torch.device('cuda')

# Two-Layer NN (Numpy, manual backprop)

In [65]:
# Random inputs and regression targets.
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weight matrices; the network has no bias terms.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6
for epoch in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squares loss over every element of the prediction.
    loss = ((y_pred - y) ** 2).sum()

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.T @ grad_y_pred
    grad_h_relu = grad_y_pred @ w2.T
    # ReLU passes gradient only where its input was not negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h

    # Plain gradient-descent step, applied in place.
    w2 -= learning_rate * grad_w2
    w1 -= learning_rate * grad_w1

print(epoch, loss.item())

499 0.00026356245392334385


# Two-Layer NN (PyTorch, manual backprop)

In [67]:
# Random inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights (no bias). They are created with the same
# dtype as x and y so the matrix multiplications below keep working when
# `dtype` is changed (mm() requires both operands to share a dtype).
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for epoch in range(500):
    # Forward pass: compute y_pred
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Compute the loss. It is a scalar stored in a PyTorch tensor; its
    # value can be read as a Python number with loss.item().
    loss = (y_pred - y).pow(2).sum()

    # Backprop by hand to compute the gradients of the loss with respect
    # to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    # ReLU passes gradient only where its input was not negative.
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0
    grad_w1 = x.t().mm(grad_h)

    # Update weights with plain gradient descent.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

print(epoch, loss.item())

499 0.00019266879826318473


# Two-Layer NN (PyTorch, autograd)

When using **autograd**, the forward pass of your network will define a **computational graph**; *nodes* in the graph will be Tensors, and *edges* will be functions that produce output Tensors from input Tensors. Backpropagating through this graph then allows you to easily compute gradients.

If we want to compute gradients with respect to some Tensor, then we set **requires_grad=True** when constructing that Tensor. Any PyTorch operations on that Tensor will cause a computational graph to be constructed, allowing us to later perform backpropagation through the graph. If x is a Tensor with requires_grad=True, then after backpropagation **x.grad** will be another Tensor holding the gradient of some scalar value (typically the loss) with respect to x. On the other hand, we usually don't want to backpropagate through the weight update steps when training a neural network. In such scenarios we can use the **torch.no_grad()** context manager to prevent the construction of a computational graph.

In [68]:
# Random inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialised weights (no bias). requires_grad=True asks autograd
# to track operations on them. They share the dtype of x and y so that
# mm() keeps working when `dtype` is changed.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for epoch in range(500):
    # Forward pass: linear -> ReLU -> linear. No intermediate values are
    # kept because the backward pass is not written by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    loss = (y_pred - y).pow(2).sum()

    # Compute the gradient of the loss with respect to every Tensor with
    # requires_grad=True. Afterwards w1.grad and w2.grad hold the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Update the weights in place. The update itself must not be recorded
    # in the computational graph, hence torch.no_grad().
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so reset the gradients
        # before the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

print(epoch, loss.item())

499 2.4723132810322568e-05


# PyTorch: Defining new autograd functions

In [69]:
# Fresh random inputs and regression targets for this section.
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    Subclassing torch.autograd.Function and providing static ``forward`` and
    ``backward`` methods that operate on Tensors is how a new differentiable
    operation is defined. Call it through ``MyReLU.apply(tensor)``.
    """

    @staticmethod
    def forward(ctx, x):
        """
        Return ``x`` with every element below zero replaced by zero.

        ``ctx`` is the context object shared with ``backward``; the input is
        stored on it because the backward pass needs to know where the input
        was negative.
        """
        ctx.save_for_backward(x)
        return torch.clamp(x, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradient of the loss with respect to the forward input.

        ``grad_output`` is the gradient of the loss with respect to the
        forward output. It is passed through unchanged except where the
        saved input was negative, where the gradient is zero.
        """
        (x,) = ctx.saved_tensors
        # masked_fill returns a new tensor, so grad_output is not modified.
        return grad_output.masked_fill(x < 0, 0)

# Create random Tensors for weights; requires_grad=True makes autograd
# track operations on them.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; the
    # custom ReLU implementation is invoked through MyReLU.apply.
    y_pred = MyReLU.apply(x.mm(w1)).mm(w2)

    # Compute the loss
    loss = (y_pred - y).pow(2).sum()

    # Use autograd to compute the backward pass.
    loss.backward()

    with torch.no_grad():
        # Update weights using gradient descent
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # backward() accumulates into .grad, so reset the gradients
        # before the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()

# Report this cell's own loop counter `t` rather than a variable that only
# exists if an earlier cell happened to define it.
print(t, loss.item())

499 4.41391093772836e-05


# PyTorch: nn

When building neural networks we frequently think of arranging the computation into **layers**, some of which have **learnable parameters** which will be optimized during learning.

In PyTorch, the **nn** package serves this purpose. The nn package defines a set of **Modules**, which are roughly equivalent to *neural network layers*. A Module receives input Tensors and computes output Tensors, but may also hold internal state such as Tensors containing learnable parameters. The nn package also defines a set of useful loss functions that are commonly used when training neural networks.

In [70]:
# Random inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Two-layer network built from nn Modules; each Linear layer holds its own
# weight and bias. The model is moved to the selected device.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

# Sum (not mean) of squared errors. reduction='sum' is the supported
# spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')

learning_rate = 1e-4
for t in range(500):
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Zero the gradients before running the backward pass.
    model.zero_grad()

    # Backward pass: compute the gradient of the loss with respect to all
    # learnable parameters of the model. The parameters of each Module are
    # Tensors with requires_grad=True, so this one call covers them all.
    loss.backward()

    # Gradient-descent step. Inside torch.no_grad() the parameters can be
    # updated in place directly, without going through `.data`.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

# Report this cell's own loop counter `t` rather than a variable that only
# exists if an earlier cell happened to define it.
print(t, loss.item())

499 2.322055934200762e-06


In [58]:
# Print the shape of every learnable tensor in the model, one per paragraph.
for param in model.parameters():
    print(param.shape, '\n')

torch.Size([100, 1000]) 

torch.Size([100]) 

torch.Size([10, 100]) 

torch.Size([10]) 



# PyTorch: optim

In [71]:
# Random inputs and regression targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Two-layer network built from nn Modules, moved to the selected device.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
).to(device)

# Sum (not mean) of squared errors. reduction='sum' is the supported
# spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')

# The optimizer owns the update rule; it is given the parameters it should
# update and the learning rate.
learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(500):
    # Forward pass
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Clear gradients left over from the previous iteration, since
    # backward() accumulates into .grad.
    optimizer.zero_grad()

    # Backward pass: compute gradients for all model parameters.
    loss.backward()

    # Let the optimizer apply one update to the parameters.
    optimizer.step()

print(epoch, loss.item())

499 4.684503895902026e-10


# PyTorch: Custom nn Modules

In [72]:
# Fresh random inputs and regression targets for this section.
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

class TwoLayerNet(torch.nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear."""

    def __init__(self, D_in, H, D_out):
        """
        Create the two Linear layers.

        D_in is the input feature size, H the hidden size and D_out the
        output feature size.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Map input ``x`` to predictions via the hidden ReLU layer."""
        # Clamping at zero from below is the ReLU non-linearity.
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)

# Build the model and move it to the same device as the data, so the
# forward pass does not mix devices when `device` is not the CPU.
model = TwoLayerNet(D_in, H, D_out).to(device)

# Sum (not mean) of squared errors. reduction='sum' is the supported
# spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for epoch in range(500):
    # Forward pass
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Reset accumulated gradients, backpropagate, then update parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(epoch, loss.item())

499 1.109344793803757e-07


# PyTorch: Control Flow + Weight Sharing

In [73]:
# Fresh random inputs and regression targets for this section.
x = torch.randn((N, D_in), dtype=dtype, device=device)
y = torch.randn((N, D_out), dtype=dtype, device=device)

class DynamicNet(torch.nn.Module):
    """
    Network whose depth is chosen at random on every forward pass.

    The same ``middle_linear`` layer is applied between zero and three
    times, so its weights are shared across all of those applications.
    """

    def __init__(self, D_in, H, D_out):
        """Create the input, (shared) middle and output Linear layers."""
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """Run ``x`` through the network with a randomly drawn depth."""
        hidden = torch.clamp(self.input_linear(x), min=0)
        # Ordinary Python control flow decides how often the shared middle
        # layer is reused on this pass (0, 1, 2 or 3 times).
        remaining = random.randint(0, 3)
        while remaining > 0:
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
            remaining -= 1
        return self.output_linear(hidden)

# Build the model and move it to the same device as the data, so the
# forward pass does not mix devices when `device` is not the CPU.
model = DynamicNet(D_in, H, D_out).to(device)

# Sum (not mean) of squared errors. reduction='sum' is the supported
# spelling of the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
# SGD with momentum.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)

for epoch in range(500):
    # Forward pass; the network picks its depth at random on each call.
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Reset accumulated gradients, backpropagate, then update parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(epoch, loss.item())

499 0.30067142844200134
