In this notebook we compare three approaches to implementing a simple NN.
<ul>
<li> implement backpropagation explicitly using `numpy` ndarrays</li>
<li> implement backpropagation explicitly using `pytorch` Tensors</li>
<li> use the computational-graph approach (`pytorch` autograd) for reverse-mode differentiation</li>
</ul>
The three examples are taken from:
http://pytorch.org/tutorials/beginner/pytorch_with_examples.html#warm-up-numpy

In [17]:
import numpy as np
import timeit
import torch
from torch.autograd import Variable
from torch import Tensor
import line_profiler
%load_ext line_profiler

The line_profiler extension is already loaded. To reload it, use:
  %reload_ext line_profiler


In [18]:
# SHARED CONFIGURATION
# All three implementations below read these module-level values.

# Layer widths of the network (a single hidden layer).
INSZ = 784   # size of the input layer
HIDSZ = 30   # size of the hidden layer
OUTSZ = 10   # size of the output layer

# Number of samples in one batch.
BATCHSZ = 64

# Number of times the whole training process is repeated.
NREPS = 10

# Tensor type that the pytorch implementations pass to .type(...).
dtype = torch.DoubleTensor

In [19]:
# FIRST IMPLEMENTATION:
# Implement backpropagation explicitly using 'numpy' ndarrays

def backprop_numpy(n_steps=500, learning_rate=1e-6):
    """Time repeated training of a one-hidden-layer ReLU network using numpy.

    The training process is repeated NREPS times. Each repetition draws a
    fresh random batch (x, y) and fresh random weights (w1, w2), then applies
    `n_steps` gradient-descent updates to the summed squared error, with the
    gradients written out by hand.

    Array sizes come from the module-level BATCHSZ, INSZ, HIDSZ and OUTSZ.

    Args:
        n_steps: number of gradient-descent updates per repetition.
        learning_rate: step size used for the weight updates.

    Returns:
        None. The elapsed time in seconds (timeit.default_timer), covering
        data generation as well as training, is printed to stdout.
    """
    start_time = timeit.default_timer()

    for _rep in range(NREPS):

        # Random input and target data for this repetition
        x = np.random.randn(BATCHSZ, INSZ)
        y = np.random.randn(BATCHSZ, OUTSZ)

        # Random initial weights
        w1 = np.random.randn(INSZ, HIDSZ)
        w2 = np.random.randn(HIDSZ, OUTSZ)

        for _step in range(n_steps):

            # Forward pass: linear -> ReLU -> linear
            h = x.dot(w1)
            h_relu = np.maximum(h, 0)
            y_pred = h_relu.dot(w2)

            # Summed squared error. The hand-written backward pass below does
            # not read it; it is still evaluated, as in the other two
            # implementations, so the timed work stays comparable.
            loss = np.square(y_pred - y).sum()

            # Backward pass: gradients of the loss with respect to w1 and w2
            grad_y_pred = 2.0 * (y_pred - y)
            grad_w2 = h_relu.T.dot(grad_y_pred)
            grad_h_relu = grad_y_pred.dot(w2.T)
            grad_h = grad_h_relu.copy()
            grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
            grad_w1 = x.T.dot(grad_h)

            # Gradient-descent update (in place)
            w1 -= learning_rate * grad_w1
            w2 -= learning_rate * grad_w2

    print(timeit.default_timer() - start_time)

In [20]:
# SECOND IMPLEMENTATION:
# Implement backpropagation explicitly using 'pytorch' Tensors

def backprop_pytorch(n_steps=500, learning_rate=1e-6):
    """Time repeated training of a one-hidden-layer ReLU network using pytorch Tensors.

    The training process is repeated NREPS times. Each repetition draws a
    fresh random batch (x, y) and fresh random weights (w1, w2), converts them
    with .type(dtype), then applies `n_steps` gradient-descent updates to the
    summed squared error, with the gradients written out by hand (autograd is
    not used).

    Tensor sizes come from the module-level BATCHSZ, INSZ, HIDSZ and OUTSZ;
    the tensor type comes from the module-level `dtype`.

    Args:
        n_steps: number of gradient-descent updates per repetition.
        learning_rate: step size used for the weight updates.

    Returns:
        None. The elapsed time in seconds (timeit.default_timer), covering
        data generation as well as training, is printed to stdout.
    """
    start_time = timeit.default_timer()

    for _rep in range(NREPS):

        # Random input and target data for this repetition
        x = torch.randn(BATCHSZ, INSZ).type(dtype)
        y = torch.randn(BATCHSZ, OUTSZ).type(dtype)

        # Random initial weights
        w1 = torch.randn(INSZ, HIDSZ).type(dtype)
        w2 = torch.randn(HIDSZ, OUTSZ).type(dtype)

        for _step in range(n_steps):

            # Forward pass: linear -> ReLU (clamp at 0) -> linear
            h = x.mm(w1)
            h_relu = h.clamp(min=0)
            y_pred = h_relu.mm(w2)

            # Summed squared error. The hand-written backward pass below does
            # not read it; it is still evaluated, as in the other two
            # implementations, so the timed work stays comparable.
            loss = (y_pred - y).pow(2).sum()

            # Backward pass: gradients of the loss with respect to w1 and w2
            grad_y_pred = 2.0 * (y_pred - y)
            grad_w2 = h_relu.t().mm(grad_y_pred)
            grad_h_relu = grad_y_pred.mm(w2.t())
            grad_h = grad_h_relu.clone()
            grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
            grad_w1 = x.t().mm(grad_h)

            # Gradient-descent update (in place)
            w1 -= learning_rate * grad_w1
            w2 -= learning_rate * grad_w2

    print(timeit.default_timer() - start_time)

In [21]:
# THIRD IMPLEMENTATION:
# Use the computational graph / reverse mode differentiation approach (using 'pytorch')

def autograd_pytorch(n_steps=500, learning_rate=1e-6):
    """Time repeated training of a one-hidden-layer ReLU network using pytorch autograd.

    The training process is repeated NREPS times. Each repetition draws a
    fresh random batch (x, y) and fresh random weights (w1, w2), then applies
    `n_steps` gradient-descent updates to the summed squared error. The
    gradients are obtained with loss.backward() instead of being derived by
    hand.

    Tensor sizes come from the module-level BATCHSZ, INSZ, HIDSZ and OUTSZ;
    the tensor type comes from the module-level `dtype`.

    NOTE(review): `Variable` is kept because the notebook imports and uses it;
    newer PyTorch releases reportedly treat it as a deprecated thin wrapper
    around Tensor -- confirm against the installed version before modernizing.

    Args:
        n_steps: number of gradient-descent updates per repetition.
        learning_rate: step size used for the weight updates.

    Returns:
        None. The elapsed time in seconds (timeit.default_timer), covering
        data generation as well as training, is printed to stdout.
    """
    start_time = timeit.default_timer()

    for _rep in range(NREPS):

        # Inputs and targets: requires_grad=False, so no gradients are
        # computed with respect to them during the backward pass.
        x = Variable(torch.randn(BATCHSZ, INSZ).type(dtype), requires_grad=False)
        y = Variable(torch.randn(BATCHSZ, OUTSZ).type(dtype), requires_grad=False)

        # Weights: requires_grad=True, so backward() fills in w1.grad / w2.grad.
        w1 = Variable(torch.randn(INSZ, HIDSZ).type(dtype), requires_grad=True)
        w2 = Variable(torch.randn(HIDSZ, OUTSZ).type(dtype), requires_grad=True)

        for _step in range(n_steps):

            # Forward pass: linear -> ReLU (clamp at 0) -> linear. No
            # intermediate values are kept by hand because the backward pass
            # is not implemented manually.
            y_pred = x.mm(w1).clamp(min=0).mm(w2)

            # Compute the loss: summed squared error over the batch.
            loss = (y_pred - y).pow(2).sum()

            # Use autograd to compute the gradient of the loss with respect to
            # every Variable created with requires_grad=True (w1 and w2).
            loss.backward()

            # Gradient-descent update on the underlying tensors (.data), so
            # the update itself is not recorded in the computational graph.
            w1.data -= learning_rate * w1.grad.data
            w2.data -= learning_rate * w2.grad.data

            # Manually zero the gradients after updating the weights, so the
            # next backward() starts from zero.
            w1.grad.data.zero_()
            w2.grad.data.zero_()

    print(timeit.default_timer() - start_time)

In [22]:
# Run the first implementation (backprop_numpy).
# Keep exactly one of the two lines below active:
#   - the plain call runs the function, which prints its elapsed time;
#   - the %lprun line runs it under line_profiler and also reports profiling info.
# NOTE(review): under %lprun the printed elapsed time is measured while the
# profiler is active, so it presumably includes profiling overhead -- confirm
# before comparing implementations on that number.

#backprop_numpy()
%lprun -f backprop_numpy backprop_numpy()

2.00706556183286


In [23]:
# Run the second implementation (backprop_pytorch).
# Keep exactly one of the two lines below active:
#   - the plain call runs the function, which prints its elapsed time;
#   - the %lprun line runs it under line_profiler and also reports profiling info.
# NOTE(review): under %lprun the printed elapsed time is measured while the
# profiler is active, so it presumably includes profiling overhead -- confirm
# before comparing implementations on that number.

#backprop_pytorch()
%lprun -f backprop_pytorch backprop_pytorch()

1.4933980808127671


In [24]:
# Run the third implementation (autograd_pytorch).
# Keep exactly one of the two lines below active:
#   - the plain call runs the function, which prints its elapsed time;
#   - the %lprun line runs it under line_profiler and also reports profiling info.
# NOTE(review): under %lprun the printed elapsed time is measured while the
# profiler is active, so it presumably includes profiling overhead -- confirm
# before comparing implementations on that number.

#autograd_pytorch()
%lprun -f autograd_pytorch autograd_pytorch()

5.049617765005678
