# MNIST

This notebook contains our experiments comparing forward gradient and backpropagation on the MNIST dataset, using both a neural network and a convolutional network.

#### Setup

In [1]:
# Run once
# CPU only: !pip install torch==1.11.0 torchvision==0.12.0 torchaudio==0.11.0 -f https://download.pytorch.org/whl/cpu/torch_stable.html
# Upgrades torch to a pre-release CPU build from the nightly index (no version pin).
# NOTE(review): the recorded output shows pip reporting that the preinstalled
# torchvision/torchtext require torch==1.11.0 -- confirm the later cells still work with this mix.
!pip install --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html --upgrade
# functorch is imported as `ft` further down and supplies make_functional.
!pip install functorch
print("--> Restarting colab instance") 
# Shut the kernel down with restart=True so the next cells run in a fresh kernel
# that sees the packages installed above.
get_ipython().kernel.do_shutdown(True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
Collecting torch
  Downloading https://download.pytorch.org/whl/nightly/cpu/torch-1.13.0.dev20220611%2Bcpu-cp37-cp37m-linux_x86_64.whl (190.7 MB)
[K     |████████████████████████████████| 190.7 MB 78 kB/s 
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.11.0+cu113
    Uninstalling torch-1.11.0+cu113:
      Successfully uninstalled torch-1.11.0+cu113
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.12.0+cu113 requires torch==1.11.0, but you have torch 1.13.0.dev20220611+cpu which is incompatible.
torchtext 0.12.0 requires torch==1.11.0, but you have torch 1.13.0.dev20220611+cpu which is incompatible.
torcha

{'restart': True, 'status': 'ok'}

In [1]:
# Fetch the project repository; the local modules imported below
# (optim_functions, helpers, plot_helpers, loss, optimizers, models) are expected to come from it.
!git clone https://github.com/benjaminrike1/forward_gradient

Cloning into 'forward_gradient'...
remote: Enumerating objects: 68, done.[K
remote: Counting objects: 100% (68/68), done.[K
remote: Compressing objects: 100% (54/54), done.[K
remote: Total 68 (delta 25), reused 46 (delta 13), pack-reused 0[K
Unpacking objects: 100% (68/68), done.


In [2]:
# Move into the cloned repository so its modules can be imported by name.
cd forward_gradient

/content/forward_gradient


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import functorch as ft

import numpy as np
from functools import partial

import matplotlib.pyplot as plt
import seaborn as sns

from optim_functions import beale, rosenbrock
from helpers import optimize
from plot_helpers import plot_loss, plot_countour, plot_contour2
from loss import functional_xent, softmax, clamp_probs, _xent
from optimizers import ForwardSGD
from models import Net, ConvNet, LogisticRegression

# Fix torch's global random seed so repeated runs start from the same random state.
torch.manual_seed(0)


<torch._C.Generator at 0x7f1685df1830>

## MNIST

Importing the data from torchvision.

In [4]:
# Shared settings for all three splits
DATA_DIR = '/tmp/data'   # download/cache location for the MNIST files
BATCH_SIZE = 64          # mini-batch size used by every loader

# Convert the images to tensors; no further preprocessing is applied
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor()
])

# Official training split (downloaded on first use)
mnist_train = torchvision.datasets.MNIST(
    DATA_DIR,
    train=True,
    download=True,
    transform=transform
)

# Hold out 10,000 of the training images for validation
train, val = torch.utils.data.random_split(mnist_train, [50000, 10000])

# Only the training loader shuffles: the order of samples matters for SGD
train_data_loader = torch.utils.data.DataLoader(train,
                                                batch_size=BATCH_SIZE,
                                                shuffle=True)

# Evaluation loaders keep a fixed order. Shuffling them brings no benefit,
# makes the reported mean-of-batch-losses vary from run to run (the last,
# smaller batch would contain different samples each time) and consumes
# random numbers from the global generator that was seeded above.
val_data_loader = torch.utils.data.DataLoader(val,
                                              batch_size=BATCH_SIZE,
                                              shuffle=False)

# Official test split
mnist_test = torchvision.datasets.MNIST(
    DATA_DIR,
    train=False,
    download=True,
    transform=transform
)
test_data_loader = torch.utils.data.DataLoader(mnist_test,
                                               batch_size=BATCH_SIZE,
                                               shuffle=False)

## Neural Network

### SGD

In [5]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

Forward gradient:

In [None]:
net = Net().to(device) # defining net

# making the net functional to run the code in functorch
# for evaluating the Jacobian-vector product
func, params = ft.make_functional(net)

# removing requires gradient as it will not be used
# for the forward AD
for param in params:
    param.requires_grad_(False)

# defining our optimizer
opt = ForwardSGD(func, functional_xent, params, lr=2e-4, momentum = False, decay=1e-5)

# running the code for e epochs
epochs = 50
losses_fwd = []       # training loss of every mini-batch
# stored under its own name: the backprop cell reuses `test_losses`, and the
# comparison plot reads `test_losses_fwd`
test_losses_fwd = []  # mean test loss, one entry per epoch
for e in range(epochs):
  # training
  for image, label in train_data_loader:
    image, label = image.to(device), label.to(device)
    params, loss = opt.step(image, label)
    losses_fwd.append(loss.item())

  # evaluating on the test set, once per epoch (not after every training batch)
  batch_loss = []  # collected over the whole test set before averaging
  with torch.no_grad():
    for image, label in test_data_loader:
      image, label = image.to(device), label.to(device)
      # argument order (params, func, ...) follows the call in the
      # learning-rate search cell, the only one with recorded output
      test_loss = functional_xent(params, func, image, label)
      batch_loss.append(test_loss.item())

  test_losses_fwd.append(np.mean(batch_loss))
  print(f"Test loss in epoch {e+1}: {test_losses_fwd[-1]}")

Backpropagation:

In [None]:
criterion = nn.CrossEntropyLoss() # loss function
net = Net().to(device) # defining net
backprop = torch.optim.SGD(net.parameters(), lr=2e-4) # normal SGD in torch
# NOTE(review): ExponentialLR multiplies the learning rate by `gamma` on every
# scheduler.step(). With gamma=1e-4 the rate drops from 2e-4 to 2e-8 after a
# single step, so training effectively stops almost immediately. If the intent
# is to mirror the forward-gradient run's decay=1e-5, a gamma close to 1
# (e.g. exp(-1e-5)) is presumably what was meant -- confirm against ForwardSGD.
scheduler = torch.optim.lr_scheduler.ExponentialLR(backprop, gamma=1e-4)

# storing losses
losses = []       # training loss of every mini-batch
test_losses = []  # mean test loss, one entry per epoch
# `epochs` is taken from the forward-gradient cell so both runs train equally long
for epoch in range(epochs):
  # going over training set in batches
  for image, label in train_data_loader:
    image, label = image.to(device), label.to(device)
    backprop.zero_grad()
    outputs = net(image)
    loss = criterion(outputs, label)
    loss.backward()
    backprop.step()
    losses.append(loss.item())
    # the schedule is advanced once per training batch, as before
    scheduler.step()

  # evaluating on the test set, once per epoch (not after every training batch)
  batch_loss = []  # collected over the whole test set before averaging
  with torch.no_grad():
    for image, label in test_data_loader:
      image, label = image.to(device), label.to(device)
      test_loss = criterion(net(image), label)
      batch_loss.append(test_loss.item())

  test_losses.append(np.mean(batch_loss))
  print(f"Test loss in epoch {epoch+1}: {(test_losses[-1])}")

### Comparing results

In [None]:
# Side-by-side comparison of the two training runs for the neural network
fig, ax = plt.subplots(1,2,figsize=(16,8))

# left panel: training loss recorded after every mini-batch
ax[0].plot(losses, color='r', label="Backprop", alpha=.7)
ax[0].plot(losses_fwd, color='b', label='Forward gradient', alpha=.7)
ax[0].set_title("Neural network: training loss")
ax[0].set_xlabel("Iterations")
ax[0].set_ylabel("Loss")
ax[0].legend()

# right panel: test loss
ax[1].plot(test_losses, color='r', label="Backprop", alpha=.7)
ax[1].plot(test_losses_fwd, color='b', label='Forward gradient', alpha=.7)
ax[1].set_title("Neural network: test loss")
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Loss")
# trailing semicolon keeps the Legend repr out of the cell output
ax[1].legend();

### Learning rate optimization

The final learning-rate search covers a fairly narrow interval: an earlier, wider search identified the promising region, and we then narrowed the range to locate a better optimum.

In [8]:
# learning rate and decays search for forward gradient

learning_rates = np.logspace(-5, -3, 6)
decays = np.logspace(-6, -4, 3)

epochs = 10

# (mean validation loss, learning rate, decay) for every configuration tried,
# kept so the best one does not have to be read off the printed log
search_results = []

for gamma in learning_rates:
  for lambda_ in decays:
    net = Net().to(device) # fresh net for every configuration

    # making the net functional to run the code in functorch
    # for evaluating the Jacobian-vector product
    func, params = ft.make_functional(net)

    # removing requires gradient as it will not be used
    # for the forward AD
    for param in params:
        param.requires_grad_(False)

    # defining our optimizer
    opt = ForwardSGD(func, functional_xent, params, lr=gamma, momentum = False, decay=lambda_)

    # training for `epochs` epochs
    for e in range(epochs):
      for image, label in train_data_loader:
        image, label = image.to(device), label.to(device)
        params, loss = opt.step(image, label)

    # evaluating on the validation set (the test set is not touched here)
    val_losses = []
    with torch.no_grad():
      for image, label in val_data_loader:
        image, label = image.to(device), label.to(device)
        val_loss = functional_xent(params, func, image, label)
        val_losses.append(val_loss.item())

    mean_val_loss = np.mean(val_losses)
    search_results.append((mean_val_loss, gamma, lambda_))
    print(f"Validation loss for lr = {format(gamma,'.6f')}, decay = {format(lambda_,'.6f')}: {mean_val_loss}")

# report the best configuration; runs that diverged to nan/inf are ignored
finite_results = [result for result in search_results if np.isfinite(result[0])]
if finite_results:
  best_loss, best_lr, best_decay = min(finite_results, key=lambda result: result[0])
  print(f"Best configuration: lr = {best_lr:.6f}, decay = {best_decay:.6f} (validation loss {best_loss})")
else:
  print("No configuration produced a finite validation loss")

Validation loss for lr = 1e-05, decay = 1e-06: 2.3034265436184636
Validation loss for lr = 1e-05, decay = 1e-05: 2.3073570864975075
Validation loss for lr = 1e-05, decay = 0.0001: 2.29786008786244
Validation loss for lr = 2.5118864315095822e-05, decay = 1e-06: 2.303787557942093
Validation loss for lr = 2.5118864315095822e-05, decay = 1e-05: 2.301249429678461
Validation loss for lr = 2.5118864315095822e-05, decay = 0.0001: 2.301920254519031
Validation loss for lr = 6.309573444801929e-05, decay = 1e-06: 2.30180081288526
Validation loss for lr = 6.309573444801929e-05, decay = 1e-05: 2.3055311509758045
Validation loss for lr = 6.309573444801929e-05, decay = 0.0001: 2.30460032687825
Validation loss for lr = 0.00015848931924611142, decay = 1e-06: 2.2985980480339876
Validation loss for lr = 0.00015848931924611142, decay = 1e-05: 2.30292820778622
Validation loss for lr = 0.00015848931924611142, decay = 0.0001: 2.2999486255038315
Validation loss for lr = 0.00039810717055349735, decay = 1e-06: 2

## Conv Net

Forward gradient:

In [None]:
net = ConvNet().to(device) # defining net

# making the net functional to run the code in functorch
# for evaluating the Jacobian-vector product
func, params = ft.make_functional(net)

# removing requires gradient as it will not be used
# for the forward AD
for param in params:
    param.requires_grad_(False)

# defining our optimizer
opt = ForwardSGD(func, functional_xent, params, lr=2e-4, momentum = False, decay=1e-5)

# running the code for e epochs
epochs = 50
losses_fwd = []       # training loss of every mini-batch
# stored under its own name: the backprop cell reuses `test_losses`, and the
# comparison plot reads `test_losses_fwd`
test_losses_fwd = []  # mean test loss, one entry per epoch
for e in range(epochs):
  # training
  for image, label in train_data_loader:
    image, label = image.to(device), label.to(device)
    params, loss = opt.step(image, label)
    losses_fwd.append(loss.item())

  # evaluating on the test set
  batch_loss = []  # collected over the whole test set before averaging
  with torch.no_grad():
    for image, label in test_data_loader:
      image, label = image.to(device), label.to(device)
      # argument order (params, func, ...) follows the call in the
      # learning-rate search cell, the only one with recorded output
      test_loss = functional_xent(params, func, image, label)
      batch_loss.append(test_loss.item())

  # one averaged value and one log line per epoch, not per test batch
  test_losses_fwd.append(np.mean(batch_loss))
  print(f"Test loss in epoch {e+1}: {test_losses_fwd[-1]}")

Backpropagation:

In [None]:
criterion = nn.CrossEntropyLoss() # loss function
net = ConvNet().to(device) # defining net
backprop = torch.optim.SGD(net.parameters(), lr=2e-4) # normal SGD in torch
# NOTE(review): ExponentialLR multiplies the learning rate by `gamma` on every
# scheduler.step(). With gamma=1e-4 the rate drops from 2e-4 to 2e-8 after a
# single step, so training effectively stops almost immediately. If the intent
# is to mirror the forward-gradient run's decay=1e-5, a gamma close to 1
# (e.g. exp(-1e-5)) is presumably what was meant -- confirm against ForwardSGD.
scheduler = torch.optim.lr_scheduler.ExponentialLR(backprop, gamma=1e-4)

# storing losses
losses = []       # training loss of every mini-batch
test_losses = []  # mean test loss, one entry per epoch
# `epochs` is taken from the forward-gradient cell so both runs train equally long
for epoch in range(epochs):
  # going over training set in batches
  for image, label in train_data_loader:
    image, label = image.to(device), label.to(device)
    backprop.zero_grad()
    outputs = net(image)
    loss = criterion(outputs, label)
    loss.backward()
    backprop.step()
    losses.append(loss.item())
    # the schedule is advanced once per training batch, as before
    scheduler.step()

  # evaluating on the test set, once per epoch (not after every training batch)
  # batch_loss is created here instead of relying on a list left over from
  # an earlier cell
  batch_loss = []
  with torch.no_grad():
    for image, label in test_data_loader:
      image, label = image.to(device), label.to(device)
      test_loss = criterion(net(image), label)
      batch_loss.append(test_loss.item())

  test_losses.append(np.mean(batch_loss))
  print(f"Test loss in epoch {epoch+1}: {test_losses[-1]}")

### Comparing results

In [None]:
# Side-by-side comparison of the two training runs for the conv net
fig, ax = plt.subplots(1,2,figsize=(16,8))

# left panel: training loss recorded after every mini-batch
ax[0].plot(losses, color='r', label="Backprop", alpha=.7)
ax[0].plot(losses_fwd, color='b', label='Forward gradient', alpha=.7)
ax[0].set_title("Conv net: training loss")
ax[0].set_xlabel("Iterations")
ax[0].set_ylabel("Loss")
ax[0].legend()

# right panel: test loss
ax[1].plot(test_losses, color='r', label="Backprop", alpha=.7)
ax[1].plot(test_losses_fwd, color='b', label='Forward gradient', alpha=.7)
ax[1].set_title("Conv net: test loss")
ax[1].set_xlabel("Epochs")
ax[1].set_ylabel("Loss")
# trailing semicolon keeps the Legend repr out of the cell output
ax[1].legend();