# Testing and Comparing Different Optimizers

In this lab session we will implement several gradient descent algorithms and compare their performance. <br>
Optimizers: Vanilla Gradient Descent, Momentum, NAG, Adagrad, RMSprop, Adam.

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from matplotlib import animation
from IPython.display import HTML
from itertools import zip_longest

The function to minimize is **Beale's function**: <br>
$f(x_1, x_2) = (1.5-x_1+x_1x_2)^2 + (2.25-x_1+x_1{x_2}^2)^2 + (2.625-x_1+x_1{x_2}^3)^2$

In [2]:
def f(x, y):
    """Evaluate Beale's function at (x, y).

    Works element-wise, so ``x`` and ``y`` may be scalars or NumPy arrays
    of matching (broadcastable) shape.
    """
    term_1 = (1.5 - x + x*y)**2
    term_2 = (2.25 - x + x*y**2)**2
    term_3 = (2.625 - x + x*y**3)**2
    return term_1 + term_2 + term_3

### Question 1
Analytically compute the gradient of f

In [None]:
# TODO (Question 1): replace the two placeholders below with the analytic
# partial derivatives of f with respect to x and y.
# Both must work element-wise on NumPy arrays: a later cell evaluates
# gradf_x(x, y) and gradf_y(x, y) on the full meshgrid.
# gradf_x = lambda x, y:
# gradf_y = lambda x, y:

In [None]:
# Plotting domain and grid spacing; the *min/*max values are also reused as
# axis limits by the plotting cells further down.
xmin, xmax, xstep = -4.5, 4.5, .2
ymin, ymax, ystep = -4.5, 4.5, .2

In [None]:
# np.arange excludes its stop value, so one extra step is added to reach the
# upper bound (subject to floating-point rounding of the step).
x, y = np.meshgrid(np.arange(xmin, xmax + xstep, xstep), np.arange(ymin, ymax + ystep, ystep))

In [None]:
# Function values on the grid; reused by the contour plots below.
z = f(x, y)

In [None]:
# Global minimum of Beale's function, (x, y) = (3, 0.5): every squared term
# vanishes there, so f = 0.
minima = np.array([3., .5])

In [None]:
# Sanity check: the displayed value should be 0.0.
f(*minima)

### Question 2
Create a surface plot to visualize the function and mark the global minimum

In [None]:
fig = plt.figure(figsize=(8, 5))
ax = plt.axes(projection='3d', elev=50, azim=-50)

# TODO (Question 2): draw the surface from the grid arrays x, y, z
# (e.g. ax.plot_surface) and mark the global minimum stored in `minima`.
# As written, this cell only renders empty 3-D axes.

ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_zlabel('$z$')

ax.set_xlim((xmin, xmax))
ax.set_ylim((ymin, ymax))

plt.show()

### Question 3
Create a contour plot and a quiver plot of the function

In [None]:
# Gradient components evaluated on the whole meshgrid (inputs for the quiver
# plot). Requires gradf_x / gradf_y from Question 1 to be defined first;
# otherwise this cell raises NameError.
dz_dx = gradf_x(x, y)
dz_dy = gradf_y(x, y)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# TODO (Question 3): add the contour plot of z over (x, y) and a quiver plot
# built from dz_dx / dz_dy. As written, this cell only renders empty axes.

ax.set_xlabel('$x$')
ax.set_ylabel('$y$')

ax.set_xlim((xmin, xmax))
ax.set_ylim((ymin, ymax))

plt.show()

In [None]:
# Number of iterations; shared by every optimizer loop below so that the
# trajectories and loss curves are comparable.
steps = 500

## Gradient Descent
This is vanilla gradient descent: <br>
$\theta_t = \theta_{t-1} - \alpha\nabla_\theta f(\theta_{t-1})$

### Question 4
Implement vanilla gradient descent

In [None]:
alpha = 0.001               # learning rate (alpha in the update rule above)
x0 = np.array([1.1, 2.])    # starting point (x, y); the loop cell prints it as the current iterate

In [None]:
# Trajectory buffer: one column per step, one row per coordinate of x0.
x_gd = np.zeros((x0.shape[0], steps))
# Step indices and loss values; both are read later by Plot_Loss / Plot_Losses.
iteration = []
losses_gd = []
loss = f(*x0)
print(f'X:{x0}, f:{loss}')
for i in range(steps):
    # TODO (Question 4): evaluate the gradient at x0, apply the update rule,
    # refresh `loss`, then record the step: fill x_gd[:, i] and append to
    # `iteration` and `losses_gd`. Until this is done the plots stay empty.
    print(f'X:{x0}, f:{loss}, i:{i}')


In [None]:
def Plot_Loss(iteration, losses):
    """Plot a single loss curve against the iteration index.

    The loss axis is logarithmic (``semilogy``). A new 10x6 figure is created
    on every call; nothing is returned.

    Parameters
    ----------
    iteration : sequence
        Values for the horizontal axis (iteration indices).
    losses : sequence
        Loss value recorded at each entry of ``iteration``.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    axes.semilogy(iteration, losses)
    axes.set(xlabel='$i$', ylabel='$loss$')


In [None]:
def Plot_Trajectory(x_val, label):
    """Draw one optimizer trajectory on top of the contour map of f.

    Relies on the notebook-level names ``x``, ``y``, ``z`` (grid and function
    values), ``minima`` and ``xmin``/``xmax``/``ymin``/``ymax``.

    Parameters
    ----------
    x_val : array
        Trajectory indexed as ``x_val[0, :]`` (x coordinates) and
        ``x_val[1, :]`` (y coordinates).
    label : str
        Legend entry for the trajectory.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # Log-spaced levels with a log colour norm: z spans several orders of magnitude.
    ax.contour(x, y, z, levels=np.logspace(0, 5, 35), norm=LogNorm(), cmap=plt.cm.jet)
    ax.plot(x_val[0,:], x_val[1,:], 'b', label=label, lw=2)
    ax.plot(*minima, 'r*', markersize=18)
    ax.legend(loc='upper left')

    ax.set_xlabel('$x$')
    ax.set_ylabel('$y$')

    # Pin both axes to the plotting domain, as the other contour figures do, so
    # a trajectory that leaves the domain cannot rescale the view.
    ax.set_xlim((xmin, xmax))
    ax.set_ylim((ymin, ymax))


In [None]:
Plot_Loss(iteration, losses_gd)

In [None]:
Plot_Trajectory(x_gd, 'GD')

## Momentum
GD with momentum helps accelerate GD and prevents it from being stuck in regions with small gradients: <br>
$v_t = \gamma v_{t-1} + \alpha \nabla_\theta f(\theta_{t-1})$ <br>
$\theta_t = \theta_{t-1} - v_t$

### Question 5
Implement GD with momentum

In [None]:
alpha = 0.001               # learning rate
gamma = 0.9                 # momentum coefficient (gamma in the update rule above)
x0 = np.array([1.1, 2.])    # starting point (x, y), reset for this optimizer

In [None]:
# Step indices and loss values, read later by Plot_Loss / Plot_Losses.
iteration = []
losses_mom = []
loss = f(*x0)
print(f'X:{x0}, f:{loss}')
# Trajectory buffer: one column per step, one row per coordinate of x0.
x_mom = np.zeros((x0.shape[0], steps))
for i in range(steps):
    # TODO (Question 5): update the velocity v and then x0 with the momentum
    # rule, refresh `loss`, and record the step in x_mom[:, i], `iteration`
    # and `losses_mom`. The velocity is not created anywhere yet; it
    # presumably starts at zero and must be initialised before the loop.
    print(f'X:{x0}, f:{loss}, i:{i}')


In [None]:
Plot_Loss(iteration, losses_mom)

In [None]:
Plot_Trajectory(x_mom, 'momentum')

## Nesterov Accelerated Gradient (NAG)
NAG is very similar to momentum but it calculates the gradient at the next position to stabilize the learning process. <br>
$v_t = \gamma v_{t-1} + \alpha \nabla_\theta f(\theta_{t-1} - \gamma v_{t-1})$ <br>
$\theta_t = \theta_{t-1} - v_t$

### Question 6
Implement NAG

In [None]:
alpha = 0.001               # learning rate
gamma = 0.9                 # momentum coefficient (gamma in the update rule above)
x0 = np.array([1.1, 2.])    # starting point (x, y), reset for this optimizer

In [None]:
# Step indices and loss values, read later by Plot_Loss / Plot_Losses.
iteration = []
losses_nag = []
loss = f(*x0)
print(f'X:{x0}, f:{loss}')
# Trajectory buffer: one column per step, one row per coordinate of x0.
x_nag = np.zeros((x0.shape[0], steps))
for i in range(steps):
    # TODO (Question 6): evaluate the gradient at the look-ahead point
    # x0 - gamma * v, update v and x0, refresh `loss`, and record the step in
    # x_nag[:, i], `iteration` and `losses_nag`. The velocity is not created
    # anywhere yet; it presumably starts at zero, initialised before the loop.
    print(f'X:{x0}, f:{loss}, i:{i}')


In [None]:
Plot_Loss(iteration, losses_nag)

In [None]:
Plot_Trajectory(x_nag, 'NAG')

## Adagrad
Adagrad adapts the learning rate based on the frequency of previous parameter updates. <br>
more updates --> smaller learning rate <br>
fewer updates --> higher learning rate <br>
$G_t = G_{t-1} + \left(\nabla_\theta f(\theta_{t-1})\right)^2$ <br>
$\theta_t = \theta_{t-1} - \frac{\alpha}{\sqrt{G_t + \epsilon}}\nabla_\theta f(\theta_{t-1})$ <br>
with $G_0 = \vec{0}$

### Question 7
Implement Adagrad

In [None]:
alpha = 1.0                 # base learning rate
x0 = np.array([1.1, 2.])    # starting point (x, y), reset for this optimizer
Eg = np.zeros(x0.shape)     # presumably the accumulator G from the formula above (G_0 = 0)
epsilon = 1e-8              # small constant added under the square root

In [None]:
# Trajectory buffer: one column per step, one row per coordinate of x0.
x_adag = np.zeros((x0.shape[0], steps))
# Step indices and loss values, read later by Plot_Loss / Plot_Losses.
iteration = []
losses_adag = []
loss = f(*x0)
print(f'X:{x0}, f:{loss}')
for i in range(steps):
    # TODO (Question 7): accumulate the squared gradient, apply the Adagrad
    # update to x0, refresh `loss`, and record the step in x_adag[:, i],
    # `iteration` and `losses_adag`.
    print(f'X:{x0}, f:{loss}, i:{i}')


In [None]:
Plot_Loss(iteration, losses_adag)

In [None]:
Plot_Trajectory(x_adag, 'Adagrad')

## RMSprop
RMSprop is very similar to Adagrad, but it keeps an exponentially decaying average of the squared gradients (effectively a finite window), which resolves the radically diminishing learning rates. <br>
$G_t = 0.9G_{t-1} + 0.1\left(\nabla_\theta f(\theta_{t-1})\right)^2$ <br>
$\theta_t = \theta_{t-1} - \frac{\alpha}{\sqrt{G_t + \epsilon}}\nabla_\theta f(\theta_{t-1})$ <br>
with $G_0 = \vec{0}$

### Question 8
Implement RMSprop

In [None]:
alpha = 0.01                # base learning rate
x0 = np.array([1.1, 2.])    # starting point (x, y), reset for this optimizer
Eg = np.zeros(x0.shape)     # presumably the running average G from the formula above (G_0 = 0)
epsilon = 1e-8              # small constant added under the square root

In [None]:
# Trajectory buffer: one column per step, one row per coordinate of x0.
x_rms = np.zeros((x0.shape[0], steps))
# Step indices and loss values, read later by Plot_Loss / Plot_Losses.
iteration = []
losses_rms = []
loss = f(*x0)
print(f'X:{x0}, f:{loss}')
for i in range(steps):
    # TODO (Question 8): update the decaying average of squared gradients,
    # apply the RMSprop update to x0, refresh `loss`, and record the step in
    # x_rms[:, i], `iteration` and `losses_rms`.
    print(f'X:{x0}, f:{loss}, i:{i}')
    

In [None]:
Plot_Loss(iteration, losses_rms)

In [None]:
Plot_Trajectory(x_rms, 'RMSprop')

## Adam
Adam is an optimizer that combines momentum GD and learning rate adaptation. <br>
$m_t = \beta_1 m_{t-1} + (1-\beta_1)\nabla_\theta f(\theta_{t-1})$ <br>
$v_t = \beta_2 v_{t-1} + (1-\beta_2)\left(\nabla_\theta f(\theta_{t-1})\right)^2$ <br>
$\hat{m}_t = \frac{m_t}{1-{\beta_1}^t}$ <br>
$\hat{v}_t = \frac{v_t}{1-{\beta_2}^t}$ <br>
$\theta_t = \theta_{t-1} - \frac{\alpha}{\sqrt{\hat{v}_t + \epsilon}}\hat{m}_t$ <br>
with $m_0 = v_0 = \vec{0}$

### Question 9
Implement Adam

In [None]:
alpha = 0.01                # learning rate
epsilon = 1e-8              # small constant added under the square root
beta1 = 0.9                 # decay rate of the first-moment estimate m_t
# NOTE(review): the value commonly used for Adam's beta2 is 0.999; 0.9 here may
# be deliberate for this lab -- confirm.
beta2 = 0.9                 # decay rate of the second-moment estimate v_t
x0 = np.array([1.1, 2.])    # starting point (x, y), reset for this optimizer
mt = np.array([0., 0.])     # m_0 = 0
vt = np.array([0., 0.])     # v_0 = 0

In [None]:
# Step indices and loss values, read later by Plot_Loss / Plot_Losses.
iteration = []
losses_adam = []
loss = f(*x0)
print(f'X:{x0}, f:{loss}')
# Trajectory buffer: one column per step, one row per coordinate of x0.
x_adam = np.zeros((x0.shape[0], steps))
for i in range(steps):
    # TODO (Question 9): update mt and vt, bias-correct them, apply the Adam
    # update to x0, refresh `loss`, and record the step in x_adam[:, i],
    # `iteration` and `losses_adam`.
    # Gotcha: the bias correction uses t starting at 1, so use t = i + 1
    # (t = 0 would make 1 - beta**t zero and divide by zero).
    print(f'X:{x0}, f:{loss}, i:{i}')


In [None]:
Plot_Loss(iteration, losses_adam)

In [None]:
Plot_Trajectory(x_adam, 'Adam')

## Simulate Animation

In [None]:
class TrajectoryAnimation(animation.FuncAnimation):
    """Animate one or more 2-D trajectories on a shared Axes.

    Every path is an array indexed as ``path[0, :]`` (x coordinates) and
    ``path[1, :]`` (y coordinates). Each path gets a line showing the part
    travelled so far plus a marker at its current position.

    Parameters
    ----------
    *paths : array
        Trajectories to animate; they may have different lengths.
    labels : sequence of str, optional
        Legend labels, matched to ``paths`` by position. Paths without a
        label are plotted with ``label=None``.
    fig, ax : optional
        Figure and/or Axes to draw on. Whichever is missing is derived from
        the other; if both are missing a new figure is created.
    frames : optional
        Forwarded to ``FuncAnimation``. Defaults to the length of the longest
        path, so every path is shown in full.
    interval, repeat_delay, blit, **kwargs
        Forwarded unchanged to ``FuncAnimation``.
    """

    def __init__(self, *paths, labels=(), fig=None, ax=None, frames=None,
                 interval=60, repeat_delay=5, blit=True, **kwargs):

        # Resolve a consistent (fig, ax) pair from whatever was supplied.
        if fig is None:
            if ax is None:
                fig, ax = plt.subplots()
            else:
                fig = ax.get_figure()
        elif ax is None:
            ax = fig.gca()

        self.fig = fig
        self.ax = ax

        self.paths = paths

        if frames is None:
            frames = max(path.shape[1] for path in paths)

        # zip_longest lets callers pass fewer labels than paths.
        self.lines = [ax.plot([], [], label=label, lw=2)[0]
                      for _, label in zip_longest(paths, labels)]
        # Each marker reuses the colour of the line it follows.
        self.points = [ax.plot([], [], 'o', color=line.get_color())[0]
                       for line in self.lines]

        super().__init__(fig, self.animate, init_func=self.init_anim,
                         frames=frames, interval=interval, blit=blit,
                         repeat_delay=repeat_delay, **kwargs)

    def init_anim(self):
        """Clear every line and marker; returns the artists for blitting."""
        for line, point in zip(self.lines, self.points):
            line.set_data([], [])
            point.set_data([], [])
        return self.lines + self.points

    def animate(self, i):
        """Draw frame ``i``: each path up to and including column ``i``.

        Frame 0 shows the first point and the last frame shows the whole path.
        A path shorter than ``i + 1`` columns is drawn in full, with its
        marker kept on its final point instead of disappearing.
        """
        for line, point, path in zip(self.lines, self.points, self.paths):
            end = min(i + 1, path.shape[1])
            line.set_data(*path[:, :end])
            # max(..., 0) keeps the slice empty (not wrapped) for an empty path.
            point.set_data(*path[:, max(end - 1, 0):end])
        return self.lines + self.points

In [None]:
def Plot_Losses(iteration, labels, losses):
    """Overlay several loss curves on one semilog-y figure.

    A new 10x6 figure is created on every call; nothing is returned.

    Parameters
    ----------
    iteration : sequence
        Shared horizontal-axis values (iteration indices) for every curve.
    labels : sequence of str
        Legend entries, in the same order as ``losses``.
    losses : iterable of sequences
        One loss sequence per curve.
    """
    _, axes = plt.subplots(figsize=(10, 6))
    for curve in losses:
        axes.semilogy(iteration, curve)
    axes.legend(labels)
    axes.set(xlabel='i', ylabel='loss')


### Question 10
Use the function `TrajectoryAnimation()` to make an animation that shows the learning process of each method and comment on the results.

In [None]:
methods = ["GD", "Momentum", "NAG", "Adagrad", "RMSprop", "Adam"]

In [None]:
# Thin every trajectory to each 10th iterate so the animation stays short.
# The 0:-1:10 slice stops before the final column.
paths = [trajectory[:, 0:-1:10]
         for trajectory in (x_gd, x_mom, x_nag, x_adag, x_rms, x_adam)]

In [None]:
losses = [losses_gd, losses_mom, losses_nag, losses_adag, losses_rms, losses_adam]

In [None]:
Plot_Losses(iteration, methods, losses)

In [None]:
fig, ax = plt.subplots(figsize=(10, 6))

# Background: log-spaced contours of f, with the global minimum marked.
ax.contour(x, y, z, levels=np.logspace(0, 5, 35), norm=LogNorm(), cmap=plt.cm.jet)
ax.plot(*minima, 'r*', markersize=10)

ax.set(xlabel='$x$', ylabel='$y$', xlim=(xmin, xmax), ylim=(ymin, ymax))

anim = TrajectoryAnimation(*paths, labels=methods, ax=ax)

# The legend is built after the animation so it picks up the labelled lines.
ax.legend(loc='upper left')

In [None]:
# NOTE(review): to_html5_video() needs a movie writer available to matplotlib
# (ffmpeg by default, as far as I know); if none is installed,
# HTML(anim.to_jshtml()) is an alternative -- confirm for the lab machines.
HTML(anim.to_html5_video())