In [None]:
#!pip install "git+https://github.com/broutonlab/deep-learning-course.git"

import dl_course

In [None]:
#@title Configure notebook (if this cell fails, just launch it again)

# NOTE(review): later cells use names that are never imported explicitly
# (`np`, `torch`, `run_optim`, `run_torch_optim`, `print_convergence`,
# `visualize_3d`, `plot_losses`); presumably they come from the wildcard
# imports below -- confirm against dl_course. Explicit imports would make
# these dependencies visible to the reader.
from dl_course.optimizers.utils import *
from dl_course.optimizers.visual import *
# Course-provided setup helpers; run them before any later cell.
optim_install_dependencies()
optim_configure_notebooks()

In [None]:
import matplotlib.pyplot as plt
# Select matplotlib's 'dark_background' style sheet for subsequent plots.
# Remove this line if you are not using a dark notebook theme.
plt.style.use('dark_background')

## Loss function visualization

As discussed previously, a **loss function** is a way to evaluate how well a model (defined by **a set of parameters**) performs, based on the ground truth values provided by the training dataset.

Imagine our model has two parameters. This allows us to plot a 3D surface (or a heatmap), where the XY plane is the space of all possible combinations of parameters, and the Z axis shows the loss function value at each point in the parameter space.
Of course, actual ML tasks have thousands of parameter dimensions rather than just two, but there is no way we could visualize that. The two-parameter case is used to help develop intuition about different optimization algorithms.

Remember that our task is to minimize the loss function, i.e. in this case, to find the (x, y) coordinates of the global minimum.

## Consider this loss function landscape:

In [None]:
# benchmark function for testing optimization algorithms
# https://en.wikipedia.org/wiki/Rosenbrock_function
def rosenbrock(x1x2, a=1, b=100):
    """Evaluate the Rosenbrock function at a 2D point.

    Args:
        x1x2: pair (x1, x2) of coordinates.
        a: offset of the (a - x1)**2 term.
        b: weight of the (x2 - x1**2)**2 term.

    Returns:
        b*(x2 - x1**2)**2 + (a - x1)**2
    """
    first, second = x1x2
    valley_term = b*(second-first**2)**2
    offset_term = (a-first)**2
    return valley_term+offset_term
    
# Plot the Rosenbrock loss surface on its own (no optimizer paths yet).
visualize_3d(
    rosenbrock,
    xlim=(-7, 10),
    elev=45,
    azim=-120,
)

### All optimizer functions in this notebook have the following API:

Input:
- w: numpy array of current weights (of size 2 in our case)
- dw: numpy array of gradients with respect to w
- params: parameters such as learning rate, momentum, etc.

Returns:
- next_w: next predicted position which should optimize the loss
- params: dictionary of parameters for the next update (some optimizers have a state which updates with each iteration)

## Stochastic Gradient Descent (SGD)

Implement SGD in the cell below.

In [None]:
def SGD(w, dw, **params):
    """Perform one step of plain (stochastic) gradient descent.

    Args:
        w: numpy array of current weights.
        dw: numpy array with the gradient of the loss with respect to ``w``.
        **params: optimizer settings; must contain ``learning_rate``.

    Returns:
        Tuple ``(next_w, params)``: the updated weights and the parameter
        dictionary to pass to the next call (SGD keeps no state, so it is
        returned unchanged).

    Raises:
        KeyError: if ``learning_rate`` is missing from ``params``.
    """
    lr = params['learning_rate']

    # ========================
    # Step against the gradient, scaled by the learning rate.
    next_w = w - lr * dw
    # ========================

    return next_w, params

## Test your SGD implementation

The global minimum of our Rosenbrock function is at **(1, 1)**, so your optimizer should end up close to that point.
The final loss should be close to **0**.

In [None]:
# Settings shared by the experiments on the Rosenbrock function.
rosenbrock_start = [8., 0.]
num_iter = 1000

# test your SGD
path_SGD = run_optim(
    w=rosenbrock_start,
    f=rosenbrock,
    optimizer=SGD,
    n_iter=num_iter,
    learning_rate=1e-4,
)

print_convergence(path_SGD)

In [None]:
# Draw the SGD path on top of the Rosenbrock surface.
anim = visualize_3d(
    rosenbrock,
    paths=[path_SGD],
    colors=["green"],
    xlim=(-7, 10),
    elev=45,
    azim=-120,
)
anim

## SGD+Momentum

Implement SGD+momentum in the cell below.

In [None]:
def SGD_momentum(w, dw, **params):
    """Perform one step of gradient descent with (classical) momentum.

    Args:
        w: numpy array of current weights.
        dw: numpy array with the gradient of the loss with respect to ``w``.
        **params: optimizer settings and state:
            - ``learning_rate`` (required): step size.
            - ``momentum`` (required): fraction of the previous velocity
              that is carried over to this step.
            - ``velocity`` (optional): velocity from the previous step;
              defaults to zeros shaped like ``w``.

    Returns:
        Tuple ``(next_w, params)``: the updated weights and the parameter
        dictionary with ``velocity`` refreshed for the next call.

    Raises:
        KeyError: if ``learning_rate`` or ``momentum`` is missing.
    """
    lr = params['learning_rate']
    momentum = params['momentum']
    velocity = params.get("velocity", np.zeros_like(w))

    # ========================
    # Decay the previous velocity and add the current gradient step
    # (a new array is created, so the stored velocity is not mutated).
    velocity = momentum * velocity - lr * dw
    # Move along the accumulated velocity.
    next_w = w + velocity
    # ========================

    params["velocity"] = velocity

    return next_w, params

## Test your SGD+Momentum implementation

It is recommended that you try different learning rates and momentum values and compare the results.

In [None]:
# Same start point and iteration budget as plain SGD, now with momentum.
path_SGD_momentum = run_optim(
    w=rosenbrock_start,
    f=rosenbrock,
    optimizer=SGD_momentum,
    n_iter=num_iter,
    learning_rate=1e-4,
    momentum=0.7,
)

print_convergence(path_SGD_momentum)

In [None]:
# Compare both paths on the Rosenbrock surface: momentum in red, SGD in green.
anim = visualize_3d(
    rosenbrock,
    paths=[path_SGD_momentum, path_SGD],
    colors=["red", "green"],
    xlim=(-7, 10),
    elev=45,
    azim=-120,
)
anim

In [None]:
# Loss curves for the two runs, using the same colours as the animation.
plot_losses(
    [path_SGD_momentum, path_SGD],
    colors=["red", "green"],
)

## Implement RMSProp

Implement RMSProp in the cell below.

In [None]:
def RMSProp(w, dw, **params):
    """Perform one RMSProp update step.

    Args:
        w: numpy array of current weights.
        dw: numpy array with the gradient of the loss with respect to ``w``.
        **params: optimizer settings and state:
            - ``learning_rate`` (required): step size.
            - ``decay_rate`` (required): decay factor of the running
              average of squared gradients.
            - ``epsilon`` (required): added to the denominator to avoid
              division by zero.
            - ``cache`` (optional): running average of squared gradients
              from the previous step; defaults to zeros shaped like ``w``.

    Returns:
        Tuple ``(next_w, params)``: the updated weights and the parameter
        dictionary with ``cache`` refreshed for the next call.

    Raises:
        KeyError: if a required setting is missing.
    """
    lr = params['learning_rate']
    decay_rate = params['decay_rate']
    epsilon = params['epsilon']
    cache = params.get("cache", np.zeros_like(w))

    # ========================
    # Exponential moving average of the squared gradient (a new array is
    # created, so the previously stored cache is not mutated).
    cache = decay_rate * cache + (1 - decay_rate) * dw**2
    # Scale each coordinate's step by the root of its running average.
    next_w = w - lr * dw / (np.sqrt(cache) + epsilon)
    # ========================

    params["cache"] = cache

    return next_w, params

## Test your RMSProp implementation

We'll be using a different function, which has not one, but 4 global minima.
Values at these minima are still **0**.

In [None]:
# https://en.wikipedia.org/wiki/Himmelblau%27s_function
def himmelblau(x1x2):
    x1, x2 = x1x2
    return (x1**2+x2-11)**2 + (x1+x2**2-7)**2

himmelblau_start = [-0.27, -0.923]

In [None]:
# Run RMSProp on Himmelblau's function from the shared start point.
path_RMSProp = run_optim(
    w=himmelblau_start,
    f=himmelblau,
    optimizer=RMSProp,
    n_iter=num_iter,
    learning_rate=1e-1,
    decay_rate=0.9,
    epsilon=1e2,
)

print_convergence(path_RMSProp)

In [None]:
# Draw the RMSProp path on top of the Himmelblau surface.
anim = visualize_3d(
    himmelblau,
    paths=[path_RMSProp],
    colors=["yellow"],
    xlim=(-4.5, 4.5),
    elev=50,
    azim=10,
)
anim

## Implement AdaGrad

Implement AdaGrad in the cell below.

In [None]:
def AdaGrad(w, dw, **params):
    """Perform one AdaGrad update step.

    Args:
        w: numpy array of current weights.
        dw: numpy array with the gradient of the loss with respect to ``w``.
        **params: optimizer settings and state:
            - ``learning_rate`` (required): step size.
            - ``epsilon`` (required): added to the denominator to avoid
              division by zero.
            - ``cache`` (optional): sum of squared gradients seen so far;
              defaults to zeros shaped like ``w``.
            Any other entries (e.g. ``decay_rate``) are accepted and
            ignored: AdaGrad accumulates squared gradients without decay.

    Returns:
        Tuple ``(next_w, params)``: the updated weights and the parameter
        dictionary with ``cache`` refreshed for the next call.

    Raises:
        KeyError: if ``learning_rate`` or ``epsilon`` is missing.
    """
    lr = params['learning_rate']
    epsilon = params['epsilon']
    cache = params.get("cache", np.zeros_like(w))

    # ========================
    # Accumulate the squared gradient over all steps (a new array is
    # created, so the previously stored cache is not mutated).
    cache = cache + dw**2
    # Coordinates with a large gradient history take smaller steps.
    next_w = w - lr * dw / (np.sqrt(cache) + epsilon)
    # ========================

    params["cache"] = cache

    return next_w, params

In [None]:
# Run AdaGrad on Himmelblau's function from the shared start point.
path_AdaGrad = run_optim(
    w=himmelblau_start,
    f=himmelblau,
    optimizer=AdaGrad,
    n_iter=num_iter,
    learning_rate=1e-1,
    decay_rate=0.9,
    epsilon=1e1,
)

print_convergence(path_AdaGrad)

## Implement Adam

Implement Adam in the cell below.

In [None]:
def Adam(w, dw, **params):
    """Perform one Adam update step (with bias correction).

    Args:
        w: numpy array of current weights.
        dw: numpy array with the gradient of the loss with respect to ``w``.
        **params: optimizer settings and state:
            - ``learning_rate`` (required): step size.
            - ``beta1`` (required): decay factor of the first-moment
              (mean of gradients) estimate.
            - ``beta2`` (required): decay factor of the second-moment
              (mean of squared gradients) estimate.
            - ``epsilon`` (required): added to the denominator to avoid
              division by zero.
            - ``m``, ``v`` (optional): moment estimates from the previous
              step; default to zeros shaped like ``w``.
            - ``t`` (optional): number of steps taken so far; defaults to 0.

    Returns:
        Tuple ``(next_w, params)``: the updated weights and the parameter
        dictionary with ``m``, ``v`` and ``t`` refreshed for the next call.

    Raises:
        KeyError: if a required setting is missing.
    """
    lr = params['learning_rate']
    beta1 = params['beta1']
    beta2 = params['beta2']
    epsilon = params['epsilon']
    m = params.get("m", np.zeros_like(w))
    v = params.get("v", np.zeros_like(w))
    t = params.get("t", 0)

    # ========================
    # Step counter starts at 1 so the bias-correction denominators are
    # never zero.
    t = t + 1
    # Exponential moving averages of the gradient and squared gradient.
    m = beta1 * m + (1 - beta1) * dw
    v = beta2 * v + (1 - beta2) * dw**2
    # Both averages start at zero; rescale to undo that initial bias.
    m_hat = m / (1 - beta1**t)
    v_hat = v / (1 - beta2**t)
    next_w = w - lr * m_hat / (np.sqrt(v_hat) + epsilon)
    # ========================

    # ========================
    # Persist the optimizer state for the next iteration.
    params["m"] = m
    params["v"] = v
    params["t"] = t
    # ========================

    return next_w, params

In [None]:
# Run Adam on Himmelblau's function from the shared start point.
path_Adam = run_optim(
    w=himmelblau_start,
    f=himmelblau,
    optimizer=Adam,
    n_iter=num_iter,
    learning_rate=1e-2,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-8,
)

print_convergence(path_Adam)

## Let's try some state of the art optimizers

We'll be using the RangerQH optimizer from the `torch_optimizer` package for this one.

In [None]:
# NOTE(review): imported here rather than in the first cell -- presumably
# the package is only installed by the configuration cell; confirm before
# moving this import to the top of the notebook.
import torch_optimizer as optim

# Run a ready-made torch optimizer class on the same landscape.
path_Ranger = run_torch_optim(
    w=himmelblau_start,
    f=himmelblau,
    optimizer_cls=optim.RangerQH,
    lr=1e-1,
    n_iter=num_iter,
)

print_convergence(path_Ranger)

## Let's plot all of our results!

In [None]:
# running your functions on the new landscape
path_SGD_himmelblau = run_optim(w=himmelblau_start, 
          f=himmelblau, 
          optimizer=SGD, 
          n_iter=num_iter, 
          learning_rate=1e-3)

path_SGD_momentum_himmelblau = run_optim(w=himmelblau_start, 
          f=himmelblau, 
          optimizer=SGD_momentum, 
          n_iter=num_iter, 
          learning_rate=1e-3,
          momentum=0.9
          )

In [None]:
# Every recorded path, in the same order as the colours listed below.
paths = [
    path_Adam,
    path_SGD_himmelblau,
    path_SGD_momentum_himmelblau,
    path_RMSProp,
    path_AdaGrad,
    path_Ranger,
]

# One colour per path (position i colours paths[i]).
colors = ["cyan", "green", "red", "yellow", "purple", "orange"]

anim = visualize_3d(
    himmelblau,
    paths=paths,
    colors=colors,
    xlim=(-4.5, 4.5),
    elev=80,
    azim=10,
)
anim

In [None]:
# Loss curves for all optimizers, using the same colours as the animation.
plot_losses(
    paths,
    colors=colors,
)

# Home task

- try implementing one of the state-of-the-art optimizers 
(you may find this useful: https://danielhanchen.github.io/optimizers/sota%20optimizers.htm)
- try benchmarking your optimizers using the function below. Its global minimum is **0** at **(0, 0)**
- try programmatically tuning parameters such as learning rate (use path['best_loss'])

In [None]:
def ackley(x1x2):
    """Evaluate the 2D Ackley function at a point.

    Uses the torch implementations of exp/sqrt/cos when ``x1x2`` is a torch
    tensor and the numpy ones otherwise.

    Args:
        x1x2: pair (x1, x2) of coordinates, or a torch tensor holding them.

    Returns:
        The Ackley function value at (x1, x2).
    """
    first, second = x1x2
    # Choose the math backend that matches the input type.
    backend = torch if torch.is_tensor(x1x2) else np

    radial_term = -20.0 * backend.exp(
        -0.2 * backend.sqrt(0.5 * (first**2 + second**2))
    )
    periodic_term = backend.exp(
        0.5 * (backend.cos(2 * np.pi * first) + backend.cos(2 * np.pi * second))
    )
    return radial_term - periodic_term + np.e + 20

# Suggested starting point for the home-task experiments.
ackley_start = [-10., -10.]

# Plot the Ackley loss surface on its own (no optimizer paths yet).
visualize_3d(
    ackley,
    xlim=(-20, 20),
    elev=10,
    azim=-120,
)