In [None]:
#default_exp core.optim

In [None]:
# hide
import warnings
# Ignore every warning raised from here on in this session.
# NOTE(review): presumably done to keep the rendered output clean, but a blanket
# "ignore" also hides deprecation warnings from torch and other libraries — confirm this is intended.
warnings.filterwarnings("ignore")

In [None]:
# hide
from nbdev.showdoc import *
from nbdev.export import *
from nbdev.imports import Config as NbdevConfig

# Path of the `data` folder located under the notebooks directory ("nbs_path") from the nbdev config.
nbdev_path = str(NbdevConfig().path("nbs_path")/'data')
# Bare expression: display the resolved path as the cell output.
nbdev_path

'/Users/ayushman/Desktop/lightning_cv/nbs/data'

# Optimizers
> Collection of useful `Optimizers`

In [None]:
# export
from typing import *
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.optimizer import Optimizer, required

from timm.optim import Lookahead, RAdam, RMSpropTF
from fastcore.all import delegates
from lightning_cv.core.common import Registry

In [None]:
# hide
# for test
from fastcore.all import *
from torch import optim

**Ranger**

In [None]:
#export
@delegates(RAdam)
def Ranger(params: Iterable, betas:Tuple[float, float]=(.95, 0.999), eps:float=1e-5, 
           k: int=6, alpha: float=0.5, **kwargs):
    """Build a `Lookahead` optimizer wrapped around `RAdam`.

    `params`, `betas`, `eps` and any extra `kwargs` are forwarded to `RAdam`;
    `alpha` and `k` are forwarded to `Lookahead`.
    """
    inner_optimizer = RAdam(params, betas=betas, eps=eps, **kwargs)
    return Lookahead(inner_optimizer, alpha=alpha, k=k)

In [None]:
show_doc(Ranger)

<h4 id="Ranger" class="doc_header"><code>Ranger</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>Ranger</code>(**`params`**:`Iterable`\[`T_co`\], **`betas`**:`Tuple`\[`float`, `float`\]=*`(0.95, 0.999)`*, **`eps`**:`float`=*`1e-05`*, **`k`**:`int`=*`6`*, **`alpha`**:`float`=*`0.5`*, **`lr`**=*`0.001`*, **`weight_decay`**=*`0`*)

Convenience method for `Lookahead` with `RAdam`

**Ranger with Gradient Centralization**

In [None]:
# export
class RangerGC(Optimizer):
    """
    Ranger deep learning optimizer - RAdam + Lookahead + Gradient Centralization, combined into one optimizer.  
    From - https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer/blob/master/ranger/ranger.py

    Arguments:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate, must be > 0
        alpha: Lookahead interpolation factor in [0, 1]; on every Lookahead sync the slow
            weights move this fraction of the way towards the fast weights
        k: number of steps between two Lookahead syncs, must be >= 1
        N_sma_threshhold: the rectified adaptive update is only used once the approximated
            SMA length exceeds this value; otherwise a plain momentum update is applied
        betas: coefficients of the running averages of the gradient and of its square
        eps: term added to the denominator of the adaptive update, must be > 0
        weight_decay: weight decay coefficient (multiplied by `lr` when applied)
        use_gc: whether to apply gradient centralization
        gc_conv_only: if `True` only gradients with more than 3 dimensions are centralized,
            otherwise every gradient with more than 1 dimension is

    Raises:
        ValueError: if `alpha`, `k`, `lr` or `eps` is outside its valid range
    """

    def __init__(self, params:Iterable, lr:float=1e-3, alpha:float=0.5, k:int=6, N_sma_threshhold:int=5,
                 betas:Tuple[float, float]=(.95, 0.999), eps:float=1e-5, weight_decay:Union[float, int]=0, 
                 use_gc:bool=True, gc_conv_only:bool=False):

        # parameter checks
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        if not lr > 0:
            raise ValueError(f'Invalid Learning Rate: {lr}')
        if not eps > 0:
            raise ValueError(f'Invalid eps: {eps}')

        # prep defaults and init torch.optim base
        defaults = dict(lr=lr, alpha=alpha, k=k, step_counter=0, betas=betas,
                        N_sma_threshhold=N_sma_threshhold, eps=eps, weight_decay=weight_decay)
        super().__init__(params, defaults)

        # adjustable threshold
        self.N_sma_threshhold = N_sma_threshhold

        # look ahead params
        self.alpha = alpha
        self.k = k

        # ring buffer caching the RAdam rectification terms; each slot holds
        # [cache key, N_sma, step_size] (see `step`)
        self.radam_buffer = [[None, None, None] for _ in range(10)]

        # gc on or off
        self.use_gc = use_gc

        # level of gradient centralization: gradients need more dimensions than this
        self.gc_gradient_threshold = 3 if gc_conv_only else 1

    def __setstate__(self, state):
        # zero-argument `super()` resolves to this class; the module-level name `Ranger`
        # is a factory function and cannot be used as the first argument of `super`
        super().__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimization step.

        Arguments:
            closure: optional callable that re-evaluates the model and returns the loss

        Returns:
            the value returned by `closure`, or `None` when no closure is given

        Raises:
            RuntimeError: if a parameter has a sparse gradient
        """
        loss = None

        if closure is not None:
            loss = closure()

        # Evaluate averages and grad, update param tensors
        for group in self.param_groups:
            beta1, beta2 = group['betas']

            for p in group['params']:
                if p.grad is None:
                    continue

                if p.grad.is_sparse:
                    raise RuntimeError(
                        'Ranger optimizer does not support sparse gradients')

                # all the math is done in fp32, the result is copied back at the end
                grad = p.grad.data.float()
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)

                    # look ahead weight storage now in state dict
                    state['slow_buffer'] = p.data.clone()

                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(
                        p_data_fp32)

                # begin computations
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                # GC operation for Conv layers and FC layers (only when enabled).
                # Done out of place: `.float()` returns the very same tensor when the
                # gradient is already fp32, so an in-place op would overwrite `p.grad`.
                if self.use_gc and grad.dim() > self.gc_gradient_threshold:
                    grad = grad - grad.mean(dim=tuple(range(1, grad.dim())), keepdim=True)

                state['step'] += 1
                step = state['step']

                # compute variance mov avg
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
                # compute mean moving avg
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)

                # The rectification terms depend only on the step count and the betas.
                # The betas are part of the cache key so that param groups with different
                # betas never pick up each other's cached values.
                cache_key = (step, beta1, beta2)
                buffered = self.radam_buffer[int(step % 10)]

                if buffered[0] == cache_key:
                    N_sma, step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = cache_key
                    beta2_t = beta2 ** step
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * step * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma
                    if N_sma > self.N_sma_threshhold:
                        step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (
                            N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** step)
                    else:
                        step_size = 1.0 / (1 - beta1 ** step)
                    buffered[2] = step_size

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                # apply lr
                if N_sma > self.N_sma_threshhold:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr'])
                else:
                    p_data_fp32.add_(exp_avg, alpha=-step_size * group['lr'])

                p.data.copy_(p_data_fp32)

                # integrated look ahead...
                # we do it at the param level instead of group level
                if step % group['k'] == 0:
                    # get access to slow param tensor
                    slow_p = state['slow_buffer']
                    # slow weights += (fast weights - slow weights) * alpha
                    slow_p.add_(p.data - slow_p, alpha=self.alpha)
                    # copy interpolated weights to RAdam param tensor
                    p.data.copy_(slow_p)

        return loss

In [None]:
show_doc(RangerGC)

<h2 id="RangerGC" class="doc_header"><code>class</code> <code>RangerGC</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>RangerGC</code>(**`params`**:`Iterable`\[`T_co`\], **`lr`**:`float`=*`0.001`*, **`alpha`**:`float`=*`0.5`*, **`k`**:`int`=*`6`*, **`N_sma_threshhold`**:`int`=*`5`*, **`betas`**:`Tuple`\[`float`, `float`\]=*`(0.95, 0.999)`*, **`eps`**:`float`=*`1e-05`*, **`weight_decay`**:`Union`\[`float`, `int`\]=*`0`*, **`use_gc`**:`bool`=*`True`*, **`gc_conv_only`**:`bool`=*`False`*) :: `Optimizer`

Ranger deep learning optimizer - RAdam + Lookahead + Gradient Centralization, combined into one optimizer.  
From - https://github.com/lessw2020/Ranger-Deep-Learning-Optimizer/blob/master/ranger/ranger.py

**SGDP**

In [None]:
# export
class SGDP(Optimizer):
    "SGDP optimizer (SGD with momentum plus a projection step), taken from https://github.com/clovaai/AdamP/blob/master/adamp/sgdp.py"

    def __init__(self, params: Iterable, lr = required, momentum: Union[float, int] = 0, 
                 dampening: Union[float, int] = 0, weight_decay: Union[float, int] = 0, 
                 nesterov: bool = False, eps: float = 1e-8, delta: float = 0.1, 
                 wd_ratio: Union[float, int] = 0.1):
        # every hyper-parameter becomes a per-group default
        super().__init__(params, dict(lr=lr, momentum=momentum, dampening=dampening,
                                      weight_decay=weight_decay, nesterov=nesterov,
                                      eps=eps, delta=delta, wd_ratio=wd_ratio))

    def _channel_view(self, x):
        "Flatten `x` to 2-D, keeping its first dimension as rows"
        return x.view(x.shape[0], -1)

    def _layer_view(self, x):
        "Flatten `x` into a single row"
        return x.view(1, -1)

    def _cosine_similarity(self, x, y, eps, view_func):
        "Absolute row-wise cosine similarity between `x` and `y` after applying `view_func` to both"
        similarity = F.cosine_similarity(view_func(x), view_func(y), dim=1, eps=eps)
        return similarity.abs_()

    def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
        """
        Remove from `perturb` (in place) its component along the normalized weights, using the
        first view (channel-wise, then layer-wise) in which the gradient is close to orthogonal
        to the weights. Returns `(perturb, wd_ratio)` when a projection happened, `(perturb, 1)` otherwise.
        """
        broadcast_shape = [-1] + [1] * (len(p.shape) - 1)

        for view_func in (self._channel_view, self._layer_view):
            flat_weights = view_func(p.data)
            similarity = self._cosine_similarity(grad, p.data, eps, view_func)
            threshold = delta / math.sqrt(flat_weights.size(1))

            # this view does not qualify: try the next one
            if not (similarity.max() < threshold):
                continue

            row_norms = flat_weights.norm(dim=1).view(broadcast_shape).add_(eps)
            unit_weights = p.data / row_norms
            perturb -= unit_weights * view_func(unit_weights * perturb).sum(dim=1).view(broadcast_shape)
            return perturb, wd_ratio

        return perturb, 1

    def step(self, closure=None):
        "Perform a single optimization step; returns the result of `closure` if one is given, else `None`"
        loss = closure() if closure is not None else None

        for group in self.param_groups:
            momentum, dampening, nesterov = group['momentum'], group['dampening'], group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data
                state = self.state[p]

                # lazily create the momentum buffer
                if not state:
                    state['momentum'] = torch.zeros_like(p.data)

                # SGD with momentum
                velocity = state['momentum']
                velocity.mul_(momentum).add_(grad, alpha=1 - dampening)
                update = grad + momentum * velocity if nesterov else velocity

                # projection is only attempted for parameters with more than one dimension
                wd_ratio = 1
                if len(p.shape) > 1:
                    update, wd_ratio = self._projection(p, grad, update, group['delta'],
                                                        group['wd_ratio'], group['eps'])

                # weight decay, scaled by `wd_ratio`
                if group['weight_decay'] > 0:
                    p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio / (1-momentum))

                # parameter update
                p.data.add_(update, alpha=-group['lr'])

        return loss

In [None]:
show_doc(SGDP)

<h2 id="SGDP" class="doc_header"><code>class</code> <code>SGDP</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>SGDP</code>(**`params`**:`Iterable`\[`T_co`\], **`lr`**=*`<required parameter>`*, **`momentum`**:`Union`\[`float`, `int`\]=*`0`*, **`dampening`**:`Union`\[`float`, `int`\]=*`0`*, **`weight_decay`**:`Union`\[`float`, `int`\]=*`0`*, **`nesterov`**:`bool`=*`False`*, **`eps`**:`float`=*`1e-08`*, **`delta`**:`float`=*`0.1`*, **`wd_ratio`**:`Union`\[`float`, `int`\]=*`0.1`*) :: `Optimizer`

SGDP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/sgdp.py

**AdamP**

In [None]:
# export
class AdamP(Optimizer):
    """AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py

    Arguments:
        params: iterable of parameters to optimize or dicts defining parameter groups
        lr: learning rate
        betas: coefficients of the running averages of the gradient and of its square
        eps: term added to denominators for numerical stability
        weight_decay: weight decay coefficient (multiplied by `lr` when applied)
        delta: projection threshold; the update is projected when the largest absolute cosine
            similarity between gradient and weights is below `delta / sqrt(row length)`
        wd_ratio: factor applied to the weight decay when the update was projected
        nesterov: use the Nesterov-style combination of the running average and the gradient
    """

    def __init__(self, params: Iterable, lr: Union[float, int] = 1e-3, betas: Tuple[float, float] = (0.9, 0.999), 
                 eps: float = 1e-8, weight_decay: Union[float, int] = 0, delta: float = 0.1, 
                 wd_ratio: float = 0.1, nesterov: bool = False):

        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
                        delta=delta, wd_ratio=wd_ratio, nesterov=nesterov)

        super().__init__(params, defaults)

    def _channel_view(self, x):
        "Flatten `x` to 2-D, keeping its first dimension as rows"
        return x.view(x.size(0), -1)

    def _layer_view(self, x):
        "Flatten `x` into a single row"
        return x.view(1, -1)

    def _cosine_similarity(self, x, y, eps, view_func):
        "Absolute row-wise cosine similarity between `x` and `y` after applying `view_func` to both"
        x = view_func(x)
        y = view_func(y)

        # `eps` keeps the division finite for all-zero rows
        x_norm = x.norm(dim=1).add_(eps)
        y_norm = y.norm(dim=1).add_(eps)
        dot = (x * y).sum(dim=1)

        return dot.abs() / x_norm / y_norm

    def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
        """
        Remove from `perturb` (in place) its component along the normalized weights, using the
        first view (channel-wise, then layer-wise) in which the gradient is close to orthogonal
        to the weights. Returns `(perturb, wd_ratio)` when a projection happened, `(perturb, 1)` otherwise.
        """
        wd = 1
        expand_size = [-1] + [1] * (len(p.shape) - 1)
        for view_func in [self._channel_view, self._layer_view]:

            cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func)

            if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)):
                p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps)
                perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size)
                wd = wd_ratio

                return perturb, wd

        return perturb, wd

    def step(self, closure=None):
        """Perform a single optimization step.

        Arguments:
            closure: optional callable that re-evaluates the model and returns the loss

        Returns:
            the value returned by `closure`, or `None` when no closure is given
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            beta1, beta2 = group['betas']
            nesterov = group['nesterov']

            for p in group['params']:
                if p.grad is None:
                    continue

                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p.data)
                    state['exp_avg_sq'] = torch.zeros_like(p.data)

                # Adam
                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']

                state['step'] += 1
                bias_correction1 = 1 - beta1 ** state['step']
                bias_correction2 = 1 - beta2 ** state['step']

                # keyword `alpha` / `value` forms, matching the calls used by `SGDP`
                # (the positional-scalar overloads are deprecated in PyTorch)
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps'])
                step_size = group['lr'] / bias_correction1

                if nesterov:
                    perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom
                else:
                    perturb = exp_avg / denom

                # Projection (only attempted for parameters with more than one dimension)
                wd_ratio = 1
                if len(p.shape) > 1:
                    perturb, wd_ratio = self._projection(p, grad, perturb, group['delta'], group['wd_ratio'], group['eps'])

                # Weight decay
                if group['weight_decay'] > 0:
                    p.data.mul_(1 - group['lr'] * group['weight_decay'] * wd_ratio)

                # Step
                p.data.add_(perturb, alpha=-step_size)

        return loss

In [None]:
show_doc(AdamP)

<h2 id="AdamP" class="doc_header"><code>class</code> <code>AdamP</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>AdamP</code>(**`params`**:`Iterable`\[`T_co`\], **`lr`**:`Union`\[`float`, `int`\]=*`0.001`*, **`betas`**:`Tuple`\[`float`, `float`\]=*`(0.9, 0.999)`*, **`eps`**:`float`=*`1e-08`*, **`weight_decay`**:`Union`\[`float`, `int`\]=*`0`*, **`delta`**:`float`=*`0.1`*, **`wd_ratio`**:`float`=*`0.1`*, **`nesterov`**:`bool`=*`False`*) :: `Optimizer`

AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py

**RMSpropTF from `timm`**

In [None]:
show_doc(RMSpropTF)

<h2 id="RMSpropTF" class="doc_header"><code>class</code> <code>RMSpropTF</code><a href="timm/optim/rmsprop_tf.py#L14" class="source_link" style="float:right">[source]</a></h2>

> <code>RMSpropTF</code>(**`params`**, **`lr`**=*`0.01`*, **`alpha`**=*`0.9`*, **`eps`**=*`1e-10`*, **`weight_decay`**=*`0`*, **`momentum`**=*`0.0`*, **`centered`**=*`False`*, **`decoupled_decay`**=*`False`*, **`lr_in_momentum`**=*`True`*) :: `Optimizer`

Implements RMSprop algorithm (TensorFlow style epsilon)

NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt
and a few other modifications to closer match Tensorflow for matching hyper-params.

Noteworthy changes include:
1. Epsilon applied inside square-root
2. square_avg initialized to ones
3. LR scaling of update accumulated in momentum buffer

Proposed by G. Hinton in his
`course <http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf>`_.

The centered version first appears in `Generating Sequences
With Recurrent Neural Networks <https://arxiv.org/pdf/1308.0850v5.pdf>`_.

Arguments:
    params (iterable): iterable of parameters to optimize or dicts defining
        parameter groups
    lr (float, optional): learning rate (default: 1e-2)
    momentum (float, optional): momentum factor (default: 0)
    alpha (float, optional): smoothing (decay) constant (default: 0.9)
    eps (float, optional): term added to the denominator to improve
        numerical stability (default: 1e-10)
    centered (bool, optional) : if ``True``, compute the centered RMSProp,
        the gradient is normalized by an estimation of its variance
    weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
    decoupled_decay (bool, optional): decoupled weight decay as per https://arxiv.org/abs/1711.05101
    lr_in_momentum (bool, optional): learning rate scaling is included in the momentum buffer
        update as per defaults in Tensorflow

## OPTIM_REGISTERY
> `Registry` of Optimizers

In [None]:
# export
from torch.optim import SGD, Adam, AdamW

In [None]:
# export
# Registry collecting the optimizers that `create_optimizer` looks up by name
OPTIM_REGISTERY = Registry("OPTIMIZERS")

# torch built-ins and the optimizers defined above, registered one after another in this order
for _optimizer in (SGD, SGDP, Adam, AdamW, AdamP, Ranger, RangerGC):
    OPTIM_REGISTERY.register(_optimizer)

In [None]:
# hide-input
OPTIM_REGISTERY

Registry of OPTIMIZERS:
╒══════════╤═════════════════════════════════════╕
│ Names    │ Objects                             │
╞══════════╪═════════════════════════════════════╡
│ SGD      │ <class 'torch.optim.sgd.SGD'>       │
├──────────┼─────────────────────────────────────┤
│ SGDP     │ <class '__main__.SGDP'>             │
├──────────┼─────────────────────────────────────┤
│ Adam     │ <class 'torch.optim.adam.Adam'>     │
├──────────┼─────────────────────────────────────┤
│ AdamW    │ <class 'torch.optim.adamw.AdamW'>   │
├──────────┼─────────────────────────────────────┤
│ AdamP    │ <class '__main__.AdamP'>            │
├──────────┼─────────────────────────────────────┤
│ Ranger   │ <function Ranger at 0x7f972710f430> │
├──────────┼─────────────────────────────────────┤
│ RangerGC │ <class '__main__.RangerGC'>         │
╘══════════╧═════════════════════════════════════╛

In [None]:
# export
def create_optimizer(params: Iterable, cfg=None):
    """Instantiate an optimizer from `OPTIM_REGISTERY` given `params` with lightning_cv `cfg`

    Arguments:
        params: passed to the optimizer as its `params` argument
        cfg: config object exposing `OPTIMIZER.NAME` (the name looked up in `OPTIM_REGISTERY`)
            and, optionally, `OPTIMIZER.ARGUMENTS` (keyword arguments for the optimizer)

    Returns:
        the object built by calling the registered optimizer with `params` and the configured arguments

    Raises:
        ValueError: if `cfg` is `None`
    """
    # `cfg` defaults to `None` in the signature but is mandatory: fail with a clear message
    # instead of an opaque attribute error on `None`
    if cfg is None:
        raise ValueError("`cfg` is required: pass a lightning_cv config with an `OPTIMIZER` node")

    opt_cfg = cfg.OPTIMIZER
    opt_cls = OPTIM_REGISTERY.get(opt_cfg.NAME)
    # a missing or null `ARGUMENTS` node means "no extra keyword arguments"
    opt_args = getattr(opt_cfg, "ARGUMENTS", None) or {}
    return opt_cls(params=params, **opt_args)

Create an optimizer from a LightningCv config using `create_optimizer`:

In [None]:
from lightning_cv.config import get_cfg
from omegaconf import DictConfig, OmegaConf

# Build a lightning_cv config via `get_cfg`; `cfg` is reused by the cell that creates the optimizer.
# NOTE(review): the effect of `strict=False` is not visible from this notebook — see `get_cfg`.
cfg = get_cfg(strict=False)
# Print the OPTIMIZER node of the config as YAML.
print(OmegaConf.to_yaml(cfg.OPTIMIZER))

NAME: Ranger
ARGUMENTS:
  betas:
  - 0.95
  - 0.999
  eps: 1.0e-05
  weight_decay: 0.01
  k: 6
  alpha: 0.5



In [None]:
# `create_optimizer` also supports multiple param groups: each dict carries its own
# `params` together with per-group overrides (here `lr` and `weight_decay`).
p1 = dict(params=[torch.nn.Parameter(torch.randn(1, 2))], lr=1e-04, weight_decay=0)
p2 = dict(params=[torch.nn.Parameter(torch.randn(2, 2))], lr=1e-02, weight_decay=0.1)

params = [p1, p2]
opt = create_optimizer(params, cfg)
# bare expression: display the created optimizer and its parameter groups
opt

Lookahead (
Parameter Group 0
    betas: [0.95, 0.999]
    eps: 1e-05
    lookahead_alpha: 0.5
    lookahead_k: 6
    lookahead_step: 0
    lr: 0.0001
    weight_decay: 0

Parameter Group 1
    betas: [0.95, 0.999]
    eps: 1e-05
    lookahead_alpha: 0.5
    lookahead_k: 6
    lookahead_step: 0
    lr: 0.01
    weight_decay: 0.1
)

In [None]:
#hide
notebook2script()

Converted 00_config.ipynb.
Converted 00a_core.common.ipynb.
Converted 00b_core.data_utils.ipynb.
Converted 00c_core.optim.ipynb.
Converted 00d_core.schedules.ipynb.
Converted 00e_core.layers.ipynb.
Converted 01a_classification.data.transforms.ipynb.
Converted 01b_classification.data.datasets.ipynb.
Converted 01c_classification.modelling.body.ipynb.
Converted index.ipynb.
