In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
from exp.nb_06 import *

## ConvNet

Let's get the data and training interface from where we left in the last notebook.

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=5899)

In [3]:
x_train,y_train,x_valid,y_valid = get_data()

x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)

nh,bs = 50,512
c = y_train.max().item()+1
loss_func = F.cross_entropy

data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

In [4]:
mnist_view = view_tfm(1,28,28)
cbfs = [Recorder,
        partial(AvgStatsCallback,accuracy),
        #CudaCallback,
        partial(BatchTransformXCallback, mnist_view)]

In [5]:
nfs = [8,16,32,64,64]

In [9]:
# no batchnorm trains only with low lr

learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)
%time run.fit(2, learn)

train: [0.912379609375, tensor(0.7047)]
valid: [0.17211876220703126, tensor(0.9478)]
train: [0.148779208984375, tensor(0.9526)]
valid: [0.14120567626953126, tensor(0.9540)]
CPU times: user 36 s, sys: 2.86 s, total: 38.9 s
Wall time: 13.2 s


In [8]:
# doesn't train with higher lr

learn,run = get_learn_run(nfs, data, 1.0, conv_layer, cbs=cbfs)
%time run.fit(2, learn)

train: [1929393.27488, tensor(0.1155)]
valid: [2.3021896484375, tensor(0.1064)]
train: [2.3016140625, tensor(0.1136)]
valid: [2.302555859375, tensor(0.1064)]
CPU times: user 35.4 s, sys: 2.64 s, total: 38 s
Wall time: 12.9 s


## Batchnorm

### Custom

Let's start by building our own `BatchNorm` layer from scratch.

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=6018)

In [6]:
class BatchNorm(nn.Module):
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        # NB: pytorch bn mom is opposite of what you'd expect
        self.mom,self.eps = mom,eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))   # gamma (scale parameter)
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))   # beta  (shift parameter)
        # learnable parameters help to maintain representational power of layer after normalization
        
        self.register_buffer('vars',  torch.ones(1,nf,1,1))
        self.register_buffer('means', torch.zeros(1,nf,1,1))
        # register_buffer: same as parameter (pushed to device, saved in state_dict)
        #                  but not trained by optimizer (not returned in model.parameters())

    def update_stats(self, x):
        m = x.mean((0,2,3), keepdim=True) # mean averaged over batch and spatial dimensions => mean/channel
        v = x.var ((0,2,3), keepdim=True) # keepdim=True => leaves empty unit axes and allows for broadcasting
        self.means.lerp_(m, self.mom) # linearly interpolate previous means with current mean using momentum
        self.vars.lerp_ (v, self.mom) # lerp : linear interpolation => x1*(1-momentum) + x2*(momentum) [pytorch]
        # exponentially weighted moving average: influence of each previous point is exponentially decayed
        return m,v
        
    def forward(self, x):
        if self.training:
            with torch.no_grad(): m,v = self.update_stats(x) # calculate mean/var and update running avg
        else: m,v = self.means,self.vars  # inference: don't calculate mean/variance.  Use running avg instead
        x = (x-m) / (v+self.eps).sqrt()   # normalize: subtract mean and divide by std (sqrt of var)
        return x*self.mults + self.adds   # scale and shift normalized values w/ learnable params

In [7]:
def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    # No bias needed if using bn
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(BatchNorm(nf))
    return nn.Sequential(*layers)

In [6]:
#export
def init_cnn_(m, f):
    if isinstance(m, nn.Conv2d):
        f(m.weight, a=0.1)
        if getattr(m, 'bias', None) is not None: m.bias.data.zero_()
    for l in m.children(): init_cnn_(l, f)

def init_cnn(m, uniform=False):
    f = init.kaiming_uniform_ if uniform else init.kaiming_normal_
    init_cnn_(m, f)

def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):
    model = get_cnn_model(data, nfs, layer, **kwargs)
    init_cnn(model, uniform=uniform)
    return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)

We can then use it in training and see how it helps keep the activations means to 0 and the std to 1.

In [27]:
learn,run = get_learn_run(nfs, data, 1., conv_layer, cbs=cbfs)

In [None]:
with Hooks(learn.model, append_stats) as hooks:
    run.fit(1, learn)
    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks[:-1]:
        ms,ss = h.stats
        ax0.plot(ms[:10])
        ax1.plot(ss[:10])
        h.remove()
    plt.legend(range(6));
    
    fig,(ax0,ax1) = plt.subplots(1,2, figsize=(10,4))
    for h in hooks[:-1]:
        ms,ss = h.stats
        ax0.plot(ms)
        ax1.plot(ss)

In [13]:
# with batchnorm, training with higher lr and better accuracy

learn,run = get_learn_run(nfs, data, 1.0, conv_layer, cbs=cbfs)
%time run.fit(2, learn)

train: [0.27712578125, tensor(0.9130)]
valid: [0.1102513916015625, tensor(0.9662)]
train: [0.091840654296875, tensor(0.9715)]
valid: [0.08538733520507813, tensor(0.9733)]
CPU times: user 55.8 s, sys: 4.01 s, total: 59.8 s
Wall time: 20.3 s


### Builtin batchnorm

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=6679)

In [13]:
#export
def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(nn.BatchNorm2d(nf, eps=1e-5, momentum=0.1))
    return nn.Sequential(*layers)

In [16]:
learn,run = get_learn_run(nfs, data, 1., conv_layer, cbs=cbfs)
%time run.fit(2, learn)

train: [0.22017669921875, tensor(0.9305)]
valid: [0.09249447631835937, tensor(0.9723)]
train: [0.064310859375, tensor(0.9797)]
valid: [0.07336126098632813, tensor(0.9789)]
CPU times: user 49.5 s, sys: 3.53 s, total: 53 s
Wall time: 17.9 s


### With scheduler

Now let's add the usual warm-up/annealing.

In [None]:
sched = combine_scheds([0.3, 0.7], [sched_lin(0.6, 2.), sched_lin(2., 0.1)]) 

In [None]:
learn,run = get_learn_run(nfs, data, 0.9, conv_layer, cbs=cbfs
                          +[partial(ParamScheduler,'lr', sched)])

In [None]:
run.fit(8, learn)

## More norms

### Layer norm

From [the paper](https://arxiv.org/abs/1607.06450): "*batch normalization cannot be applied to online learning tasks or to extremely large distributed models where the minibatches have to be small*".

General equation for a norm layer with learnable affine:

$$y = \frac{x - \mathrm{E}[x]}{ \sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta$$

The difference with BatchNorm is
1. we don't keep a moving average (the mean/variance of each img is independent)
2. we don't average over the batches dimension but over the hidden dimension, so it's independent of the batch size

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=6717)

In [None]:
class LayerNorm(nn.Module):
    __constants__ = ['eps']
    def __init__(self, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.mult = nn.Parameter(tensor(1.))
        self.add  = nn.Parameter(tensor(0.))

    def forward(self, x):
        m = x.mean((1,2,3), keepdim=True) # mean averaged over channel and spatial dimensions => mean/image
        v = x.var ((1,2,3), keepdim=True) 
        x = (x-m) / ((v+self.eps).sqrt()) # image brightness (mean) and contrast (variance) are normalized out...
        return x*self.mult + self.add

In [None]:
def conv_ln(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=True),
              GeneralRelu(**kwargs)]
    if bn: layers.append(LayerNorm())
    return nn.Sequential(*layers)

In [None]:
learn,run = get_learn_run(nfs, data, 0.8, conv_ln, cbs=cbfs)

In [None]:
%time run.fit(3, learn)

*Thought experiment*: can this distinguish foggy days from sunny days (assuming you're using it before the first conv)?

### Instance norm

From [the paper](https://arxiv.org/abs/1607.08022): 

The key difference between **contrast** and batch normalization is that the latter applies the normalization to a  whole batch of images instead for single ones:

\begin{equation}\label{eq:bnorm}
    y_{tijk} =  \frac{x_{tijk} - \mu_{i}}{\sqrt{\sigma_i^2 + \epsilon}},
    \quad
    \mu_i = \frac{1}{HWT}\sum_{t=1}^T\sum_{l=1}^W \sum_{m=1}^H x_{tilm},
    \quad
    \sigma_i^2 = \frac{1}{HWT}\sum_{t=1}^T\sum_{l=1}^W \sum_{m=1}^H (x_{tilm} - mu_i)^2.
\end{equation}

In order to combine the effects of instance-specific normalization and batch normalization, we propose to replace the latter by the *instance normalization* (also known as *contrast normalization*) layer:

\begin{equation}\label{eq:inorm}
    y_{tijk} =  \frac{x_{tijk} - \mu_{ti}}{\sqrt{\sigma_{ti}^2 + \epsilon}},
    \quad
    \mu_{ti} = \frac{1}{HW}\sum_{l=1}^W \sum_{m=1}^H x_{tilm},
    \quad
    \sigma_{ti}^2 = \frac{1}{HW}\sum_{l=1}^W \sum_{m=1}^H (x_{tilm} - mu_{ti})^2.
\end{equation}

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=7114)

In [None]:
class InstanceNorm(nn.Module):
    __constants__ = ['eps']
    def __init__(self, nf, eps=1e-0):
        super().__init__()
        self.eps = eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))

    def forward(self, x):
        m = x.mean((2,3), keepdim=True) # mean over spatial dimensions => mean per image per channel
        v = x.var ((2,3), keepdim=True)
        res = (x-m) / ((v+self.eps).sqrt()) # removes differences in mean/var for each channel and image
        return res*self.mults + self.adds   # useful for style transfer, not classification

In [None]:
def conv_in(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=True),
              GeneralRelu(**kwargs)]
    if bn: layers.append(InstanceNorm(nf))
    return nn.Sequential(*layers)

In [None]:
learn,run = get_learn_run(nfs, data, 0.1, conv_in, cbs=cbfs)

In [None]:
%time run.fit(3, learn)

*Question*: why can't this classify anything?

Lost in all those norms? The authors from the [group norm paper](https://arxiv.org/pdf/1803.08494.pdf) have you covered:

![Various norms](images/norms.png)

### Group norm

Intermediate generalization btw InstanceNorm and LayerNorm depending on # of groups
- groups = 1         => InstanceNorm
- groups = #channels => LayerNorm

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=7213)

*From the PyTorch docs:*

`GroupNorm(num_groups, num_channels, eps=1e-5, affine=True)`

The input channels are separated into `num_groups` groups, each containing
``num_channels / num_groups`` channels. The mean and standard-deviation are calculated
separately over the each group. $\gamma$ and $\beta$ are learnable
per-channel affine transform parameter vectorss of size `num_channels` if
`affine` is ``True``.

This layer uses statistics computed from input data in both training and
evaluation modes.

Args:
-    num_groups (int): number of groups to separate the channels into
-    num_channels (int): number of channels expected in input
-    eps: a value added to the denominator for numerical stability. Default: 1e-5
-    affine: a boolean value that when set to ``True``, this module
        has learnable per-channel affine parameters initialized to ones (for weights)
        and zeros (for biases). Default: ``True``.

Shape:
- Input: `(N, num_channels, *)`
- Output: `(N, num_channels, *)` (same shape as input)

Examples::

    >>> input = torch.randn(20, 6, 10, 10)
    >>> # Separate 6 channels into 3 groups
    >>> m = nn.GroupNorm(3, 6)
    >>> # Separate 6 channels into 6 groups (equivalent with InstanceNorm)
    >>> m = nn.GroupNorm(6, 6)
    >>> # Put all 6 channels into a single group (equivalent with LayerNorm)
    >>> m = nn.GroupNorm(1, 6)
    >>> # Activating the module
    >>> output = m(input)

## Fix small batch sizes

### What's the problem?

When we compute the statistics (mean and std) for a BatchNorm Layer on a small batch, it is possible that we get a standard deviation very close to 0. because there aren't many samples (the variance of one thing is 0. since it's equal to its mean).

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=7304)

In [8]:
data = DataBunch(*get_dls(train_ds, valid_ds, 2), c)

In [19]:
# learns very slowly with high lr: 1.0

learn,run = get_learn_run(nfs, data, 1.0, conv_layer, cbs=cbfs)
%time run.fit(1, learn)

train: [164004.4032, tensor(0.1076)]
valid: [724882507522952.4, tensor(0.1531)]
CPU times: user 2min 53s, sys: 8.36 s, total: 3min 1s
Wall time: 3min 1s


In [14]:
# reducing learning rate doesn't improve accuracy

learn,run = get_learn_run(nfs, data, 0.4, conv_layer, cbs=cbfs)
%time run.fit(1, learn)

train: [2.3629571875, tensor(0.1662)]
valid: [1448330.5472, tensor(0.1841)]
CPU times: user 2min 48s, sys: 7.81 s, total: 2min 56s
Wall time: 2min 56s


### Running Batch Norm

To solve this problem we introduce a Running BatchNorm that uses smoother running mean and variance for the mean and std.

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=7516)

In [7]:
# class RunningBatchNorm(nn.Module):
#     def __init__(self, nf, mom=0.1, eps=1e-5):
#         super().__init__()
#         self.mom,self.eps = mom,eps
#         self.mults = nn.Parameter(torch.ones (nf,1,1))
#         self.adds = nn.Parameter(torch.zeros(nf,1,1))
#         self.register_buffer('sums', torch.zeros(1,nf,1,1))
#         self.register_buffer('sqrs', torch.zeros(1,nf,1,1))
#         self.register_buffer('batch', tensor(0.))
#         self.register_buffer('count', tensor(0.))
#         self.register_buffer('step', tensor(0.))
#         self.register_buffer('dbias', tensor(0.))

#     def update_stats(self, x):
#         bs,nc,*_ = x.shape
#         self.sums.detach_()
#         self.sqrs.detach_()
#         dims = (0,2,3)
#         s = x.sum(dims, keepdim=True)
#         ss = (x*x).sum(dims, keepdim=True)
#         c = self.count.new_tensor(x.numel()/nc)
#         mom1 = 1 - (1-self.mom)/math.sqrt(bs-1)
#         self.mom1 = self.dbias.new_tensor(mom1)
#         self.sums.lerp_(s, self.mom1)
#         self.sqrs.lerp_(ss, self.mom1)
#         self.count.lerp_(c, self.mom1)
#         self.dbias = self.dbias*(1-self.mom1) + self.mom1
#         self.batch += bs
#         self.step += 1

#     def forward(self, x):
#         if self.training: self.update_stats(x)
#         sums = self.sums
#         sqrs = self.sqrs
#         c = self.count
#         if self.step<100:
#             sums = sums / self.dbias
#             sqrs = sqrs / self.dbias
#             c    = c    / self.dbias
#         means = sums/c
#         vars = (sqrs/c).sub_(means*means)
#         if bool(self.batch < 20): vars.clamp_min_(0.01)
#         x = (x-means).div_((vars.add_(self.eps)).sqrt())
#         return x.mul_(self.mults).add_(self.adds)

In [24]:
# simplified version of above

# key idea: use moving avg at training time as well as at inference time!!
# exp weighted moving avg (lerp) of variance doesn't make sense
# Variance => mean(x.pow(2)) - mean(x).pow(2)
# lerp sums,squares,counts and then calculate variance from those

class RunningBatchNorm(nn.Module):
    def __init__(self, nf, mom=0.1, eps=1e-5):
        super().__init__()
        self.mom, self.eps = mom, eps
        self.mults = nn.Parameter(torch.ones (nf,1,1))
        self.adds  = nn.Parameter(torch.zeros(nf,1,1))
        self.register_buffer('sums', torch.zeros(1,nf,1,1))
        self.register_buffer('sqrs', torch.zeros(1,nf,1,1))
        self.register_buffer('count', tensor(0.))
        self.register_buffer('factor', tensor(0.))
        self.register_buffer('offset', tensor(0.))
        self.batch = 0
        
    def update_stats(self, x):
        bs,nc,*_ = x.shape
        self.sums.detach_()
        self.sqrs.detach_()
        s    = x.sum((0,2,3), keepdim=True)     # sum
        ss   = (x*x).sum((0,2,3), keepdim=True) # sum of squares
        c    = s.new_tensor(x.numel()/nc)       # count = batch size * spatial dimensions
        mom1 = s.new_tensor(1 - (1-self.mom)/math.sqrt(bs-1))  # scale momentum by batchsize (new technique)
        self.sums .lerp_(s , mom1)
        self.sqrs .lerp_(ss, mom1)
        self.count.lerp_(c , mom1) # exp weighted moving avg of count because bs could vary
        self.batch += bs
        means = self.sums/self.count
        varns = (self.sqrs/self.count).sub_(means*means)
        if bool(self.batch < 20): varns.clamp_min_(0.01)
        self.factor = self.mults / (varns+self.eps).sqrt()
        self.offset = self.adds - means*self.factor
        
    def forward(self, x):
        if self.training: self.update_stats(x)
        return x*self.factor + self.offset

In [25]:
def conv_rbn(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(RunningBatchNorm(nf))
    return nn.Sequential(*layers)

In [11]:
# doesn't learn with high learning rate

learn,run = get_learn_run(nfs, data, 1.0, conv_rbn, cbs=cbfs)
%time run.fit(1, learn)

train: [nan, tensor(0.0985)]
valid: [nan, tensor(0.0991)]
CPU times: user 2min 49s, sys: 7.68 s, total: 2min 56s
Wall time: 2min 56s


In [26]:
# with lower learning rate, accuracy is very good

learn,run = get_learn_run(nfs, data, 0.4, conv_rbn, cbs=cbfs)
%time run.fit(1, learn)

train: [0.4000426953125, tensor(0.8957)]
valid: [0.18907635498046876, tensor(0.9621)]
CPU times: user 2min 50s, sys: 8.47 s, total: 2min 58s
Wall time: 2min 59s


This solves the small batch size issue!

### What can we do in a single epoch?

Now let's see with a decent batch size what result we can get.

[Jump_to lesson 10 video](https://course.fast.ai/videos/?lesson=10&t=8068)

In [None]:
data = DataBunch(*get_dls(train_ds, valid_ds, 32), c)

In [None]:
learn,run = get_learn_run(nfs, data, 0.9, conv_rbn, cbs=cbfs
                          +[partial(ParamScheduler,'lr', sched_lin(1., 0.2))])

In [None]:
%time run.fit(1, learn)

## Export

In [None]:
nb_auto_export()