In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

During this seminar we will implement 2 most confusing but very popular layers - BatchNorm and Dropout along with the most popular architectures such as VGG, Inception, ResNet, ResNeXt, DenseNet, SENet and Mobilenet_v2.

The main goal of this seminar is to make you understand how simple is to create your **arbitrary** neural network in pytorch and to give you some practical experience. PyTorch is very flexible and still simple, so you can implement any idea in just a few lines of code!

Enjoy!

# Layers

## Dropout

In [2]:
class Dropout(nn.Module):
    def __init__(self, p):
        super().__init__()
        self.p = p
        
    def forward(self, x):
        # there is F.dropout(...) in torch, but let's not use it here
        # use self.training flag
        # TODO(students)
        
        if self.training:
            mask = torch.empty(x.size()).bernoulli(1. - self.p)
            return x * mask / (1. - self.p)
        else:
            return x
        
t = torch.ones(10)
d = Dropout(p=0.5)
d(t)

tensor([2., 0., 2., 0., 2., 2., 0., 2., 0., 2.])

In [3]:
d.eval()
d(t)

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

## BatchNorm

In [4]:
from torch.nn.parameter import Parameter
from torch.nn import init

# TODO(me) still good realization somewhere
class BatchNorm2D(nn.Module):
    def __init__(self, num_features, momentum=0.1, eps=1e-5):
        super().__init__()
        self.momentum = momentum
        self.eps = eps
        self.gamma = Parameter(torch.Tensor(1, num_features, 1, 1), )
        self.beta = Parameter(torch.Tensor(1, num_features, 1, 1))
        self.register_buffer('running_mean', torch.zeros(num_features, 1, 1))
        self.register_buffer('running_var', torch.ones(num_features, 1, 1))
        self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
        self.reset_parameters()

    def reset_running_stats(self):
        self.running_mean.zero_()
        self.running_var.fill_(1)
        self.num_batches_tracked.zero_()

    def reset_parameters(self):
        self.reset_running_stats()
        init.ones_(self.gamma)
        init.zeros_(self.beta)
        
    def forward(self, x):
        # again, do not use F.batchnorm
        # use self.training flag, don't forget to update running mean and variance while training
        x_ch = x.transpose(0,1).contiguous().view(x.size(1), -1)
        if self.training:
            mean = x_ch.mean(1)[:, None, None]
            var = x_ch.var(1)[:, None, None]
            
            self.running_mean = self.momentum * mean + (1. - self.momentum) * self.running_mean
            self.running_var = self.momentum * var + (1. - self.momentum) * self.running_var
        else:
            mean = self.running_mean
            var = self.running_std
            
        x = (x - mean) / (torch.sqrt(var + self.eps))
        return self.gamma * x + self.beta

In [5]:
t = torch.arange(40).float().reshape(2, 5, 2, 2)
bn = BatchNorm2D(5)
bn_torch = nn.BatchNorm2d(5)
normed = bn(t)
normed_torch = bn_torch(t)

def approx_eq(a, b, eps=1e-5):
    return (a.reshape(-1)-b.reshape(-1)).abs().sum() < eps
assert approx_eq(bn.running_var, bn_torch.running_var)
assert approx_eq(bn.running_mean, bn_torch.running_mean)
assert torch.all(bn.gamma == bn_torch.weight)
assert torch.all(bn.beta == bn_torch.bias)
assert approx_eq(normed, normed_torch, eps=10)

In [6]:
normed_torch[:, 0], normed[:, 0]

(tensor([[[-1.1429, -1.0435],
          [-0.9441, -0.8447]],
 
         [[ 0.8447,  0.9441],
          [ 1.0435,  1.1429]]], grad_fn=<SelectBackward>),
 tensor([[[-1.0691, -0.9761],
          [-0.8831, -0.7902]],
 
         [[ 0.7902,  0.8831],
          [ 0.9761,  1.0691]]], grad_fn=<SelectBackward>))

## Architectures

## Alexnet

![](images/alexnet.png)

In [7]:
# source: torchvision
class AlexNet(nn.Module):

    def __init__(self, num_classes=1000):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
    
# AlexNet()

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

![](images/vgg.png)

In [13]:
class VGG(nn.Module):
    def __init__(self, cfg, use_bn=False, num_classes=1000):
        super(VGG, self).__init__()
        self.features = VGG.make_layers(VGG.cfgs[cfg], batch_norm=use_bn)
        self.avgpool = nn.AdaptiveAvgPool2d((7, 7))
        self.classifier = nn.Sequential(
            nn.Linear(512 * 7 * 7, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(True),
            nn.Dropout(),
            nn.Linear(4096, num_classes),
        )
    
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
    
    cfgs = {
        'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
        'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
        'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
    }
    
    @staticmethod
    def make_layers(cfg, batch_norm=False):
        layers = []
        in_channels = 3
        for v in cfg:
            if v == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1)
                if batch_norm:
                    layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)]
                else:
                    layers += [conv2d, nn.ReLU(inplace=True)]
                in_channels = v
        return nn.Sequential(*layers)
    
# VGG('vgg16')

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1

So far we have seen only sequential models which are essentially sequence of conv, pool, nonlinearity and normalization layers. Such architectures are in most cases very stable in training, but typically are slower, heavier and even converge to lower values. Modern architectures are somehow "less sequential", the main ideas are: compute several branches in parallel (inception), use information from previous layers and not only the last one (resnet, densenet).

## Inception

![alt](images/we_need_to_go_deeper.jpeg "")

[Exceptional inception guide](https://towardsdatascience.com/a-simple-guide-to-the-versions-of-the-inception-network-7fc52b863202)

Why only 3x3 convolutions? Maybe 5x5 will be better? Or 7x7? When you can not choose, it's time to take them all!

![alt](images/inception_naive.png "")
![alt](images/inception_module.png "")

In [14]:
# source: torchvision
class BasicConv2d(nn.Module):

    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicConv2d, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        return F.relu(x, inplace=True)
    

class Inception(nn.Module):
    def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj,
                 conv_block=None):
        super(Inception, self).__init__()
        if conv_block is None:
            conv_block = BasicConv2d
        # TODO(students): implement all branches based on the picture
        self.branch1 = conv_block(in_channels, ch1x1, kernel_size=1)

        self.branch2 = nn.Sequential(
            conv_block(in_channels, ch3x3red, kernel_size=1),
            conv_block(ch3x3red, ch3x3, kernel_size=3, padding=1)
        )

        self.branch3 = nn.Sequential(
            conv_block(in_channels, ch5x5red, kernel_size=1),
            conv_block(ch5x5red, ch5x5, kernel_size=3, padding=1)
        )

        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1, ceil_mode=True),
            conv_block(in_channels, pool_proj, kernel_size=1)
        )

    def _forward(self, x):
        branch1 = self.branch1(x)
        branch2 = self.branch2(x)
        branch3 = self.branch3(x)
        branch4 = self.branch4(x)

        outputs = [branch1, branch2, branch3, branch4]
        return outputs

    def forward(self, x):
        outputs = self._forward(x)
        return torch.cat(outputs, 1)

![](images/googlenet.png)

In [17]:
# googlelenet

class InceptionAux(nn.Module):
    """Intermediate 'heads' of googlenet"""

    def __init__(self, in_channels, num_classes, conv_block=None):
        super(InceptionAux, self).__init__()
        if conv_block is None:
            conv_block = BasicConv2d
        self.conv = conv_block(in_channels, 128, kernel_size=1)

        self.fc1 = nn.Linear(2048, 1024)
        self.fc2 = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = F.adaptive_avg_pool2d(x, (4, 4))
        x = self.conv(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x), inplace=True)
        x = F.dropout(x, 0.7, training=self.training)
        x = self.fc2(x)
        # N x 1000 (num_classes)
        return x
    

class GoogLeNet(nn.Module):
    __constants__ = ['aux_logits', 'transform_input']

    def __init__(self, num_classes=1000, aux_logits=True, transform_input=False, init_weights=True,
                 blocks=None):
        super(GoogLeNet, self).__init__()
        if blocks is None:
            blocks = [BasicConv2d, Inception, InceptionAux]
        assert len(blocks) == 3
        conv_block = blocks[0]
        inception_block = blocks[1]
        inception_aux_block = blocks[2]

        self.aux_logits = aux_logits
        self.transform_input = transform_input

        self.conv1 = conv_block(3, 64, kernel_size=7, stride=2, padding=3)
        self.maxpool1 = nn.MaxPool2d(3, stride=2, ceil_mode=True)
        self.conv2 = conv_block(64, 64, kernel_size=1)
        self.conv3 = conv_block(64, 192, kernel_size=3, padding=1)
        self.maxpool2 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception3a = inception_block(192, 64, 96, 128, 16, 32, 32)
        self.inception3b = inception_block(256, 128, 128, 192, 32, 96, 64)
        self.maxpool3 = nn.MaxPool2d(3, stride=2, ceil_mode=True)

        self.inception4a = inception_block(480, 192, 96, 208, 16, 48, 64)
        self.inception4b = inception_block(512, 160, 112, 224, 24, 64, 64)
        self.inception4c = inception_block(512, 128, 128, 256, 24, 64, 64)
        self.inception4d = inception_block(512, 112, 144, 288, 32, 64, 64)
        self.inception4e = inception_block(528, 256, 160, 320, 32, 128, 128)
        self.maxpool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)

        self.inception5a = inception_block(832, 256, 160, 320, 32, 128, 128)
        self.inception5b = inception_block(832, 384, 192, 384, 48, 128, 128)

        if aux_logits:
            self.aux1 = inception_aux_block(512, num_classes)
            self.aux2 = inception_aux_block(528, num_classes)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        # type: (Tensor) -> Tuple[Tensor, Tensor, Tensor]
        # N x 3 x 224 x 224
        x = self.conv1(x)
        # N x 64 x 112 x 112
        x = self.maxpool1(x)
        # N x 64 x 56 x 56
        x = self.conv2(x)
        # N x 64 x 56 x 56
        x = self.conv3(x)
        # N x 192 x 56 x 56
        x = self.maxpool2(x)

        # N x 192 x 28 x 28
        x = self.inception3a(x)
        # N x 256 x 28 x 28
        x = self.inception3b(x)
        # N x 480 x 28 x 28
        x = self.maxpool3(x)
        # N x 480 x 14 x 14
        x = self.inception4a(x)
        # N x 512 x 14 x 14
        aux_defined = self.training and self.aux_logits
        if aux_defined:
            aux1 = self.aux1(x)
        else:
            aux1 = None

        x = self.inception4b(x)
        # N x 512 x 14 x 14
        x = self.inception4c(x)
        # N x 512 x 14 x 14
        x = self.inception4d(x)
        # N x 528 x 14 x 14
        if aux_defined:
            aux2 = self.aux2(x)
        else:
            aux2 = None

        x = self.inception4e(x)
        # N x 832 x 14 x 14
        x = self.maxpool4(x)
        # N x 832 x 7 x 7
        x = self.inception5a(x)
        # N x 832 x 7 x 7
        x = self.inception5b(x)
        # N x 1024 x 7 x 7

        x = self.avgpool(x)
        # N x 1024 x 1 x 1
        x = torch.flatten(x, 1)
        # N x 1024
        x = self.dropout(x)
        x = self.fc(x)
        # N x 1000 (num_classes)
        return x, aux2, aux1
    
# GoogLeNet()

Does it look... complicated? It was the *first* inception network, but in the newer version (v2 and v3 appeared in the same paper) the authors suggested different versions of inception modules, by factorizing nxn convolutions in different ways.

In the even newer Inception_v4 paper authors introduced even more modules

![](images/inception_v4_blocks.jpeg)
![](images/inception_v4_reduction_blocks.jpeg)

![](images/jackie_chan.jpg)

It is still very simple to implement but may be a little bit confusing from the first glance. More importantly, why should this handcrafted design be optimal? This is one of the motivations for **Neural Architecture Search**. 

#### But anyway, let's look into other architecture ideas

# Resnet

Blogposts to understand resnets:
[1](https://medium.com/@14prakash/understanding-and-implementing-architectures-of-resnet-and-resnext-for-state-of-the-art-image-cf51669e1624)
[2](https://towardsdatascience.com/an-overview-of-resnet-and-its-variants-5281e2f56035)

The key idea is to modify our layer transformation function in such a way that allows better and more stable training. Instead of learning transformation function $F(x)$ you learn another function $G(x)=F(x)-x$, but your layer is computed as $F(x)=G(x)+x$ which is the equivalent.

There may be different orderings of the operations in the residual part. The one proposed in original paper was proved to converge worse with the increase of depth, to the extent that very deep ResNets were actually less powerfull than Resnet50. Among the other variants, pre-activation order (bn-relu-conv) converges faster.

![alt text](images/resnet_ordering.png "Title")

In [18]:
def conv3x3(ch_in, ch_out, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(ch_in, ch_out, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(ch_in, ch_out, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(ch_in, ch_out, kernel_size=1, stride=stride, bias=False)


def get_skipconnection(ch_in, ch_out, stride):
    if ch_in == ch_out and stride == 1:
        return nn.Sequential()
    return conv1x1(ch_in, ch_out, stride=stride)


class ResNetBasicBlockDummy(nn.Module):
    def __init__(self, ch_in, ch_out, stride=1):
        super().__init__()
        # Both self.conv1 and self.skipconnect layers downsample the input when stride != 1
        # TODO(students): implement conv-bn-relu order (figure (c))
        self.net = nn.Sequential(
            conv3x3(ch_in, ch_out, stride=stride),
            nn.BatchNorm2d(ch_out),
            nn.ReLU(inplace=True),
            conv3x3(ch_out, ch_out),
            nn.BatchNorm2d(ch_out),
            nn.ReLU(inplace=True)
        )
        self.skipconnect = get_skipconnection(ch_in, ch_out, stride)
        
    def forward(self, x):
        return self.net(x) + self.skipconnect(x)


class ResNetBasicBlock(nn.Module):
    def __init__(self, ch_in, ch_out, stride=1):
        super().__init__()
        # Both self.conv1 and self.skipconnect layers downsample the input when stride != 1
        # TODO(students): implement pre-activation order
        self.net = nn.Sequential(
            nn.BatchNorm2d(ch_in),
            nn.ReLU(inplace=True),
            conv3x3(ch_in, ch_out, stride=stride),
            nn.BatchNorm2d(ch_out),
            nn.ReLU(inplace=True),
            conv3x3(ch_out, ch_out),
        )
        self.skipconnect = get_skipconnection(ch_in, ch_out, stride)
        
    def forward(self, x):
        return self.net(x) + self.skipconnect(x)

Let's now implement ResNet18

![alt text](images/resnet_architectures.png "Title")

In [20]:
class ResNet18(nn.Module):
    def __init__(self, n_classes=1000):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
#             nn.BatchNorm2d(64),
#             nn.ReLU(inplace=True),
            *self._make_layer(64, 64, 2),
            *self._make_layer(64, 128, 2),
            *self._make_layer(128, 256, 2),
            *self._make_layer(256, 512, 2),
            nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten(),
            nn.Linear(512, 1000)
        )
        
    def _make_layer(self, ch_in, ch_out, n_layers, stride=1):
        layers = []
        layers.append(ResNetBasicBlock(ch_in, ch_out, stride))
        layers += [ResNetBasicBlock(ch_out, ch_out) for _ in range(n_layers - 1)]
        return nn.Sequential(*layers)
    
# ResNet18()

ResNet18(
  (net): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): ResNetBasicBlock(
      (net): Sequential(
        (0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): ReLU(inplace=True)
        (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (4): ReLU(inplace=True)
        (5): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      )
      (skipconnect): Sequential()
    )
    (2): ResNetBasicBlock(
      (net): Sequential(
        (0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (1): ReLU(inplace=True)
        (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_st

### Wide Resnet
Wider networks are more parallizable than deeper networks, just do wider layers. Also you may add dropout inside bottleneck layer to improve robustness.

In [None]:
class WideResNetBasicBlock(nn.Module):
    def __init__(self, ch_in, ch_out, stride=1, p_drop=0.3):
        super().__init__()
        # Both self.conv1 and self.skipconnect layers downsample the input when stride != 1
        # TODO(students): add dropout where appropriate
        self.net = nn.Sequential(
            nn.BatchNorm2d(ch_in),
            nn.ReLU(inplace=True),
            conv3x3(ch_in, ch_out, stride=stride),
            nn.Dropout(p_drop),
            nn.BatchNorm2d(ch_out),
            nn.ReLU(inplace=True),
            conv3x3(ch_out, ch_out),
        )
        self.skipconnect = get_skipconnection(ch_in, ch_out, stride)
        
    def forward(self, x):
        return self.net(x) + self.skipconnect(x)

### ResNeXt
One more idea is to apply group convolution (more parameter-efficient)
![](images/resnext_block.png)
![](images/resnext_block2.png)

In [44]:
class ResNeXtBottleneck(nn.Module):
    def __init__(self, ch_in, ch_out, stride=1, groups=32, base_width=4):
        super().__init__()
        # Both self.conv1 and self.skipconnect layers downsample the input when stride != 1
        # TODO(students): implement bottleneck (1x1->3x3->1x1) layer
        ch_hid = int(ch_out * (base_width / 64.)) * groups
        self.net = nn.Sequential(
            nn.BatchNorm2d(ch_in),
            nn.ReLU(inplace=True),
            conv1x1(ch_in, ch_hid),
            nn.BatchNorm2d(ch_hid),
            nn.ReLU(inplace=True),
            conv3x3(ch_hid, ch_hid, stride=stride, groups=groups),
            nn.Dropout(p_drop),
            nn.BatchNorm2d(ch_hid),
            nn.ReLU(inplace=True),
            conv1x1(ch_hid, ch_out),
        )
        self.skipconnect = get_skipconnection(ch_in, ch_out, stride)
        
    def forward(self, x):
        return self.net(x) + self.skipconnect(x)

# Densenet

~~Addition~~ -> Concatenation of all previous layers outputs

In this architecture, the input of each layer consists of the feature maps of all earlier layer, and its output is passed to each subsequent layer. The feature maps are aggregated with depth-concatenation.

![alt text](images/densenet_layer.png "Title")

In [21]:
class DenseLayer(nn.Module):
    """1x1 -> 3x3 bottleneck"""
    def __init__(self, ch_in, ch_out, ch_bn=4, s=1, p=1):
        super().__init__()
        
        ch_hid = ch_bn * ch_out  # channels after 1x1 convolution
 
        self.conv = nn.Sequential(
            nn.BatchNorm2d(ch_in),
            nn.ReLU(),
            nn.Conv2d(ch_in, ch_hid, kernel_size=1),
            nn.BatchNorm2d(ch_hid),
            nn.ReLU(),
            nn.Conv2d(ch_hid, ch_out, kernel_size=3, stride=s, padding=p)
        )
 
    def forward(self, x):
        return self.conv(x)
 
 
class TransitionLayer(nn.Module):
    """1x1 conv + average pool 2x2"""
    def __init__(self, ch_in, ch_out, use_bn=True):
        super().__init__()
        self.conv = nn.Sequential(
            nn.BatchNorm2d(ch_in),
            nn.ReLU(),
            nn.Conv2d(ch_in, ch_out, kernel_size=1),
            nn.AvgPool2d(kernel_size=2, stride=2)
        )
 
    def forward(self, x):
        return self.conv(x)
 
 
class DenseBlock(nn.Module):
    def __init__(self, k_0, k=12, n=4):
        super().__init__()
        for l in range(n):
            layer = DenseLayer(k_0 + k * l, k)
            self.add_module('denselayer%d' % (l + 1), layer)
 
    def forward(self, x):
        for name, layer in self.named_children():
            x = torch.cat((x, layer(x)), dim=1)
        return x

![alt text](images/densenet_architectures.png "Title")

In [23]:
class DenseNet(nn.Module):
    def __init__(self, ch_in=3, k=12, n_blocks=(6, 12, 24, 16), ch_mid=64, n_classes=1000):
        super().__init__()
        layers = [
            nn.Conv2d(ch_in, ch_mid, kernel_size=(3, 3), padding=1),
            DenseBlock(ch_mid, k=k, n=n_blocks[0])
        ]
        for i in range(len(n_blocks) - 1):
            layers.extend([
                TransitionLayer(ch_mid + k * n_blocks[i], ch_mid),
                DenseBlock(ch_mid, k=k, n=n_blocks[i + 1])
            ])
            
        layers += [
            nn.AdaptiveAvgPool2d((1,1)),
            nn.Flatten(),
            nn.Linear(ch_mid + k * n_blocks[-1], 1000)
        ]
        self.net = nn.Sequential(*layers)
 
    def forward(self, x):
        return self.net(x)
    
# DenseNet()

DenseNet(
  (net): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): DenseBlock(
      (denselayer1): DenseLayer(
        (conv): Sequential(
          (0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (1): ReLU()
          (2): Conv2d(64, 48, kernel_size=(1, 1), stride=(1, 1))
          (3): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (4): ReLU()
          (5): Conv2d(48, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        )
      )
      (denselayer2): DenseLayer(
        (conv): Sequential(
          (0): BatchNorm2d(76, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (1): ReLU()
          (2): Conv2d(76, 48, kernel_size=(1, 1), stride=(1, 1))
          (3): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (4): ReLU()
          (5): Conv2d(48, 12, kernel_size=(3, 3), strid

# SENet

Learn which channels are important, implement Squeeze-and-Excitation module!
![alt text](images/se_module.png "Title")

In [3]:
class SELayer(nn.Module):
    def __init__(self, channel, reduction=16):
        super(SELayer, self).__init__()
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Sequential(
            nn.Linear(channel, channel // reduction, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(channel // reduction, channel, bias=False),
            nn.Sigmoid()
        )

    def forward(self, x):
        b, c, _, _ = x.size()
        y = self.avg_pool(x).view(b, c)
        y = self.fc(y).view(b, c, 1, 1)
        return x * y.expand_as(x)

You can now inject it into any architecture, e.g. into resnet bottleneck blocks and it will boost the performance by quite a margin!
![alt text](images/se_inception_and_resnet.png "Title")
![alt text](images/se_performance.png "Title")

# I can write models and modules and so what?
How to compare your models and modules?
- final network performance (your target metrics)
- inference speed (in milliseconds on specific device)
- FLOPS (rough approximation of real performance)
- number of parameters (== weight of your model)
- speed of training (how many epochs/time your model need to converge)

## Accuracy vs FLOPS

![](images/arch_chart.jpeg)

## Number of parameters

In [129]:
def calc_params(model):
    return sum(p.numel() for p in model.parameters())

def calc_trainable_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

## Convergence speed
Add batch normalization layers to speed up model convergence (up to 10 times speed up in average). Essentially convolution and batch normalization both apply a linear transformation so you can "fuse" batch normalization into the neighbouring convolutional layer, modifying it's weights which will speed up your model inference.

## Practical tips how to solve arbitrary DL problem
1. start from a reasonable baseline (e.g. resnet50)
2. improve target metrics as much as you can (your model may be big or slow, but you will now how good in theory your results should be)
3. if necessary, compress your architecture and try smaller ones. Archieve reasonable speed and try to retain as much accuracy as you can