In [1]:
import torch
from torch import nn
from torch import Tensor
from typing import Type, Optional, Union, List

## Resnet: Overall Architecture and Specifications

<img src="images/Resnet_architecture.png" width=420, height=1000>

Notes while implementing Resnet
- In the first block of each layer (except for the first block), the first convolutional layer is with stride 2 -> decrease the feature map, increase the number of channels
   - Why? Because after each layer, the size of feature map is reduced by 2
- Each layer has one block in which we have to downsample the input (make the input have the same channels at the output to concatenate), which is the first block in each layer

<!-- ![Resnet-layers](images/Resnet.png) -->
Specification of the number of blocks and layers from each block of different variations of Resnet
<img src="images/Resnet_layers_specs.png" width=800, height=400 />

## Block, a basic Unit in Resnet

- The first block we're going to implement is a **BasicBlock**, which consist of only 2 convolutional layers
- This type of block is used in Resnet18 and Resnet34
- The number of channels in the block's input and output doesn't change

In [None]:
class BasicBlock (nn.Module) :
    def __init__(self, 
                 in_channels: int, 
                 out_channels: int, 
                 downsample: Optional[nn.Sequential] = None,
                 stride: int = 1) -> None:
        super(BasicBlock, self).__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride
        # self.expansion = 4 -> there's no expansion of layers in BasicBlock

        self.conv1 = nn.Conv2d (in_channels=self.in_channels, 
                                out_channels=self.out_channels, 
                                kernel_size=3, stride=self.stride)  # stride = 2 only first convolutional layer of first block, except first block
        self.batch_norm1 = nn.BatchNorm2d(self.out_channels)
        self.conv2 = nn.Conv2d (in_channels=self.out_channels,
                                out_channels=self.out_channels,
                                kernel_size=3)
        self.batch_norm2 = nn.BatchNorm2d(self.out_channels)

        self.relu = nn.ReLU(inplace=True)       # inplace: directly modify the input without allocating memory for the output
        
        self.downsample = downsample

    def forward (self, X: Tensor) -> Tensor:
        identity = X

        feat_map1 = self.conv1(X)
        norm1 = self.batch_norm1(feat_map1)
        activation1 = self.relu(norm1)

        feat_map2 = self.conv2(activation1)
        norm2 = self.batch_norm2(feat_map2)
        # activation2 = self.relu(norm2)        # the last output doesn't need ReLU
        output = norm2                          # redundant, included for clear interpretation

        if self.downsample != None:
            identity = self.downsample(identity)
        output += identity
        
        return output

- The second kind of block is **BottleNeck**, which includes three convolutional layers, two with 1x1 kernels, and one with a 3x3 kernel
- The first convolutional layer (with the 1x1 kernel) has stride=2
- The number of channels of the input and output of the block is different in the first block of each "layer" (output channels = input channels x 4); in the successive blocks, this number is similar in both the input and output
    - So to concate the identity matrix (input of each block) to the output, we have to first check if they are already at the same size and have the same channels
    - If that is not the case, we have to **downsample** the input (the identity matrix), the downsample function is passed from the Resnet model

In [None]:
class BottleNeck (nn.Module):
    def __init__(self, 
                 in_channels: int, 
                 out_channels: int,
                 downsample: Optional[nn.Sequential] = None,
                 stride: int = 1) -> None:
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.stride = stride
        self.expansion = 4

        self.conv1 = nn.Conv2d(in_channels=self.in_channels,
                               out_channels=self.out_channels,
                               kernel_size=1, stride=stride)        # stride of the first convolutional layer can be either 1 or 2
        self.batch_norm1 = nn.BatchNorm2d(num_features=self.out_channels)
        self.conv2 = nn.Conv2d(in_channels=self.out_channels,
                               out_channels=self.out_channels,
                               kernel_size=3, stride=1, padding=1)
        self.batch_norm2 = nn.BatchNorm2d(num_features=self.out_channels)
        self.conv3 = nn.Conv2d(in_channels=self.out_channels,
                               out_channels=self.out_channels*self.expansion,
                               kernel_size=1)                       # default stride is 1
        self.batch_norm3 = nn.BatchNorm2d(num_features=self.out_channels*self.expansion)
        self.relu = nn.ReLU(inplace=True)

        self.downsample = downsample

    def forward(self, X: Tensor) -> Tensor:
        identity = X

        feat_map1 = self.conv1(X)
        norm1 = self.batch_norm1(feat_map1)
        activation1 = self.relu(norm1)

        feat_map2 = self.conv2(activation1)
        norm2 = self.batch_norm2(feat_map2)
        activation2 = self.relu(norm2)

        feat_map3 = self.conv3(activation2)
        norm3 = self.batch_norm3(feat_map3)
        output = norm3

        if self.downsample is not None:
            identity = self.identity()
        output += identity

        return output

In [2]:
class Resnet (nn.Module):
    def __init__(self, 
                 block_t: Type[Union[BasicBlock, BottleNeck]], 
                 n_blocks: List[int],
                 n_classes: int = 1000) -> None:
        super().__init__()
        self.n_blocks = n_blocks
        self.block = block_t
        self.expansion = self.block.expansion
        self.in_channels = 64          # every variant of Resnet has an input size of 64 for the first layer

        self.layer1 = self._make_layer (block=self.block, n_blocks=self.n_blocks[0], 
                                        in_channels=self.in_channels, out_channels=64, 
                                        stride=1)    # the first layer have stride 1 -> no cchange in the output feature map's size
        self.layer2 = self._make_layer (block=self.block, n_blocks=self.n_blocks[1], 
                                        in_channels=self.in_channels, out_channels=128,
                                        stride=2)    # following layers have stride 2 (feature map's size reduce after each layer)
        self.layer3 = self._make_layer (block=self.block, n_blocks=self.n_blocks[2], 
                                        in_channels=self.in_channels, out_channels=256,
                                        stride=2)
        self.layer4 = self._make_layer (block=self.block, n_blocks=self.n_blocks[3], 
                                        in_channels=self.in_channels, out_channels=512,
                                        stride=2)

    def _make_layer (self, 
                     block: Type[Union[BasicBlock, BottleNeck]],
                     n_blocks: int,
                     in_channels: int, 
                     out_channels: int, 
                     stride: int):
        layers = []
        # if stride = 1 -> there's no decrease in feature map -> no need for downsampling
        # if in_channels = out_channels*4 -> we are not in the first block -> no need for downsampling, only do downsampling for the first block
        # -> no need for downsampling
        if stride != 1 or in_channels != out_channels*self.expansion:   
            downsample = nn.Sequential(nn.Conv2d (in_channels=in_channels,             # reduce the feature map by stride (2) -> no padding, stride=stride
                                                  out_channels=out_channels*self.expansion,
                                                  kernel_size=1, stride=stride, padding=0))
        layers.append(block(in_channels=in_channels, out_channels=out_channels, downsample=downsample))
        for _ in range (1, n_blocks):
            layers.append(block(in_channels=in_channels, out_channels=out_channels))
        
        return nn.Sequential(*layers)

    def forward ():
        pass

SyntaxError: invalid syntax. Perhaps you forgot a comma? (1491111416.py, line 14)

Additional notes:
- Conv1x1 is usually used for changing the number of channels
- A ReLU layer with the same input size can be used for multiple layers since it doesn't have learnable parameters
- Default value of: padding=0, stride=1, dilation=1
