## Layer 1
### Basic Block 1
Conv2D: in_channels=64, out_channels=64, kernel_size=(3,3)[come easily from paper], stride=1 [not sure where this was stated, believe it is from Table 1 since it says specifically stride of 2 for other lays], bias=True [defaults to True], padding_mode='zeros [default and stated in paper], padding=1 [stated in sources 1 and 2]

BatchNorm2d: num_features=64 [comes from previous out_channel], momentum=0.9 [states 0.9 in paper, but I see other implementations do 0.1], affine=True [defaults to true], eps=1e-5 [defaults to 1e-5]

ReLU: inplace=False [defaults to False]

Conv2D: in_channels=64, out_channels=64, kernel_size=(3,3), stride=1, bias=True, padding_mode='zeros'

BatchNorm2d: num_features=64, momentum=0.9, affine=True, eps=1e-5 

ReLU: inplace=False

### Basic Block 2

Conv2D: in_channels=64, out_channels=64, kernel_size=(3,3), stride=1, bias=True, padding_mode='zeros'

BatchNorm2d: num_features=64, momentum=0.9, affine=True, eps=1e-5 

ReLU: inplace=False 

Conv2D: in_channels=64, out_channels=64, kernel_size=(3,3), stride=1, bias=True, padding_mode='zeros'

BatchNorm2d: num_features=64, momentum=0.9, affine=True, eps=1e-5 

ReLU: inplace=False 

# Questions
why is in_channels start at 64?
What's the correct momentum, 0.9 or 0.1 to use for BN layers? [1,2,3]
Is there bias?

[1] https://debuggercafe.com/implementing-resnet18-in-pytorch-from-scratch/

[2] https://www.kaggle.com/code/ivankunyankin/resnet18-from-scratch-using-pytorch

[3] https://github.com/jimmyyhwu/resnet18-tf2/blob/master/resnet.py

[4] https://pytorch.org/vision/main/models/generated/torchvision.models.resnet18.html

In [7]:
import torch.nn as nn
import torch
from torch import Tensor
from typing import Type

In [2]:
class BasicBlock(nn.Module): # nn.module is basic class for all neural network models
    def __init__(self, in_channels, out_channels, stride, identity_downsample_function):
        super(BasicBlock, self).__init__() # TODO clarify what this is doing
        
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1) # TODO check to make sure bias is False or True
        self.bn1 = nn.BatchNorm2d(out_channels, momentum=0.9) # TODO double check momentum
        self.relu = nn.ReLU()
        
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels, momentum=0.9)
        
        self.identity_downsample_function = identity_downsample_function
        
    def forward(self, x: Tensor) -> Tensor:
        identity = x 
        
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        
        out = self.conv2(out)
        out = self.bn2(out)
        
        # the identity shortcuts can be directly used when the input and output are same dimensions.
        # When dimenions increase (output size is halved), the shortcut is convolved with 1x1 filter w/ stride=2
        if self.identity_downsample_function is not None:
            identity = self.identity_downsample_function(x)
             
        out += identity # this is the power of residual networks
        out = self.relu(out)
        return out
        

In [3]:
class ResNet18(nn.Module):
    def __init__(self, image_channels: int, number_classes: int):
        super(ResNet18, self).__init__()
        
        self.conv1 = nn.Conv2d(in_channels=image_channels, # assuming RGB images
                               out_channels=64, 
                               kernel_size=7, 
                               stride=2, 
                               padding=3) # padding has to equal 3 in order to halve the input dimensions, draw it out on 14x14 image to understand that no other padding amount works
        self.bn1 = nn.BatchNorm2d(64, momentum=0.9)
        self.relu = nn.ReLU()
        
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        self.layer1 = nn.Sequential(
                        BasicBlock(64, 64, stride=1, identity_downsample_function=None),
                        BasicBlock(64, 64, stride=1, identity_downsample_function=None))
        self.layer2 = nn.Sequential(
                        BasicBlock(64, 128, stride=2, identity_downsample_function=self.create_downsample_function(64, 128)),
                        BasicBlock(128, 128, stride=1, identity_downsample_function=None))
        self.layer3 = nn.Sequential(
                        BasicBlock(128, 256, stride=2, identity_downsample_function=self.create_downsample_function(128, 256)),
                        BasicBlock(256, 256, stride=1, identity_downsample_function=None))
        self.layer4 = nn.Sequential(
                        BasicBlock(256, 512, stride=2, identity_downsample_function=self.create_downsample_function(256, 512)),
                        BasicBlock(512, 512, stride=1, identity_downsample_function=None))
            
        self.avgpool = nn.AdaptiveAvgPool2d((1,1))
        self.fc = nn.Linear(512, number_classes)
            
    def forward(self, x):
            x = self.conv1(x)
            x = self.bn1(x)
            x = self.relu(x)
            x = self.maxpool(x)
            
            x = self.layer1(x)
            x = self.layer2(x)
            x = self.layer3(x)
            x = self.layer4(x)
            
            x = self.avgpool(x)
            x = torch.flatten(x, 1)
            x = self.fc(x)
            
            return x
        
    def create_downsample_function(self, in_channels, out_channels):
        return nn.Sequential(
                    nn.Conv2d(in_channels=in_channels,
                              out_channels=out_channels,
                              kernel_size=1, 
                              stride=2),
                    nn.BatchNorm2d(out_channels, momentum=0.9))

In [4]:
model = ResNet18(3,1000)
print(model)

ResNet18(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
  (relu): ReLU()
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
      (relu): ReLU()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.9, affine=True, track_running_stats=True)
      (relu): ReLU()
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
   

In [5]:
total_parameters = sum(p.numel() for p in model.parameters())
print("Total parameters: ", total_parameters)
total_trainable_parameters = sum(
        p.numel() for p in model.parameters() if p.requires_grad)
print("Total trainable parameters: ", total_trainable_parameters)
# official torchvision resnet18 mdoel has 11,689,512

Total parameters:  11694312
Total trainable parameters:  11694312


In [8]:
# running a dummy tensor
tensor = torch.rand([1,3,224,224])
outout = model(tensor)