In [1]:
# import
import torch
import torch.nn as nn

In [2]:
def params(version):
    """
    Look up the scaling factors that belong to a model version.

    version: str, one of 'n', 's', 'm', 'l', 'x'

    Return: depth scaling factor (d), width scaling factor (w), resolution scaling factor (r)

    Raises: ValueError if version is not one of the supported versions
    """
    # version -> (depth, width, resolution) scaling factors
    scaling_factors = {
        'n': (1/3, 1/4, 2.0),
        's': (1/3, 1/2, 2.0),
        'm': (2/3, 3/4, 1.5),
        'l': (1.0, 1.0, 1.0),
        'x': (1.0, 1.25, 1.0),
    }

    try:
        return scaling_factors[version]
    except (KeyError, TypeError):
        # Fail here with a clear message instead of returning None and
        # letting the caller crash while unpacking `d, w, r = params(...)`.
        raise ValueError(
            f"Unknown version {version!r}; expected one of {sorted(scaling_factors)}"
        ) from None

# 1. Backbone

## Conv
![Conv](images/conv.jpg)

### Conv Block

In [3]:
class Conv(nn.Module):
    """
    Convolution block: Conv2d -> BatchNorm2d -> SiLU (or Identity).

    in_c: int, number of input channels
    out_c: int, number of output channels (number of filters)
    k: int, size of the kernel
    s: int, stride of the kernel
    p: int, padding of the kernel
    g: int, number of groups
    act: bool, whether to use activation function SiLU
    """
    def __init__(self, in_c, out_c, k = 3, s = 1, p = 1, g = 1, act = True):
        super().__init__()

        # Convolutional layer, created without a bias term
        self.conv = nn.Conv2d(
            in_channels = in_c,
            out_channels = out_c,
            kernel_size = k,
            stride = s,
            padding = p,
            groups = g,
            bias = False,
        )

        # Normalization layer over the out_c feature maps
        # (eps: added to the denominator for numerical stability,
        #  momentum: used for the running_mean and running_var computation)
        self.bn = nn.BatchNorm2d(out_c, eps = 0.001, momentum = 0.03)

        # Activation: in-place SiLU when requested, otherwise a pass-through
        if act:
            self.act = nn.SiLU(inplace = True)
        else:
            self.act = nn.Identity()


    def forward(self, x):
        # Apply the three stages in order: convolution, normalization, activation
        convolved = self.conv(x)
        normalized = self.bn(convolved)
        return self.act(normalized)
    


# Sanity check (First Convolutional Layer)
if __name__ == "__main__":
    version = 's'
    d, w, r = params(version)

    print("(0):")

    # First convolutional layer, built once and reused for both checks:
    #   input channels: 3
    #   output channels: 64 * width scaling factor (0.5 for version 's')
    #   kernel size: 3
    #   stride: 2
    #   padding: 1
    #   groups: 1
    #   activation: True
    first_conv = Conv(in_c = 3, out_c = int(64*w), k = 3, s = 2, p = 1, g = 1, act = True)
    print(first_conv)

    # Random input used only to check the output shape:
    #   batch size: 1
    #   input channels: 3
    #   image height: 640
    #   image width: 640
    print(first_conv(torch.randn(1, 3, 640, 640)).shape)

(0):
Conv(
  (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
  (act): SiLU(inplace=True)
)
torch.Size([1, 32, 320, 320])


## C2f
![C2f](images/c2f.jpg)

### Bottleneck
![Bottleneck](images/bottleneck.jpg)

In [4]:
class Bottleneck(nn.Module):
    """
    Two stacked 3x3 Conv blocks with an optional residual connection.

    in_c: int, number of input channels
    out_c: int, number of output channels
    shortcut: bool, whether to use a residual connection; the residual is only
              applied when in_c == out_c, because the element-wise add requires
              the input and output to have the same number of channels
    """
    def __init__(self, in_c, out_c, shortcut=True):
        super().__init__()

        # Conv1: first convolutional layer (in_c -> out_c, spatial size preserved)
        self.conv1 = Conv(in_c, out_c, k = 3, s = 1, p = 1)

        # Conv2: second convolutional layer (out_c -> out_c, spatial size preserved)
        self.conv2 = Conv(out_c, out_c, k = 3, s = 1, p = 1)

        # shortcut: a residual connection.
        # Disabled when the channel counts differ; otherwise `x + x_in` in
        # forward() would fail with a shape mismatch.
        self.shortcut = bool(shortcut) and in_c == out_c


    def forward(self, x):
        # Conv1 -> Conv2
        out = self.conv2(self.conv1(x))

        # Shortcut: add the block input back onto the output
        if self.shortcut:
            out = out + x

        return out
    


# Sanity check (First Bottleneck in the First C2f block)
if __name__ == "__main__":
    version = 's'
    d, w, r = params(version)

    print("(1):")

    # Bottleneck built once and reused for both checks:
    #   input channels: 64 * width scaling factor (0.5 for version 's')
    #   output channels: 64 * width scaling factor (0.5 for version 's')
    #   shortcut: True
    bottleneck = Bottleneck(in_c = int(64*w), out_c = int(64*w), shortcut = True)
    print(bottleneck)

    # Random input used only to check the output shape:
    #   batch size: 1
    #   input channels: int(64 * w), i.e. 32 for version 's'
    #   image height: 224
    #   image width: 224
    print(bottleneck(torch.randn(1, int(64*w), 224, 224)).shape)

(1):
Bottleneck(
  (conv1): Conv(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
    (act): SiLU(inplace=True)
  )
  (conv2): Conv(
    (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
    (act): SiLU(inplace=True)
  )
)
torch.Size([1, 32, 224, 224])


### C2f Block

In [7]:
class C2f(nn.Module):
    """
    C2f block: a 1x1 Conv, a channel split, a chain of Bottlenecks applied to
    the first half, concatenation of every intermediate result, and a final
    1x1 Conv.

    in_c: int, number of input channels
    out_c: int, number of output channels
    num_bottlenecks: int, number of bottlenecks
    shortcut: bool, whether to use a residual connection
    """
    def __init__(self, in_c, out_c, num_bottlenecks, shortcut = True):
        super().__init__()

        # Number of channels in the first half of the split; every Bottleneck
        # works on (and produces) this many channels.
        self.mid_channels = out_c // 2
        self.num_bottlenecks = num_bottlenecks

        # Conv1: first convolutional layer (1x1, in_c -> out_c)
        self.conv1 = Conv(in_c, out_c, k = 1, s = 1, p = 0)

        # Bottleneck Sequence
        self.m = nn.ModuleList([Bottleneck(self.mid_channels, self.mid_channels, shortcut) for _ in range(num_bottlenecks)])

        # Conv2: second convolutional layer (1x1, concatenated width -> out_c).
        # The concatenation holds both halves of the split (out_c channels in
        # total) plus one mid_channels-wide tensor per bottleneck. For even
        # out_c this equals (num_bottlenecks + 2) * out_c // 2; computing it
        # from mid_channels also stays correct when out_c is odd.
        concat_channels = out_c + num_bottlenecks * self.mid_channels
        self.conv2 = Conv(concat_channels, out_c, k = 1, s = 1, p = 0)


    def forward(self,x):
        # Conv1
        x = self.conv1(x)

        # Split along the channel dimension: first mid_channels vs. the rest
        x1, x2 = x[:, :self.mid_channels, :, :], x[:, self.mid_channels:, :, :]
        outputs = [x1, x2]

        # Bottleneck Sequence: each bottleneck consumes the previous one's
        # output, and every result is placed at the front of the list
        for bottleneck in self.m:
            x1 = bottleneck(x1)
            outputs.insert(0, x1)

        # Concat along the channel dimension, then Conv2
        return self.conv2(torch.cat(outputs, dim = 1))
    


# Sanity check (First C2f block)
if __name__ == "__main__":
    version = 's'
    d, w, r = params(version)

    print("(2):")

    # C2f block built once and reused for both checks:
    #   input channels: 128 * width scaling factor (0.5 for version 's')
    #   output channels: 128 * width scaling factor (0.5 for version 's')
    #   number of bottlenecks: 1
    #   shortcut: True
    c2f = C2f(in_c = int(128*w), out_c = int(128*w), num_bottlenecks = 1, shortcut = True)
    print(c2f)

    # Random input used only to check the output shape:
    #   batch size: 1
    #   input channels: int(128 * w), i.e. 64 for version 's'
    #   image height: 160
    #   image width: 160
    print(c2f(torch.randn(1, int(128*w), 160, 160)).shape)

(2):
C2f(
  (conv1): Conv(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
    (act): SiLU(inplace=True)
  )
  (m): ModuleList(
    (0): Bottleneck(
      (conv1): Conv(
        (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (conv2): Conv(
        (conv): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
    )
  )
  (conv2): Conv(
    (conv): Conv2d(96, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
    (act): SiLU(inplace=True)
  )
)
to