# **Efficientnetv2: Smaller models and faster training**

Tan, M., & Le, Q. V. (2021). Efficientnetv2: Smaller models and faster training. arXiv preprint arXiv:2104.00298.

## **Default Setting**

In [1]:
import torch

print("[VERSION]")
print(f"torch: {torch.__version__}")

[VERSION]
torch: 1.9.0+cu102


## **Modeling**

* Ref:
   * (_make_divisible) https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py#L62-L69

   * (SEBlock) https://github.com/d-li14/efficientnetv2.pytorch/blob/main/effnetv2.py#L46-L61

In [2]:
## ========== ==========
## Conv 3x3 block.
## ========== ==========
def conv_3x3_bn(inp_c: int, tar_c: int, stride: int):
    return torch.nn.Sequential(
        torch.nn.Conv2d(inp_c, tar_c, 3, stride, bias = False),
        torch.nn.BatchNorm2d(tar_c),
        torch.nn.Hardswish())

## ========== ==========
## Conv 1x1 block.
## ========== ==========
def conv_1x1_bn(inp_c: int, tar_c: int, stride: int):
    return torch.nn.Sequential(
        torch.nn.Conv2d(inp_c, tar_c, 1, stride, bias = False),
        torch.nn.BatchNorm2d(tar_c),
        torch.nn.Hardswish())

In [3]:
## ========== ==========
## SE block helper.
## ========== ==========
def _make_divisible(v, divisor, min_value = None):
    if min_value is None:
        min_value = divisor
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    ## Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return int(new_v)

## ========== ==========
## SE block class.
## ========== ==========
class SEBlock(torch.nn.Module):
    def __init__(
        self,
        inp_c: int,
        tar_c: int,
        reduction: int = 4, ## i.e. SE0.25 in table 4.
        **kwargs,
    ):
        super(SEBlock, self).__init__(**kwargs)

        self.avg_pool = torch.nn.AdaptiveAvgPool2d(1)
        self.block = torch.nn.Sequential(
            ## Decide the reduction dim from inp_c rather than tar_c.
            torch.nn.Linear(tar_c, _make_divisible(inp_c // reduction, 8)),
            torch.nn.Hardswish(),
            torch.nn.Linear(_make_divisible(inp_c // reduction, 8), tar_c),
            torch.nn.Sigmoid())

    def forward(self, x):
        residual = x

        b, c, _, _ = x.size()
        x = self.avg_pool(x).view(b, c)
        x = self.block(x).view(b, c, 1, 1)

        return x * residual

In [4]:
class FusedMBConv(torch.nn.Module):
    def __init__(
        self,
        inp_c: int,
        tar_c: int,
        stride: int = 1,
        expansion_factor: int = 4,
        **kwargs,
    ):
        super(FusedMBConv, self).__init__(**kwargs)
        assert stride in [1, 2]

        ## Arguments.
        hidden_dim = round(inp_c * expansion_factor)
        self.apply_residual = (stride == 1) and (inp_c == tar_c)

        ## Layers.
        self.block = torch.nn.Sequential(
            ## Conv 3x3.
            torch.nn.Conv2d(inp_c, hidden_dim, 3, stride, 1, bias = False),
            torch.nn.BatchNorm2d(hidden_dim),
            torch.nn.Hardswish(),
            ## SE block.
            ##  - Decide the reduction dim from inp_c rather than tar_c.
            SEBlock(inp_c, hidden_dim),
            ## Conv 1x1.
            torch.nn.Conv2d(hidden_dim, tar_c, 1, bias = False),
            torch.nn.BatchNorm2d(tar_c))

    def forward(self, x):
        if self.apply_residual:
            return x + self.block(x)
        else:
            return self.block(x)


class MBConv(torch.nn.Module):
    def __init__(
        self,
        inp_c: int,
        tar_c: int,
        stride: int = 1,
        expansion_factor: int = 4,
        **kwargs,
    ):
        super(MBConv, self).__init__(**kwargs)
        assert stride in [1, 2]

        ## Arguments.
        hidden_dim = round(inp_c * expansion_factor)
        self.apply_residual = (stride == 1) and (inp_c == tar_c)

        ## Layers.
        self.block = torch.nn.Sequential(
            ## Conv 1x1.
            torch.nn.Conv2d(inp_c, hidden_dim, 1, stride, bias = False),
            torch.nn.BatchNorm2d(hidden_dim),
            torch.nn.Hardswish(),
            ## Depthwise conv 3x3.
            torch.nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups = hidden_dim, bias = False),
            torch.nn.BatchNorm2d(hidden_dim),
            torch.nn.Hardswish(),
            ## SE block.
            ##  - Decide the reduction dim from inp_c rather than tar_c.
            SEBlock(inp_c, hidden_dim),
            ## Conv 1x1.
            torch.nn.Conv2d(hidden_dim, tar_c, 1, bias = False),
            torch.nn.BatchNorm2d(tar_c))

    def forward(self, x):
        if self.apply_residual:
            return x + self.block(x)
        else:
            return self.block(x)

In [5]:
class EfficientNetV2Stage(torch.nn.Module):
    def __init__(
        self,
        block, ## [FusedMBConv, MBConv]
        inp_c: int,
        tar_c: int,
        stride: int,
        num_layers: int,
        expansion_factor: int,
        **kwargs,
    ):
        super(EfficientNetV2Stage, self).__init__(**kwargs)
        
        self.blocks = torch.nn.Sequential(
            block(inp_c, tar_c, stride, expansion_factor),
            *[block(tar_c, tar_c, 1, expansion_factor) for _ in range(num_layers - 1)],
        )

    def forward(self, x):
        return self.blocks(x)

In [6]:
class EfficientNetV2(torch.nn.Module):
    def __init__(
        self,
        model_size: str,
        **kwargs,
    ):
        super(EfficientNetV2, self).__init__(**kwargs)

        ## Get coefficients.
        self.coef = self._get_coef(model_size)

        ## Stem.
        inp_c = _make_divisible(24, 8)
        self.stem = conv_3x3_bn(3, inp_c, 2)
        
        ## Body.
        self.blocks = []
        for (block, tar_c, stride, num_layers, expansion_factor) in self.coef:
            tar_c = _make_divisible(tar_c, 8)
            self.blocks.append(EfficientNetV2Stage(block, inp_c, tar_c, stride, num_layers, expansion_factor))
            inp_c = tar_c
        self.blocks = torch.nn.Sequential(*self.blocks)

        ## Exit.
        self.out_conv = torch.nn.Sequential(
            conv_1x1_bn(inp_c, 1280, 1),
            torch.nn.AdaptiveAvgPool2d((1, 1)))

        self.num_classes = 1000
        self.classifier = torch.nn.Linear(1280, self.num_classes)

    def _get_coef(self, model_size: str):
        model_size = model_size.lower()
        assert model_size in ["s", "m", "l", "xl"]

        ## coef: (block, inp_c, stride, num_layers, expansion_factor)

        ## Latest version (preprint arXiv at Jun 2021) params may be different
        ## with the prevision version (such as May 2021, ...)
        if model_size == "s":
            coef = [[FusedMBConv, 24,  1, 2,  1],
                    [FusedMBConv, 48,  2, 4,  4],
                    [FusedMBConv, 64,  2, 4,  4],
                    [MBConv,      128, 2, 6,  4],
                    [MBConv,      160, 1, 9,  6],
                    [MBConv,      256, 2, 15, 6]]
        elif model_size == "m":
            coef = [[FusedMBConv, 24,  1, 3,  1],
                    [FusedMBConv, 48,  2, 5,  4],
                    [FusedMBConv, 80,  2, 5,  4],
                    [MBConv,      160, 2, 7,  4],
                    [MBConv,      176, 1, 14, 6],
                    [MBConv,      304, 2, 18, 6],
                    [MBConv,      512, 1, 5,  6]]
        elif model_size == "l":
            coef = [[FusedMBConv, 32,  1, 4,  1],
                    [FusedMBConv, 64,  2, 7,  4],
                    [FusedMBConv, 96,  2, 7,  4],
                    [MBConv,      192, 2, 10, 4],
                    [MBConv,      224, 1, 19, 6],
                    [MBConv,      384, 2, 25, 6],
                    [MBConv,      640, 1,  7, 6]]
        else: ## xl
            coef = [[FusedMBConv, 32,  1, 4,  1],
                    [FusedMBConv, 64,  2, 8,  4],
                    [FusedMBConv, 96,  2, 8,  4],
                    [MBConv,      192, 2, 16, 4],
                    [MBConv,      256, 1, 24, 6],
                    [MBConv,      512, 2, 32, 6],
                    [MBConv,      640, 1,  8, 6]]

        return coef


    def forward(self, x):
        x = self.stem(x)
        x = self.blocks(x)
        x = self.out_conv(x)
        x = self.classifier(x.view(x.size(0), -1))
        return x

## **Show Size of Each Models**

* Original Model Size:

   - S: 22M
   - M: 54M
   - L: 120M
   - XL: 208M

In [7]:
foo = EfficientNetV2("xl").to(torch.device("cpu"))
bar = torch.zeros((8, 3, 224, 224)).to(torch.device("cpu"))

foo(bar).size()

torch.Size([32, 1000])

In [10]:
for model_size in ["s", "m", "l", "xl"]:
    ## We don't need to build to calculate params in the model.
    foo = EfficientNetV2(model_size).to(torch.device("cpu"))

    total_params = sum(p.numel() for p in foo.parameters()) / 2 ** 20
    print(f"{'[' + model_size.upper() + ']':<4} # of params: {total_params:.2f}M")

[S]  # of params: 20.52M
[M]  # of params: 52.09M
[L]  # of params: 113.19M
[XL] # of params: 198.67M
