In [1]:
import torch
import torch.nn as nn
from torchsummary import summary
import time

## Dynamic Tanh

In [90]:
class DyT(nn.Module):
    def __init__(self, dims, init_alpha=0.5):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1) * init_alpha)
        self.gamma = nn.Parameter(torch.ones(dims))
        self.beta = nn.Parameter(torch.zeros(dims))

    def forward(self, x):
        return self.gamma * torch.tanh(self.alpha * x) + self.beta

In [91]:
class DyT_wrapper(nn.Module):
    def __init__(self, dims, init_alpha=0.5):
        super().__init__()
        self.dyt = DyT(dims, init_alpha)

    def forward(self, x):
        x = torch.movedim(x, 1,-1)
        x = self.dyt(x)
        x = torch.movedim(x, -1, 1)
        return x

## ConvNeXt

In [94]:
class ConvNeXtBlock2D(nn.Module):
  def __init__(self, dim, layer_scale_init_value=1e-6, drop=0.2, **kwargs):
    super().__init__()

    self.dwconv = nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim)  # depthwise conv

    # self.norm = nn.LayerNorm(dim)
    self.norm = DyT(dim)

    self.pwconv1 = nn.Linear(dim, 4 * dim)
    self.act = nn.GELU()
    self.pwconv2 = nn.Linear(4 * dim, dim)

    self.dropout = nn.Dropout2d(p=drop)

    self.gamma = nn.Parameter(layer_scale_init_value * torch.ones((dim,)), requires_grad=True) if layer_scale_init_value > 0 else None


  def forward(self, x):
    residual = x
    x = self.dwconv(x)

    # Transpose for LayerNorm
    x = x.permute(0, 2, 3, 1)
    x = self.norm(x)

    x = self.pwconv1(x)
    x = self.act(x)
    x = self.pwconv2(x)

    x = self.dropout(x)

    if self.gamma is not None:
        x = self.gamma * x

    # Transpose back to (B, C, H, W)
    x = x.permute(0, 3, 1, 2)
    # no drop path y
    return residual + x

In [108]:
block = ConvNeXtBlock2D(96)
summary(block, (96,112,112), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 96, 112, 112]           4,800
               DyT-2         [-1, 112, 112, 96]               0
            Linear-3        [-1, 112, 112, 384]          37,248
              GELU-4        [-1, 112, 112, 384]               0
            Linear-5         [-1, 112, 112, 96]          36,960
         Dropout2d-6         [-1, 112, 112, 96]               0
Total params: 79,008
Trainable params: 79,008
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 4.59
Forward/backward pass size (MB): 110.25
Params size (MB): 0.30
Estimated Total Size (MB): 115.15
----------------------------------------------------------------


In [96]:
class ConvNext(nn.Module):
    def __init__(self, in_chans=1, dims=[32, 64, 128, 256], stages=[1, 1, 3, 1]):
        super().__init__()

        self.in_chans = in_chans
        self.dims = dims
        self.stages = stages

        self.downsample_layers = nn.ModuleList()  # stem and 3 intermediate downsampling conv layers
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            # myLayerNorm(dims[0], eps=1e-6)
            DyT_wrapper(dims[0])
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                # myLayerNorm(dims[i], eps=1e-6),
                DyT_wrapper(dims[i]),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)

        self.model_layers = nn.ModuleList()
        for i, stage_length in enumerate(stages):
            stage = nn.ModuleList([ConvNeXtBlock2D(dims[i]) for _ in range(stage_length)])
            self.model_layers.append(stage)

        self.pooling = nn.AdaptiveAvgPool2d((1, 1))
        self.flatten = nn.Flatten()
        # self.final_norm = nn.LayerNorm(dims[-1])
        self.final_norm = DyT_wrapper(dims[-1])

    def forward(self, x):
        for i in range(len(self.dims)):
            x = self.downsample_layers[i](x)
            for layer in self.model_layers[i]:
                x = layer(x)

        x = self.pooling(x)
        x = self.flatten(x)
        x = self.final_norm(x)  # Final normalization
        return x

In [97]:
summary(ConvNext(in_chans=3,dims=[96,192,384,768],stages=[3,3,9,3]), (3,224,224), device="cpu")

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 96, 56, 56]           4,704
               DyT-2           [-1, 56, 56, 96]               0
       DyT_wrapper-3           [-1, 96, 56, 56]               0
            Conv2d-4           [-1, 96, 56, 56]           4,800
               DyT-5           [-1, 56, 56, 96]               0
            Linear-6          [-1, 56, 56, 384]          37,248
              GELU-7          [-1, 56, 56, 384]               0
            Linear-8           [-1, 56, 56, 96]          36,960
         Dropout2d-9           [-1, 56, 56, 96]               0
  ConvNeXtBlock2D-10           [-1, 96, 56, 56]               0
           Conv2d-11           [-1, 96, 56, 56]           4,800
              DyT-12           [-1, 56, 56, 96]               0
           Linear-13          [-1, 56, 56, 384]          37,248
             GELU-14          [-1, 56, 

## CSPNet
Follows C2F block from YOLO

In [109]:
## Follows Conv class from ultralytics loosely
class Conv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        # self.c = nn.Linear(in_channels, out_channels)
        self.c = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1)
        self.bn = nn.BatchNorm2d(out_channels)
        self.act = nn.SiLU()

    def forward(self, x):
        # x = torch.movedim(x, 1, -1)
        x = self.c(x)
        # x = torch.movedim(x, -1, 1)
        x = self.bn(x)
        x = self.act(x)
        return x                 

In [110]:
conv = Conv(3, 6)
summary(conv, (3,224,224), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 6, 224, 224]              24
       BatchNorm2d-2          [-1, 6, 224, 224]              12
              SiLU-3          [-1, 6, 224, 224]               0
Total params: 36
Trainable params: 36
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 6.89
Params size (MB): 0.00
Estimated Total Size (MB): 7.46
----------------------------------------------------------------


In [111]:
class CSPStage(nn.Module):
    def __init__(self, in_channels, num_blocks, **kwargs):
        super().__init__()
        csp_channels = in_channels * 2
        self.base_layer = Conv(in_channels, csp_channels)
        self.bottlenecks = nn.ModuleList(
            ConvNeXtBlock2D(in_channels, **kwargs) for _ in range(num_blocks)
        )
        self.transition_layer = Conv((2 + num_blocks) * in_channels, in_channels)

    def forward(self, x):
        x = self.base_layer(x)
        y = list(torch.chunk(x, 2, 1))
        y.extend(block(y[-1]) for block in self.bottlenecks)
        y = torch.cat(y, 1)
        y = self.transition_layer(y)
        return y
        

In [112]:
cspStage = CSPStage(96,1)
summary(cspStage, (96,224,224), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1        [-1, 192, 224, 224]          18,624
       BatchNorm2d-2        [-1, 192, 224, 224]             384
              SiLU-3        [-1, 192, 224, 224]               0
              Conv-4        [-1, 192, 224, 224]               0
            Conv2d-5         [-1, 96, 224, 224]           4,800
               DyT-6         [-1, 224, 224, 96]               0
            Linear-7        [-1, 224, 224, 384]          37,248
              GELU-8        [-1, 224, 224, 384]               0
            Linear-9         [-1, 224, 224, 96]          36,960
        Dropout2d-10         [-1, 224, 224, 96]               0
  ConvNeXtBlock2D-11         [-1, 96, 224, 224]               0
           Conv2d-12         [-1, 96, 224, 224]          27,744
      BatchNorm2d-13         [-1, 96, 224, 224]             192
             SiLU-14         [-1, 96, 2

In [113]:
class CSPConvNext(nn.Module):
    def __init__(self, in_chans=1, dims=[32, 64, 128, 256], stages=[1, 1, 3, 1]):
        super().__init__()

        self.in_chans = in_chans
        self.dims = dims
        self.stages = stages

        self.downsample_layers = nn.ModuleList()  # stem and 3 intermediate downsampling conv layers
        stem = nn.Sequential(
            nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4),
            # myLayerNorm(dims[0], eps=1e-6)
            DyT_wrapper(dims[0])
        )
        self.downsample_layers.append(stem)
        for i in range(3):
            downsample_layer = nn.Sequential(
                # myLayerNorm(dims[i], eps=1e-6),
                DyT_wrapper(dims[i]),
                nn.Conv2d(dims[i], dims[i + 1], kernel_size=2, stride=2),
            )
            self.downsample_layers.append(downsample_layer)

        self.model_layers = nn.ModuleList()
        for i, stage_length in enumerate(stages):
            # stage = nn.ModuleList([ConvNeXtBlock2D(dims[i]) for _ in range(stage_length)])
            stage = CSPStage(dims[i], stage_length)
            self.model_layers.append(stage)

    def forward(self, x):
        outputs = []
        for i in range(len(self.dims)):
            x = self.downsample_layers[i](x)
            x = self.model_layers[i](x)
            outputs.append(x)

        return outputs

In [120]:
summary(CSPConvNext(in_chans=3,dims=[16,32,64,128],stages=[2,2,6,2]).to(device), (3,640,640))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 160, 160]             784
               DyT-2         [-1, 160, 160, 16]               0
       DyT_wrapper-3         [-1, 16, 160, 160]               0
            Conv2d-4         [-1, 32, 160, 160]             544
       BatchNorm2d-5         [-1, 32, 160, 160]              64
              SiLU-6         [-1, 32, 160, 160]               0
              Conv-7         [-1, 32, 160, 160]               0
            Conv2d-8         [-1, 16, 160, 160]             800
               DyT-9         [-1, 160, 160, 16]               0
           Linear-10         [-1, 160, 160, 64]           1,088
             GELU-11         [-1, 160, 160, 64]               0
           Linear-12         [-1, 160, 160, 16]           1,040
        Dropout2d-13         [-1, 160, 160, 16]               0
  ConvNeXtBlock2D-14         [-1, 16, 1

## I modified Ultralytics instead
Building the rest is a headache

In [2]:
import sys
import os

sys.path.append('/home/abk171/ultralytics')

from ultralytics import YOLO

In [3]:
from ultralytics.nn.modules.block import C3k2NeXt, C2f, DyT

In [14]:
summary(C3k2NeXt(16,16,n=3).to('cuda'), (16,224,224), device='cuda')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 16, 224, 224]             256
       BatchNorm2d-2         [-1, 16, 224, 224]              32
              SiLU-3         [-1, 16, 224, 224]               0
              SiLU-4         [-1, 16, 224, 224]               0
              Conv-5         [-1, 16, 224, 224]               0
            Conv2d-6          [-1, 8, 224, 224]             400
               DyT-7          [-1, 224, 224, 8]               0
            Linear-8         [-1, 224, 224, 32]             288
              GELU-9         [-1, 224, 224, 32]               0
           Linear-10          [-1, 224, 224, 8]             264
        Dropout2d-11          [-1, 224, 224, 8]               0
ConvNeXtBottleNeck-12          [-1, 8, 224, 224]               0
           Conv2d-13          [-1, 8, 224, 224]             400
              DyT-14          [-1, 224

In [4]:
YOLO("yolo11-next.yaml")



YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C3k2NeXt(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_runn