# **This is where the models are build**

In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch

## Model 1
Model one is a relatively simple model following the alexnet architecture in a small version


In [None]:
class SIMPLE1(nn.Module):
    def __init__(self):
        super(SIMPLE1, self).__init__()

        self.conv1 = nn.Conv2d(3, 32, 3, padding=1, stride = 1, bias=False)
        self.conv1_bn = nn.BatchNorm2d(32)

        self.conv2 = nn.Conv2d(32, 64, 3, padding=1, bias=False)
        self.conv2_bn = nn.BatchNorm2d(64)

        self.conv3 = nn.Conv2d(64, 128, 3, padding=1, bias=False)
        self.conv3_bn = nn.BatchNorm2d(128)

        self.conv4 = nn.Conv2d(128, 164, 3, padding=1, bias=False)
        self.conv4_bn = nn.BatchNorm2d(164)

        self.conv5 = nn.Conv2d(164, 176, 3, padding=1, bias=False)
        self.conv5_bn = nn.BatchNorm2d(176)

        # unchanged: 176 * 8 * 8 = 11264
        self.fc1 = nn.Linear(11264, 200, bias=False)
        self.fc1_bn = nn.BatchNorm1d(200)

    def get_logits(self, x):
        x = (x - 0.5) * 2.0

        #conv 1
        x = F.relu(self.conv1_bn(self.conv1(x)))
        x = F.max_pool2d(x, 2) #256 -> 128

        #conv 2
        x = F.relu(self.conv2_bn(self.conv2(x)))
        x = F.max_pool2d(x, 2) #128 -> 64

        #conv 3
        x = F.relu(self.conv3_bn(self.conv3(x)))
        x = F.max_pool2d(x, 2) #64 -> 32

        # Conv 4
        x = F.relu(self.conv4_bn(self.conv4(x)))
        x = F.max_pool2d(x, 2)   # 32 -> 16

        # Conv 5
        x = F.relu(self.conv5_bn(self.conv5(x)))
        x = F.max_pool2d(x, 2)   # 16 -> 8

        # x is now (batch, 176, 8, 8)
        x = torch.flatten(x, 1)  # (batch, 11264)

        logits = self.fc1_bn(self.fc1(x))
        return logits

    def forward(self, x):
        logits = self.get_logits(x)
        return logits

## **Model2: more efficient FC layer, Dropout added and double convolution blocks**
above the FC layer is using a flattening method, though it works its verry inefficient, in networks like resnet and efficientnet a global averagepooling filter is used. Another interesting feature that comes from alexnet is dropout, which prevents overfitting by dropping parameters by chance in training. One further improvement over the above model is the use of some extra conv layers in between pooling to capture more information, but scaling slower.
**GAP:** Generalizes location (Feature location doesnt matter), perfect for classification, especially of birds where orientation/location doesnt matter. Also reduces parameters by a ton (fc layer goes from millions to thousands). Adaptive pooling is used because it doesnt break on changes before.
**Dropout:** Prevents overfitting by forcing the model to learn form different features and not just depend on one (during training randomly drops features). Potential to use lower dropout for first few epochs then higher later.
**Stageblocks:** Stacks double convolution layers to enrich information before sizing up.


In [64]:
class CLASSIC1(nn.Module):
    def __init__(self):
        super(CLASSIC1, self).__init__()

        # 5 stages with double conv + pooling
        self.stage1 = self.conv_block(3, 32)
        self.stage2 = self.conv_block(32, 64)
        self.stage3 = self.conv_block(64, 128)
        self.stage4 = self.conv_block(128, 256) #adapt to 256 and 512 to conform to memory norms (powers of 2)
        self.stage5 = self.conv_block(256, 512)

        self.gap = nn.AdaptiveAvgPool2d(1)   # Global Average Pooling reduces parameters and betters generalization
        self.dropout = nn.Dropout(p=0.4) # Prevents overfitting, with some reduced probability to allow quicker learning and not over-regularize
        self.fc1 = nn.Linear(512, 200)

    @staticmethod
    def conv_block(in_ch, out_ch): #first building block for conv layers (stack of 2)
        # Standard pattern: Conv -> BN -> ReLU -> Conv -> BN -> ReLU -> MaxPool
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),

            nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(inplace=True),

            nn.MaxPool2d(2)
        )

    def forward(self, x): #replaced forward + logit with just forward
        x = (x - 0.5) * 2.0

        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.stage5(x)
        # x is now (Batch_Size, 512, 1, 1) assuming input was 32x32 (5 max pools)

        x = self.gap(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        logits = self.fc1(x)
        return logits

## MODEL3: RESIDUALS! and some efficientnet magic
This is where the big stuff starts, we start adding residual connections, basically skips that make networks better to train and able to go deeper.
To make this a little easier to work with, instead of defining blocks inside of the class we use extra classes for our different types. Now it becomes clear that every neural network is just a bunch of building blocks that can be stacked on top of eachother, like the first residual block is a mini network by itself.
^CLASSICRES

Now we're gonna go a bit deeper on the developments that based themselves on resnet, changes that will be made are:



In [65]:
class ResidualBlock(nn.Module):
    def __init__(self, in_ch, out_ch):
        super().__init__()
        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_ch)

        # shortcut: identity if channels match, otherwise 1x1 conv
        self.shortcut = nn.Identity() if in_ch == out_ch else nn.Conv2d(in_ch, out_ch, 1, bias=False)

        self.relu = nn.ReLU(inplace=True)
        self.pool = nn.MaxPool2d(2)

    def forward(self, x):
        identity = self.shortcut(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += identity
        out = self.relu(out)
        out = self.pool(out)
        return out

class CLASSICRES(nn.Module):
    def __init__(self, num_classes=200):
        super().__init__()

        # Stage-level residual blocks
        self.stage1 = ResidualBlock(3, 32)       # 256 -> 128
        self.stage2 = ResidualBlock(32, 64)      # 128 -> 64
        self.stage3 = ResidualBlock(64, 128)     # 64 -> 32
        self.stage4 = ResidualBlock(128, 256)    # 32 -> 16
        self.stage5 = ResidualBlock(256, 512)    # 16 -> 8

        # Classifier
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(p=0.4)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = (x - 0.5) * 2.0  # normalize to [-1, 1]

        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.stage5(x)

        x = self.gap(x)                  # (B, 512, 1, 1)
        x = torch.flatten(x, 1)          # (B, 512)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

### Modernizing resnet
We add squeeze and excitations modules, inspired by implementations of efficientnet and SENet. These blocks are used as a sort of attention mechanism where it uses global average to get an idea of the importance of a channel and then expands these channels and weight the original feature maps. This is a cheap improvement

In [66]:
class SEBlock(nn.Module):
    def __init__(self, channels, reduction=16):
        super().__init__()
        self.fc1 = nn.Linear(channels, channels // reduction)
        self.fc2 = nn.Linear(channels // reduction, channels)
        self.activation = nn.SiLU()       # <-- changed from ReLU
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        b, c, h, w = x.size()
        # Squeeze: global average pooling
        y = x.mean(dim=(2, 3))           # (B, C)
        # Excitation: MLP
        y = self.fc2(self.activation(self.fc1(y)))  # (B, C)
        y = self.sigmoid(y).view(b, c, 1, 1)
        # Scale: multiply original feature map
        return x * y


In [67]:
class ResidualBlock(nn.Module):
    def __init__(self, in_ch, out_ch, use_se=False, use_pool=False):
        super().__init__()
        self.use_se = use_se
        self.use_pool = use_pool

        self.conv1 = nn.Conv2d(in_ch, out_ch, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_ch)
        self.conv2 = nn.Conv2d(out_ch, out_ch, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_ch)

        self.shortcut = nn.Identity() if in_ch == out_ch else nn.Conv2d(in_ch, out_ch, 1, bias=False)
        self.act = nn.SiLU(inplace=True)  # <-- changed from ReLU
        if use_pool:
            self.pool = nn.MaxPool2d(2)
        if use_se:
            self.se = SEBlock(out_ch)

    def forward(self, x):
        identity = self.shortcut(x)
        out = self.act(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += identity

        if self.use_se:
            out = self.se(out)

        out = self.act(out)
        if self.use_pool:
            out = self.pool(out)
        return out


In [68]:
class MODERNRES(nn.Module):
    def __init__(self, num_classes=200):
        super().__init__()

        # Stage-level residual blocks
        self.stage1 = ResidualBlock(3, 32, use_se=True)
        self.stage2 = ResidualBlock(32, 64, use_se=True)
        self.stage3 = ResidualBlock(64, 96, use_se=True)
        self.stage4 = ResidualBlock(96, 128, use_se=True)
        self.stage5 = ResidualBlock(128, 160, use_se=True)

        # Classifier
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(p=0.4)
        self.fc = nn.Linear(160, num_classes)

    def forward(self, x):
        x = (x - 0.5) * 2.0  # normalize to [-1, 1]

        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.stage5(x)

        x = self.gap(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

In [69]:
class MODERNRESDEEP(nn.Module):
    def __init__(self, num_classes=200):
        super().__init__()

        # Stage-level residual blocks
        self.stage1 = ResidualBlock(3, 32, use_se=True)
        self.stage2 = ResidualBlock(32, 64, use_se=True, use_pool = False)
        self.stage3 = ResidualBlock(64, 96, use_se=True)
        self.stage4 = ResidualBlock(96, 128, use_se=True, use_pool = False)
        self.stage5 = ResidualBlock(128, 160, use_se=True)
        self.stage6 = ResidualBlock(160, 192, use_se=True, use_pool = False)
        self.stage7 = ResidualBlock(192, 224, use_se=True)
        self.stage8 = ResidualBlock(224, 256, use_se=True, use_pool = False)
        self.stage9 = ResidualBlock(256, 288, use_se=True)

        # Classifier
        self.gap = nn.AdaptiveAvgPool2d(1)
        self.dropout = nn.Dropout(p=0.4)
        self.fc = nn.Linear(288, num_classes)

    def forward(self, x):
        x = (x - 0.5) * 2.0  # normalize to [-1, 1]

        x = self.stage1(x)
        x = self.stage2(x)
        x = self.stage3(x)
        x = self.stage4(x)
        x = self.stage5(x)
        x = self.stage6(x)
        x = self.stage7(x)
        x = self.stage8(x)
        x = self.stage9(x)

        x = self.gap(x)
        x = torch.flatten(x, 1)
        x = self.dropout(x)
        logits = self.fc(x)
        return logits

In [70]:
# Imports
import torch
import torch.nn as nn
from torchsummary import summary  # pip install torchsummary

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Initialize and move model to device
print_model = MODERNRESDEEP()

# Print model summary
# Input shape: 3 channels, 256x256 image
summary(print_model, (3, 256, 256))


Using device: cpu
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 32, 256, 256]              96
            Conv2d-2         [-1, 32, 256, 256]             864
       BatchNorm2d-3         [-1, 32, 256, 256]              64
              SiLU-4         [-1, 32, 256, 256]               0
            Conv2d-5         [-1, 32, 256, 256]           9,216
       BatchNorm2d-6         [-1, 32, 256, 256]              64
            Linear-7                    [-1, 2]              66
              SiLU-8                    [-1, 2]               0
            Linear-9                   [-1, 32]              96
          Sigmoid-10                   [-1, 32]               0
          SEBlock-11         [-1, 32, 256, 256]               0
             SiLU-12         [-1, 32, 256, 256]               0
    ResidualBlock-13         [-1, 32, 256, 256]               0
           Conv2d-14 