# Model architecture analysis

In [1]:
import os
import sys

import torch
import torchvision.models as models
import torch.nn as nn
import timm
import vit_pytorch 
from torchsummary import summary


# Make the project's `src` package importable: when the working directory is
# not the `emotion_recognition` folder itself, add that sub-folder to sys.path.
current_directory = os.getcwd()
if not current_directory.endswith("emotion_recognition"):
    project_root = os.path.join(current_directory, 'emotion_recognition')
    # Guard against duplicates so re-running this cell leaves sys.path unchanged.
    if project_root not in sys.path:
        sys.path.append(project_root)

try:
    from src import NUMBER_OF_EMOT, MODELS_DIR
    import src.models.architectures as arch
except ModuleNotFoundError:
    print("Ensure that src is added to PATH and restart the kernel")
    print(sys.path)
    # Re-raise after printing the hint: every later cell needs NUMBER_OF_EMOT,
    # MODELS_DIR or arch, so continuing would only surface as a NameError there.
    raise

# Take GPU: the rest of the notebook moves models and inputs to CUDA.
if not torch.cuda.is_available():
    raise RuntimeError("Enable GPU support")
device = torch.device("cuda")

## ResNet50

In [2]:
# ResNet50 with torchvision's default pretrained weights, moved to `device`.
model = models.resnet50(weights = "DEFAULT").to(device)
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" to the output.
summary(model, (3, 224, 224))  # Summary of the model with input size (3, 224, 224)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

We replace the last fully connected (FC) layer so that the network's output size matches the number of emotion classes in our task.

In [3]:
# Replace the classification head with a layer that has NUMBER_OF_EMOT outputs.
# The input width is read from the layer being replaced (2048 for this model)
# instead of being hard-coded, so the cell keeps working for other backbones.
model.fc = nn.Linear(model.fc.in_features, NUMBER_OF_EMOT)
model = model.to(device)

In [4]:
# Smoke test: push one random 224x224 image through the network to confirm the
# input resolution and the replaced head are compatible with the layer sizes.
was_training = model.training
model.eval()  # keep BatchNorm running statistics untouched by the random input
try:
    with torch.no_grad():  # a shape check needs no autograd graph
        model(torch.rand((1, 3, 224, 224)).to(device))
    print("Image size is compatible with layer sizes.")
except RuntimeError as error:
    message = str(error)
    if message.endswith("Output size is too small"):
        print("Image size is too small.")
    elif "shapes cannot be multiplied" in message:
        # Take the text between the first "x" and the next space; presumably
        # the message looks like "(1x2048 and ...)" - verify if PyTorch changes it.
        required_shape = message[message.index("x") + 1:].split(" ")[0]
        print(f"Linear layer needs to have size: {required_shape}")
    else:
        print(f"Error not understood: {message}")
finally:
    # Restore whichever train/eval mode the model was in before the check.
    model.train(was_training)

Image size is compatible with layer sizes.


## ResNeXt50_32x4d

In [5]:
# ResNeXt50 (32x4d) with torchvision's default pretrained weights, moved to `device`.
model = models.resnext50_32x4d(weights = "DEFAULT").to(device)
# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would append a stray "None" to the output.
summary(model, (3, 224, 224))  # Summary of the model with input size (3, 224, 224)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5          [-1, 128, 56, 56]           8,192
       BatchNorm2d-6          [-1, 128, 56, 56]             256
              ReLU-7          [-1, 128, 56, 56]               0
            Conv2d-8          [-1, 128, 56, 56]           4,608
       BatchNorm2d-9          [-1, 128, 56, 56]             256
             ReLU-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          32,768
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [6]:
# Replace the classification head with a layer that has NUMBER_OF_EMOT outputs.
# The input width comes from the layer being replaced (2048 for this model)
# rather than from a hard-coded constant.
model.fc = nn.Linear(model.fc.in_features, NUMBER_OF_EMOT)
model = model.to(device)

In [21]:
# Smoke test: run a single random 224x224 image through the model to check
# that the input size and the new head fit the layer dimensions.
was_training = model.training
model.eval()  # eval mode stops the random input from altering BatchNorm statistics
try:
    with torch.no_grad():  # gradients are irrelevant for a shape check
        model(torch.rand((1, 3, 224, 224)).to(device))
    print("Image size is compatible with layer sizes.")
except RuntimeError as error:
    message = str(error)
    if message.endswith("Output size is too small"):
        print("Image size is too small.")
    elif "shapes cannot be multiplied" in message:
        # Extract the text between the first "x" and the following space;
        # presumably the message contains "(1x<features> and ...)" - TODO confirm.
        required_shape = message[message.index("x") + 1:].split(" ")[0]
        print(f"Linear layer needs to have size: {required_shape}")
    else:
        print(f"Error not understood: {message}")
finally:
    # Put the model back into the mode it had before the check.
    model.train(was_training)

Image size is compatible with layer sizes.


## EfficientNet-B2 (8 emotion classes)

In [22]:
# EfficientNet-B2 from timm, created without pretrained weights.
model = timm.create_model('tf_efficientnet_b2', pretrained=False)
# Replace the classifier with a single Linear layer that has NUMBER_OF_EMOT outputs.
# The input width is read from the existing classifier (1408 for this variant)
# instead of being hard-coded. The original note also listed 1792, 1280 and 1536,
# presumably the widths of other EfficientNet variants - TODO confirm.
model.classifier = nn.Sequential(
    nn.Linear(in_features=model.classifier.in_features, out_features=NUMBER_OF_EMOT)
)
model.to(device)
print(model.__class__.__name__)

EfficientNet


In [23]:
# summary() prints the table itself and returns None, so no print() wrapper is
# used (it would append a stray "None" to the output).
summary(model, (3, 224, 224))  # Summary of the model with input size (3, 224, 224)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
        Conv2dSame-1         [-1, 32, 112, 112]             864
          Identity-2         [-1, 32, 112, 112]               0
              SiLU-3         [-1, 32, 112, 112]               0
    BatchNormAct2d-4         [-1, 32, 112, 112]              64
            Conv2d-5         [-1, 32, 112, 112]             288
          Identity-6         [-1, 32, 112, 112]               0
              SiLU-7         [-1, 32, 112, 112]               0
    BatchNormAct2d-8         [-1, 32, 112, 112]              64
            Conv2d-9              [-1, 8, 1, 1]             264
             SiLU-10              [-1, 8, 1, 1]               0
           Conv2d-11             [-1, 32, 1, 1]             288
          Sigmoid-12             [-1, 32, 1, 1]               0
    SqueezeExcite-13         [-1, 32, 112, 112]               0
           Conv2d-14         [-1, 16, 1

In [24]:
weights_path = os.path.join(MODELS_DIR, "EfficientNetB2", "enet_b2_8_best.pt")
print("Weights obtained from: https://github.com/av-savchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/enet_b2_8_best.pt")
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints from a source you trust.
# NOTE(review): the result is used directly as a model, so the file holds a
# pickled module; recent PyTorch releases that default to weights_only=True
# need weights_only=False passed explicitly here - confirm against your version.
# map_location places the loaded tensors on `device` regardless of where the
# checkpoint was saved.
model = torch.load(weights_path, map_location=device)
# summary() prints the table itself and returns None, so it is not wrapped in print().
summary(model, (3, 260, 260))  # Summary of the model with input size (3, 260, 260)

Weights obtained from: https://github.com/av-savchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/enet_b2_8_best.pt
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
        Conv2dSame-1         [-1, 32, 130, 130]             864
          Identity-2         [-1, 32, 130, 130]               0
              SiLU-3         [-1, 32, 130, 130]               0
    BatchNormAct2d-4         [-1, 32, 130, 130]              64
            Conv2d-5         [-1, 32, 130, 130]             288
          Identity-6         [-1, 32, 130, 130]               0
              SiLU-7         [-1, 32, 130, 130]               0
    BatchNormAct2d-8         [-1, 32, 130, 130]              64
            Conv2d-9              [-1, 8, 1, 1]             264
             SiLU-10              [-1, 8, 1, 1]               0
           Conv2d-11             [-1, 32, 1, 1]             288
          Sigmoid-12             

In [25]:
# Build the EfficientNet-B2 model through the project helper, which returns
# the model together with the device; both notebook variables are rebound here.
model, device = arch.model_creation(
    arch_type="efficientnet_b2",
    weights="affectnet_cat_emot",
)
# Show the classification head of the returned model.
print(model.classifier)

Using CUDA with 1 GPUs
Using CUDA device:NVIDIA GeForce RTX 2080 Ti
Weights obtained from: https://github.com/av-savchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/enet_b2_8_best.pt
Sequential(
  (0): Linear(in_features=1408, out_features=8, bias=True)
  (1): RearrangeLayer()
)


## VIT
