# Model architecture analysis

In [1]:
import os
import sys

import torch
import torchvision.models as models
import torch.nn as nn
import timm
import numpy as np

from torchsummary import summary


# Make the project's `src` package importable: when the notebook is not started
# from a directory whose path ends in "emotion_recognition", add that
# sub-directory of the current working directory to the module search path.
current_directory = os.getcwd()
if not current_directory.endswith("emotion_recognition"):
    sys.path.append(os.path.join(current_directory, 'emotion_recognition'))

try:
    from src import NUMBER_OF_EMOT, MODELS_DIR
    import src.models.architectures as arch
except ModuleNotFoundError:
    print("Ensure that src is added to PATH and restart the kernel")
    print(sys.path)
    # Re-raise after printing the hint: swallowing the error here would only
    # postpone the failure to a confusing NameError on NUMBER_OF_EMOT / arch
    # in a later cell.
    raise

# Take GPU: the rest of the notebook moves models and tensors to `device`,
# so fail early when CUDA is not available.
if not torch.cuda.is_available():
    raise RuntimeError("Enable GPU support")
device = torch.device("cuda")

## ResNet50

In [2]:
# ResNet50 with torchvision's default pretrained weights, moved to the GPU.
model = models.resnet50(weights="DEFAULT").to(device)
# Summary of the model with input size (3, 224, 224).
# torchsummary's summary() prints the table itself; wrapping it in print()
# only appended a stray "None" to the output.
summary(model, (3, 224, 224))

Downloading: "https://download.pytorch.org/models/resnet50-11ad3fa6.pth" to /home/usuaris/imatge/armand.de.asis/.cache/torch/hub/checkpoints/resnet50-11ad3fa6.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 158MB/s] 


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5           [-1, 64, 56, 56]           4,096
       BatchNorm2d-6           [-1, 64, 56, 56]             128
              ReLU-7           [-1, 64, 56, 56]               0
            Conv2d-8           [-1, 64, 56, 56]          36,864
       BatchNorm2d-9           [-1, 64, 56, 56]             128
             ReLU-10           [-1, 64, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          16,384
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

We try replacing the last fully connected (FC) layer in order to adapt the network to our task.

In [3]:
# Swap the classification head for one with NUMBER_OF_EMOT outputs.
# The input width is read from the existing head instead of being hard-coded
# (2048 for this model), so the cell also works for backbones of other widths.
model.fc = nn.Linear(model.fc.in_features, NUMBER_OF_EMOT)
# The new layer is created on the CPU; move the whole model back to the GPU.
model = model.to(device)

In [4]:
# Smoke test: push one random 3x224x224 image through the model and report
# whether the input size and the layer sizes fit together.
try:
    # Only shapes matter here, so skip building the autograd graph.
    with torch.no_grad():
        model(torch.rand((1, 3, 224, 224)).to(device))
    print("Image size is compatible with layer sizes.")
except RuntimeError as error:
    # Work on the message under its own name instead of rebinding the
    # exception variable to a string.
    message = str(error)
    if message.endswith("Output size is too small"):
        print("Image size is too small.")
    elif "shapes cannot be multiplied" in message:
        # Text between the first "x" in the message and the following space;
        # presumably the feature width in a "(AxB and CxD)" style message -
        # TODO confirm against the installed PyTorch version.
        required_shape = message[message.index("x") + 1:].split(" ")[0]
        print(f"Linear layer needs to have size: {required_shape}")
    else:
        print(f"Error not understood: {message}")

Image size is compatible with layer sizes.


Try forward hooks to read the output of an intermediate layer:

In [5]:
# A simple hook class that stores the output of a layer during the forward pass
class Hook():
    """Keep the most recent forward-pass output of ``module``.

    The forward hook is registered on construction. ``output`` is ``None``
    until the module has been called at least once, and afterwards holds the
    value returned by the module's latest forward pass. Call ``close()`` to
    unregister the hook, or use the instance as a context manager so that the
    hook is removed even if the forward pass raises.
    """
    def __init__(self, module):
        # Defined up front so that reading `output` before any forward pass
        # yields None instead of raising AttributeError.
        self.output = None
        self.hook = module.register_forward_hook(self.hook_fn)
    def hook_fn(self, module, input, output):
        """Forward-hook callback: remember the module's output."""
        self.output = output
    def close(self):
        """Unregister the hook; the last captured output stays available."""
        self.hook.remove()
    def __enter__(self):
        return self
    def __exit__(self, exc_type, exc_value, traceback):
        # Returns None, so exceptions raised inside the `with` body propagate.
        self.close()

In [6]:
# Repeat the forward pass a few times and print the shape of the model output
# and of the activation captured by the hook.
for epoch in range(10):
    # Register the hook for the last layer of layer4 (Bottleneck-172)
    handle = Hook(model.layer4[-1])
    try:
        # Run the model
        input_tensor = torch.rand(48, 3, 224, 224).to(device)  # Batch size 48
        # Only shapes are inspected, so no autograd graph is needed.
        with torch.no_grad():
            output = model(input_tensor)
        print(output.size())
        print(handle.output.size())
    finally:
        # Always unregister, otherwise a failed forward pass would leave the
        # hook attached to the model.
        handle.close()

torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])
torch.Size([48, 8])
torch.Size([48, 2048, 7, 7])


## ResNext50_32x4d

In [7]:
# ResNeXt50 (32x4d) with torchvision's default pretrained weights, moved to the GPU.
model = models.resnext50_32x4d(weights="DEFAULT").to(device)
# Summary of the model with input size (3, 224, 224).
# torchsummary's summary() prints the table itself; wrapping it in print()
# only appended a stray "None" to the output.
summary(model, (3, 224, 224))

Downloading: "https://download.pytorch.org/models/resnext50_32x4d-1a0047aa.pth" to /home/usuaris/imatge/armand.de.asis/.cache/torch/hub/checkpoints/resnext50_32x4d-1a0047aa.pth
100%|██████████| 95.8M/95.8M [00:00<00:00, 159MB/s] 


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5          [-1, 128, 56, 56]           8,192
       BatchNorm2d-6          [-1, 128, 56, 56]             256
              ReLU-7          [-1, 128, 56, 56]               0
            Conv2d-8          [-1, 128, 56, 56]           4,608
       BatchNorm2d-9          [-1, 128, 56, 56]             256
             ReLU-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          32,768
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256,

In [8]:
# Swap the classification head for one with NUMBER_OF_EMOT outputs.
# The input width is read from the existing head instead of being hard-coded
# (2048 for this model), so the cell also works for backbones of other widths.
model.fc = nn.Linear(model.fc.in_features, NUMBER_OF_EMOT)
# The new layer is created on the CPU; move the whole model back to the GPU.
model = model.to(device)

In [9]:
# Smoke test: push one random 3x224x224 image through the model and report
# whether the input size and the layer sizes fit together.
try:
    # Only shapes matter here, so skip building the autograd graph.
    with torch.no_grad():
        model(torch.rand((1, 3, 224, 224)).to(device))
    print("Image size is compatible with layer sizes.")
except RuntimeError as error:
    # Work on the message under its own name instead of rebinding the
    # exception variable to a string.
    message = str(error)
    if message.endswith("Output size is too small"):
        print("Image size is too small.")
    elif "shapes cannot be multiplied" in message:
        # Text between the first "x" in the message and the following space;
        # presumably the feature width in a "(AxB and CxD)" style message -
        # TODO confirm against the installed PyTorch version.
        required_shape = message[message.index("x") + 1:].split(" ")[0]
        print(f"Linear layer needs to have size: {required_shape}")
    else:
        print(f"Error not understood: {message}")

Image size is compatible with layer sizes.


## EfficientNet B2 8

In [10]:
# EfficientNet-B2 (TF-ported variant) from timm, without pretrained weights.
model = timm.create_model('tf_efficientnet_b2', pretrained=False)
# Replace the classifier with a NUMBER_OF_EMOT-way head. The input width is
# taken from the existing classifier (1408 for this model) instead of being
# edited by hand; the original note listed 1792, 1280 and 1536 as alternatives,
# presumably the widths of other EfficientNet variants.
model.classifier = nn.Sequential(
    nn.Linear(in_features=model.classifier.in_features, out_features=NUMBER_OF_EMOT)
)
model.to(device)
print(model.__class__.__name__)

EfficientNet


In [11]:
# Summary of the model with input size (3, 224, 224).
# torchsummary's summary() prints the table itself; wrapping it in print()
# only appended a stray "None" to the output.
summary(model, (3, 224, 224))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
        Conv2dSame-1         [-1, 32, 112, 112]             864
          Identity-2         [-1, 32, 112, 112]               0
              SiLU-3         [-1, 32, 112, 112]               0
    BatchNormAct2d-4         [-1, 32, 112, 112]              64
            Conv2d-5         [-1, 32, 112, 112]             288
          Identity-6         [-1, 32, 112, 112]               0
              SiLU-7         [-1, 32, 112, 112]               0
    BatchNormAct2d-8         [-1, 32, 112, 112]              64
            Conv2d-9              [-1, 8, 1, 1]             264
             SiLU-10              [-1, 8, 1, 1]               0
           Conv2d-11             [-1, 32, 1, 1]             288
          Sigmoid-12             [-1, 32, 1, 1]               0
    SqueezeExcite-13         [-1, 32, 112, 112]               0
           Conv2d-14         [-1, 16, 1

In [12]:
weights_path = os.path.join(MODELS_DIR, "EfficientNetB2", "enet_b2_8_best.pt")
print("Weights obtained from: https://github.com/av-savchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/enet_b2_8_best.pt")
# NOTE(security): torch.load unpickles the file, which can execute arbitrary
# code - only load checkpoints from a source you trust.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# refuses fully pickled modules such as this one - confirm against the
# installed version before upgrading.
# map_location places the loaded model on the notebook's device regardless of
# the device it was saved from.
model = torch.load(weights_path, map_location=device)
# Summary of the model with input size (3, 260, 260).
# torchsummary's summary() prints the table itself, so no print() wrapper.
summary(model, (3, 260, 260))

Weights obtained from: https://github.com/av-savchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/enet_b2_8_best.pt
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
        Conv2dSame-1         [-1, 32, 130, 130]             864
          Identity-2         [-1, 32, 130, 130]               0
              SiLU-3         [-1, 32, 130, 130]               0
    BatchNormAct2d-4         [-1, 32, 130, 130]              64
            Conv2d-5         [-1, 32, 130, 130]             288
          Identity-6         [-1, 32, 130, 130]               0
              SiLU-7         [-1, 32, 130, 130]               0
    BatchNormAct2d-8         [-1, 32, 130, 130]              64
            Conv2d-9              [-1, 8, 1, 1]             264
             SiLU-10              [-1, 8, 1, 1]               0
           Conv2d-11             [-1, 32, 1, 1]             288
          Sigmoid-12             

In [13]:
# Build the same architecture through the project's factory function.
efficientnet, device = arch.model_creation(arch_type="efficientnet_b2", weights="affectnet_cat_emot")
# Inspect the head of the model that was just created; the previous version
# printed `model.classifier`, i.e. the model left over from the cell above.
print(efficientnet.classifier)


Using CUDA with 1 GPUs
Using CUDA device:NVIDIA GeForce GTX 1080 Ti
Weights obtained from: https://github.com/av-savchenko/face-emotion-recognition/blob/main/models/affectnet_emotions/enet_b2_8_best.pt
Linear(in_features=1408, out_features=8, bias=True)


## DeiT


In [14]:
# DeiT-tiny built through the project's factory function.
model, device = arch.model_creation(arch_type="deit_tiny", weights="imagenet")

batch_size = 96
for epoch in range(10):
    # Run the model on a random batch of `batch_size` images
    input_tensor = torch.rand(batch_size, 3, 224, 224).to(device)
    # Only output shapes are inspected, so no autograd graph is needed.
    with torch.no_grad():
        # The model returns two tensors here; presumably the class-token and
        # distillation-token predictions - verify against arch.model_creation.
        pred, dist_pred = model(input_tensor)
    print("Epoch ", epoch, ":", sep ="")
    print(pred.size())
    print(dist_pred.size())

Using CUDA with 1 GPUs
Using CUDA device:NVIDIA GeForce GTX 1080 Ti


model.safetensors:   0%|          | 0.00/23.7M [00:00<?, ?B/s]

Epoch 0:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 1:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 2:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 3:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 4:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 5:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 6:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 7:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 8:
torch.Size([96, 8])
torch.Size([96, 8])
Epoch 9:
torch.Size([96, 8])
torch.Size([96, 8])


In [16]:
# Distilled DeiT variants (tiny / small / base) with pretrained timm weights.
model_t = timm.create_model('timm/deit_tiny_distilled_patch16_224.fb_in1k', pretrained=True).to(device)
model_s = timm.create_model('timm/deit_small_distilled_patch16_224.fb_in1k', pretrained=True).to(device)
model_b = timm.create_model('timm/deit_base_distilled_patch16_224.fb_in1k', pretrained=True).to(device)
# Summary of the tiny model with input size (3, 224, 224).
# torchsummary's summary() prints the table itself; wrapping it in print()
# only appended a stray "None" to the output.
summary(model_t, (3, 224, 224))
# NOTE(review): `model` is whatever an earlier cell bound to that name, not one
# of the three models created above - confirm whether model_t was intended.
print(model)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 192, 14, 14]         147,648
          Identity-2             [-1, 196, 192]               0
        PatchEmbed-3             [-1, 196, 192]               0
           Dropout-4             [-1, 198, 192]               0
          Identity-5             [-1, 198, 192]               0
          Identity-6             [-1, 198, 192]               0
         LayerNorm-7             [-1, 198, 192]             384
            Linear-8             [-1, 198, 576]         111,168
          Identity-9           [-1, 3, 198, 64]               0
         Identity-10           [-1, 3, 198, 64]               0
           Linear-11             [-1, 198, 192]          37,056
          Dropout-12             [-1, 198, 192]               0
        Attention-13             [-1, 198, 192]               0
         Identity-14             [-1, 1

In [17]:
import numpy as np
from numpy.linalg import norm

In [18]:
# Distilled DeiT-base from timm with an 8-class head (kept on the CPU).
model = timm.create_model('deit_base_distilled_patch16_224.fb_in1k', pretrained=True, num_classes = 8)

# Learned embeddings/tokens whose shapes we want to inspect.
positional_embeddings = model.pos_embed
dist_token = model.dist_token
cls_token = model.cls_token
print(positional_embeddings.size(), dist_token.size(), cls_token.size(), sep="\n")

# Cosine similarity between the class token and the distillation token (PyTorch).
cos_sim = nn.CosineSimilarity(dim=0, eps=1e-6)
cosine = cos_sim(cls_token.squeeze(), dist_token.squeeze())
print(cosine)

# Same quantity recomputed with NumPy as a cross-check.
cls_token, dist_token = (
    token.squeeze().detach().numpy() for token in (cls_token, dist_token)
)
cosine = np.dot(cls_token, dist_token) / (norm(cls_token) * norm(dist_token))
print(cosine)

torch.Size([1, 198, 768])
torch.Size([1, 1, 768])
torch.Size([1, 1, 768])
tensor(0.6802, grad_fn=<SumBackward1>)
0.68021625


In [19]:
# List every named parameter of the model together with its shape
for name, param in model.named_parameters():
    print(f"Name: {name}\nSize: {param.size()}")

Name: cls_token
Size: torch.Size([1, 1, 768])
Name: pos_embed
Size: torch.Size([1, 198, 768])
Name: dist_token
Size: torch.Size([1, 1, 768])
Name: patch_embed.proj.weight
Size: torch.Size([768, 3, 16, 16])
Name: patch_embed.proj.bias
Size: torch.Size([768])
Name: blocks.0.norm1.weight
Size: torch.Size([768])
Name: blocks.0.norm1.bias
Size: torch.Size([768])
Name: blocks.0.attn.qkv.weight
Size: torch.Size([2304, 768])
Name: blocks.0.attn.qkv.bias
Size: torch.Size([2304])
Name: blocks.0.attn.proj.weight
Size: torch.Size([768, 768])
Name: blocks.0.attn.proj.bias
Size: torch.Size([768])
Name: blocks.0.norm2.weight
Size: torch.Size([768])
Name: blocks.0.norm2.bias
Size: torch.Size([768])
Name: blocks.0.mlp.fc1.weight
Size: torch.Size([3072, 768])
Name: blocks.0.mlp.fc1.bias
Size: torch.Size([3072])
Name: blocks.0.mlp.fc2.weight
Size: torch.Size([768, 3072])
Name: blocks.0.mlp.fc2.bias
Size: torch.Size([768])
Name: blocks.1.norm1.weight
Size: torch.Size([768])
Name: blocks.1.norm1.bias
Size: