# Inspect YOLOv2 Model

In [9]:
import io
import os
import sys
from typing import Tuple

import numpy as np
from PIL import Image
from matplotlib import pyplot as plt
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataloader import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import tv_tensors
from torchvision.datasets import VOCDetection, wrap_dataset_for_transforms_v2
from torchvision.ops import box_convert, clip_boxes_to_image, nms, batched_nms
from torchvision.transforms import v2
from torchvision.tv_tensors._dataset_wrapper import VOC_DETECTION_CATEGORIES, VOC_DETECTION_CATEGORY_TO_IDX
from torchvision.utils import draw_bounding_boxes
from torchmetrics.detection import MeanAveragePrecision
from torchinfo import summary

# Make the repository root importable before pulling in the local model package.
sys.path.append('../')
from models.yolov2 import Yolov2Config, Yolov2
# Visualization palette: a list of 20 (R, G, B) integer tuples with values in 0-255.
# Presumably indexed by class id (20 entries matches n_class = 20 used below) when
# drawing bounding boxes -- TODO confirm against the plotting cells.
palette =  [(106, 0, 228), (119, 11, 32), (165, 42, 42), (0, 0, 192),
            (197, 226, 255), (0, 60, 100), (0, 0, 142), (255, 77, 255),
            (153, 69, 1), (120, 166, 157), (0, 182, 199),
            (0, 226, 252), (182, 182, 255), (0, 0, 230), (220, 20, 60),
            (163, 255, 0), (0, 82, 0), (3, 95, 161), (0, 80, 100),
            (183, 130, 88)]

## Inspect structure

In [2]:
# Model hyper-parameters for this inspection run.
img_h, img_w = 448, 448
n_class = 20
n_box_per_cell = 5

model_args = dict(
    img_h=img_h,
    img_w=img_w,
    n_class=n_class,
    n_box_per_cell=n_box_per_cell,
)

# The summary below shows the backbone mapping a 448x448 input to a 14x14 feature
# map, i.e. a total downsampling factor of 32. Derive the target grid from that
# stride so the dummy targets have the same grid size as the logits.
# (The previous `img * 14 // 416` formula yields 15 for a 448 input.)
backbone_stride = 32
n_grid_h, n_grid_w = img_h // backbone_stride, img_w // backbone_stride

model_config = Yolov2Config(**model_args)
model = Yolov2(model_config)

batch_size = 1
# Fall back to the CPU when no GPU is present so the cell runs everywhere.
summary_device = 'cuda' if torch.cuda.is_available() else 'cpu'
summary(model, input_size=[(batch_size, 3, img_h, img_w), (batch_size, n_grid_h, n_grid_w, n_box_per_cell, 6)], device=summary_device,
        col_names=("input_size", "output_size", "num_params", "kernel_size", "mult_adds", "trainable"), verbose=2,
        depth=4, row_settings=("depth", "var_names"));

number of parameters: 67.14M
Layer (type (var_name):depth-idx)             Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds                 Trainable
Yolov2 (Yolov2)                               [1, 3, 448, 448]          [1, 14, 14, 5, 25]        --                        --                        --                        True
├─Darknet19Backbone (backbone): 1-1           [1, 3, 448, 448]          [1, 1024, 14, 14]         --                        --                        --                        True
│    └─conv1.conv.weight                                                                          ├─864                     [32, 3, 3, 3]
│    └─conv1.bn.weight                                                                            ├─32                      [32]
│    └─conv1.bn.bias                                                                              ├─32                      [32]
│    └─conv2.conv.weight   

In [3]:
# Random stand-ins for a batch of images and its encoded targets; only their
# shapes matter for tracing the graph.
imgs = torch.randn(batch_size, 3, img_h, img_w)
targets = torch.randn(batch_size, n_grid_h, n_grid_w, n_box_per_cell, 6)

# The dummy inputs live on the CPU, so the model must be there as well.
model.to('cpu')

# Export the traced graph to TensorBoard. Tracing can raise, so close the writer
# in a `finally` block to make sure the event file is always flushed and released.
writer = SummaryWriter()
try:
    writer.add_graph(model, [imgs, targets])
finally:
    writer.close()

  anchors = torch.tensor(self.config.anchors, dtype=dtype, device=device)  # size(n_box_per_cell, 2)
  loss = torch.tensor(0.0, dtype=dtype, device=device)
  loss_noobj = torch.tensor(0.0, dtype=dtype, device=device)
  loss_obj = torch.tensor(0.0, dtype=dtype, device=device)
  loss_class = torch.tensor(0.0, dtype=dtype, device=device)
  loss_xy = torch.tensor(0.0, dtype=dtype, device=device)
  loss_wh = torch.tensor(0.0, dtype=dtype, device=device)


In [7]:
# Inspection-only forward pass: no gradients are needed, so skip building the
# autograd graph. The trailing outputs are collected under an explicit name
# rather than `_`, because assigning to `_` in a notebook overrides IPython's
# "last output" variable. They are presumably the individual loss terms
# (noobj/obj/class/xy/wh) -- TODO confirm against Yolov2.forward.
with torch.no_grad():
    logits, loss, *_loss_terms = model(imgs, targets)
print(logits.shape)
print(loss.shape)

torch.Size([1, 14, 14, 5, 25])
torch.Size([])


## Inspect state_dict

In [8]:
# Round-trip the state_dict through torch.save / torch.load using an in-memory
# buffer. Re-opening a NamedTemporaryFile by name while it is still open is not
# portable (it fails on Windows), and no file needs to touch the disk here.
with io.BytesIO() as buffer:
    torch.save(model.state_dict(), buffer)
    buffer.seek(0)  # rewind so torch.load reads from the start
    # map_location keeps the loaded tensors on the CPU wherever the model lives;
    # weights_only restricts unpickling to tensors and plain containers.
    state_dict = torch.load(buffer, map_location='cpu', weights_only=True)

# Print the keys of the state_dict
for key in state_dict.keys():
    print(key)


backbone.conv1.conv.weight
backbone.conv1.bn.weight
backbone.conv1.bn.bias
backbone.conv1.bn.running_mean
backbone.conv1.bn.running_var
backbone.conv1.bn.num_batches_tracked
backbone.conv2.conv.weight
backbone.conv2.bn.weight
backbone.conv2.bn.bias
backbone.conv2.bn.running_mean
backbone.conv2.bn.running_var
backbone.conv2.bn.num_batches_tracked
backbone.conv3.conv.weight
backbone.conv3.bn.weight
backbone.conv3.bn.bias
backbone.conv3.bn.running_mean
backbone.conv3.bn.running_var
backbone.conv3.bn.num_batches_tracked
backbone.conv4.conv.weight
backbone.conv4.bn.weight
backbone.conv4.bn.bias
backbone.conv4.bn.running_mean
backbone.conv4.bn.running_var
backbone.conv4.bn.num_batches_tracked
backbone.conv5.conv.weight
backbone.conv5.bn.weight
backbone.conv5.bn.bias
backbone.conv5.bn.running_mean
backbone.conv5.bn.running_var
backbone.conv5.bn.num_batches_tracked
backbone.conv6.conv.weight
backbone.conv6.bn.weight
backbone.conv6.bn.bias
backbone.conv6.bn.running_mean
backbone.conv6.bn.runnin

## Inspect gradients

In [16]:
class DebugDependencyYolov2(Yolov2):
    """Yolov2 with a dummy loss function for debugging batch dependencies.

    The loss depends only on the outputs of a single image in the batch, so any
    non-zero input gradient on another image reveals information leaking across
    the batch dimension.

    Args:
        loss_img_idx: index (along the batch dimension) of the image whose
            outputs are summed to form the dummy loss.
        *args, **kwargs: forwarded unchanged to ``Yolov2.__init__``.
    """
    def __init__(self, loss_img_idx, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_img_idx = loss_img_idx

    def _compute_loss(self, logits: Tensor, targets: Tensor) -> Tuple[Tensor, None, None, None, None, None]:
        """
        Compute the dummy loss as the sum of all outputs of image ``loss_img_idx``.

        ``targets`` is accepted for signature compatibility but is not used.

        Args:
            logits (Tensor): size(N, n_grid_h, n_grid_w, n_box_per_cell, (5 + n_class))
            targets (Tensor): size(N, n_grid_h, n_grid_w, n_box_per_cell, 6)
        Returns:
            A 6-tuple in the same layout as the real loss:
            loss (Tensor): size(,), sum of ``logits[loss_img_idx]``
            loss_noobj: always None
            loss_obj: always None
            loss_class: always None
            loss_xy: always None
            loss_wh: always None
        """
        dummy_loss = torch.sum(logits[self.loss_img_idx])
        return dummy_loss, None, None, None, None, None


To verify that the model does not inadvertently mix information across the batch dimension, use a loss that depends only on the i-th image, run the backward pass all the way to the input, and check that the gradient is non-zero only on the i-th input.

In [17]:
# Inputs for the batch-dependency check: a batch of five random images that
# track gradients, plus random targets of the matching grid shape.
batch_size = 5
X = torch.randn(batch_size, 3, img_h, img_w, requires_grad=True)
# Note: this also rebinds the notebook-level `targets` name; `Y` is an alias
# for the same tensor object.
Y = torch.randn(batch_size, n_grid_h, n_grid_w, n_box_per_cell, 6)
targets = Y

In [18]:
print("Setup:")
torch.manual_seed(1337)  # weight init is random; seed it for a reproducible model
loss_img_idx = 2
debug_dependency_model = DebugDependencyYolov2(loss_img_idx, model_config)
# eval mode: in train mode BatchNorm normalizes with batch statistics, which
# would legitimately couple the images of a batch and defeat the check below.
debug_dependency_model.eval()

optimizer_type = 'adam'
learning_rate = 3e-4
beta1 = 0.9
beta2 = 0.999
weight_decay = 5e-4
device_type = 'cpu'
use_fused = False
optimizer = debug_dependency_model.configure_optimizers(optimizer_type, learning_rate, (beta1, beta2), weight_decay, device_type, use_fused)

print("\n\nForward pass:")
# Collect the unused trailing outputs under an explicit name; assigning to `_`
# in a notebook overrides IPython's "last output" variable.
logits, loss, *_unused_loss_terms = debug_dependency_model(X, Y)
print(f"{logits.shape=}")
print(f"{loss=}")

print("\n\nBackward pass & Gradients of X::")
optimizer.zero_grad(set_to_none=True)
X.grad = None  # zero out X gradients
loss.backward()

print(f"{X.grad.shape=}")
# Iterate over the actual batch held by X rather than a notebook-level counter.
for idx_img in range(X.shape[0]):
    is_zero_grad = torch.all(X.grad[idx_img] == 0)
    print(f"Zero gradient of {idx_img}'th img: {is_zero_grad}")
    # The dummy loss only touches image `loss_img_idx`, so that image alone may
    # receive a non-zero gradient; anything else means cross-batch leakage.
    assert bool(is_zero_grad) == (idx_img != loss_img_idx), (
        f"unexpected gradient pattern for image {idx_img}: zero-gradient={bool(is_zero_grad)}"
    )

print("\n\nOptimizer step:")
optimizer.step()
# Re-evaluating the loss is for display only, so skip building an autograd graph.
with torch.no_grad():
    loss_after_step = debug_dependency_model(X, Y)[1]
print(f"Loss after optimized: {loss_after_step}")

Setup:
number of parameters: 67.14M
num decayed parameter tensors: 22, with 67,116,896 parameters
num non-decayed parameter tensors: 43, with 20,669 parameters
using fused Adam: False


Forward pass:
logits.shape=torch.Size([5, 14, 14, 5, 25])
loss=tensor(-277.3766, grad_fn=<SumBackward0>)


Backward pass & Gradients of X::
X.grad.shape=torch.Size([5, 3, 448, 448])
Zero gradient of 0'th img: True
Zero gradient of 1'th img: True
Zero gradient of 2'th img: False
Zero gradient of 3'th img: True
Zero gradient of 4'th img: True


Optimizer step:
Loss after optimized: -1272555.5


## Inspect output, post-processing, prediction