# Inspect Darknet53 Model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torchinfo import summary
import sys
sys.path.append('../')
from torch.utils.tensorboard import SummaryWriter

## Inspect Structure & In/Out Shape

In [2]:
from models.darknet53 import Darknet53, Darknet53Config

# Input resolution and number of classes the model is built for.
img_h = 256
img_w = 256
n_class = 1000
model_args = {
    "img_h": img_h,
    "img_w": img_w,
    "n_class": n_class,
}

model_config = Darknet53Config(**model_args)
model = Darknet53(model_config)

# Per-layer summary (shapes, parameter counts, mult-adds) for one dummy image on the CPU.
batch_size = 1
summary(
    model,
    input_size=[(batch_size, 3, img_h, img_w)],
    device='cpu',
    col_names=(
        "input_size",
        "output_size",
        "num_params",
        "kernel_size",
        "mult_adds",
        "trainable",
    ),
    verbose=2,
    depth=4,
    row_settings=("depth", "var_names"),
);

number of parameters: 41.61M
Layer (type (var_name):depth-idx)                  Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds                 Trainable
Darknet53 (Darknet53)                              [1, 3, 256, 256]          [1, 1000]                 --                        --                        --                        True
├─Darknet53Backbone (backbone): 1-1                [1, 3, 256, 256]          [1, 1024, 8, 8]           --                        --                        --                        True
│    └─conv0.conv.weight                                                                               ├─864                     [32, 3, 3, 3]
│    └─conv0.bn.weight                                                                                 ├─32                      [32]
│    └─conv0.bn.bias                                                                                   ├─32                      [3

In [3]:
# Dummy batch used to trace the model; it is reused by the next cell for a plain forward pass.
imgs = torch.randn(batch_size, 3, img_h, img_w)
targets = torch.randint(0, n_class, (batch_size,))

# Export the model graph to TensorBoard. Using the writer as a context manager
# guarantees it is closed (and its event file flushed) even if tracing the graph raises.
with SummaryWriter() as writer:
    writer.add_graph(model, [imgs, targets])

In [4]:
# Forward pass on the dummy batch: the model returns the logits together with the loss.
logits, loss = model(imgs, targets)
# One shape per line: logits first, then the loss.
print(logits.shape, loss.shape, sep="\n")

torch.Size([1, 1000])
torch.Size([])


## Inspect state_dict

In [5]:
import os
import tempfile

# Round-trip the state_dict through a file on disk so the keys printed below
# are exactly what a saved checkpoint contains.
# A temporary directory is used instead of NamedTemporaryFile: a NamedTemporaryFile
# that is still open cannot be reopened by name on every platform (e.g. Windows).
with tempfile.TemporaryDirectory() as temp_dir:
    checkpoint_path = os.path.join(temp_dir, 'state_dict.pth')
    torch.save(model.state_dict(), checkpoint_path)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state_dict holds.
    state_dict = torch.load(checkpoint_path, weights_only=True)

# Print the keys of the state_dict
for key in state_dict.keys():
    print(key)


backbone.conv0.conv.weight
backbone.conv0.bn.weight
backbone.conv0.bn.bias
backbone.conv0.bn.running_mean
backbone.conv0.bn.running_var
backbone.conv0.bn.num_batches_tracked
backbone.conv1.conv.weight
backbone.conv1.bn.weight
backbone.conv1.bn.bias
backbone.conv1.bn.running_mean
backbone.conv1.bn.running_var
backbone.conv1.bn.num_batches_tracked
backbone.stage1_block1.conv1x1.conv.weight
backbone.stage1_block1.conv1x1.bn.weight
backbone.stage1_block1.conv1x1.bn.bias
backbone.stage1_block1.conv1x1.bn.running_mean
backbone.stage1_block1.conv1x1.bn.running_var
backbone.stage1_block1.conv1x1.bn.num_batches_tracked
backbone.stage1_block1.conv3x3.conv.weight
backbone.stage1_block1.conv3x3.bn.weight
backbone.stage1_block1.conv3x3.bn.bias
backbone.stage1_block1.conv3x3.bn.running_mean
backbone.stage1_block1.conv3x3.bn.running_var
backbone.stage1_block1.conv3x3.bn.num_batches_tracked
backbone.conv2.conv.weight
backbone.conv2.bn.weight
backbone.conv2.bn.bias
backbone.conv2.bn.running_mean
backbo

## Inspect gradients

In [6]:
class DebugDependencyDarknet53(Darknet53):
    """Darknet53 variant whose loss is replaced by a dummy one, for debugging dependencies."""

    def __init__(self, loss_img_idx, *args, **kwargs):
        # All remaining arguments are forwarded unchanged to Darknet53.
        super().__init__(*args, **kwargs)
        # Index along the batch dimension of the image the dummy loss is computed from.
        self.loss_img_idx = loss_img_idx

    def _compute_loss(self, logits: Tensor, targets: Tensor) -> Tensor:
        """
        Dummy loss: the sum of every logit of the image at index `loss_img_idx`.
        `targets` is not used.
        Args:
            logits (Tensor): size(N, n_class)
            targets (Tensor): size(N,)
        Returns:
            loss (Tensor): size(,)
        """
        selected_logits = logits[self.loss_img_idx]
        return selected_logits.sum()


To verify that the model does not inadvertently mix information across the batch dimension, use the dummy loss above (which depends only on the outputs of the i-th image), run the backward pass all the way to the input, and check that only the i-th input receives a non-zero gradient.

In [7]:
batch_size = 5
# Draw the batch from a dedicated, seeded generator so the experiment is
# reproducible across kernel restarts without touching the global RNG state.
data_rng = torch.Generator().manual_seed(1337)
# requires_grad=True so gradients can be propagated all the way back to the input images.
X = torch.randn(batch_size, 3, img_h, img_w, generator=data_rng, requires_grad=True)
Y = torch.randint(0, n_class, (batch_size,), generator=data_rng)

In [8]:
print("Setup:")
torch.manual_seed(1337)  # since kaiming_normal_ weight init is random
loss_img_idx = 2
debug_dependency_model = DebugDependencyDarknet53(loss_img_idx, model_config)
# eval mode, since batchnorm in training mode would mix information between the images of a batch
debug_dependency_model.eval()

optimizer_type = 'adam'
learning_rate = 3e-4
beta1 = 0.9
beta2 = 0.999
weight_decay = 5e-4
device_type = 'cpu'
use_fused = False
optimizer = debug_dependency_model.configure_optimizers(optimizer_type, learning_rate, (beta1, beta2), weight_decay, device_type, use_fused)

print("\n\nForward pass:")
logits, loss = debug_dependency_model(X, Y)
print(f"{logits.shape=}")
print(f"{loss=}")

print("\n\nBackward pass & Gradients of X::")
optimizer.zero_grad(set_to_none=True)
X.grad = None  # zero out X gradients
loss.backward()

print(f"{X.grad.shape=}")
for idx_img in range(batch_size):
    print(f"Zero gradient of {idx_img}'th img: {torch.all(X.grad[idx_img]==0)}")

# Turn the visual check into a hard failure: the dummy loss depends only on image
# `loss_img_idx`, so that image must be the only one receiving a non-zero gradient.
imgs_with_grad = [
    idx_img for idx_img in range(batch_size)
    if bool(torch.any(X.grad[idx_img] != 0))
]
assert imgs_with_grad == [loss_img_idx], (
    f"expected a non-zero gradient only for image {loss_img_idx}, "
    f"got non-zero gradients for images {imgs_with_grad}"
)

print("\n\nOptimizer step:")
optimizer.step()
# Only the loss value is needed here, so skip building an autograd graph.
with torch.no_grad():
    loss_after_step = debug_dependency_model(X, Y)[1]
print(f"Loss after optimized: {loss_after_step}")

Setup:
number of parameters: 41.61M
num decayed parameter tensors: 53, with 41,573,216 parameters
num non-decayed parameter tensors: 105, with 36,712 parameters
using fused Adam: False


Forward pass:
logits.shape=torch.Size([5, 1000])
loss=tensor(-5807.6699, grad_fn=<SumBackward0>)


Backward pass & Gradients of X::
X.grad.shape=torch.Size([5, 3, 256, 256])
Zero gradient of 0'th img: True
Zero gradient of 1'th img: True
Zero gradient of 2'th img: False
Zero gradient of 3'th img: True
Zero gradient of 4'th img: True


Optimizer step:
Loss after optimized: -94209768.0


## Inspect Results

In [None]:
# TODO