# Inspect Vgg16 Model

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
from torchinfo import summary
import sys
sys.path.append('../')
from torch.utils.tensorboard import SummaryWriter

In [2]:
from models.vgg16 import Vgg16Config, Vgg16

# Input resolution and number of output classes passed to the model config.
img_h, img_w, n_class = 224, 224, 1000
model_args = {
    "img_h": img_h,
    "img_w": img_w,
    "n_class": n_class,
}

model_config = Vgg16Config(**model_args)
model = Vgg16(model_config)

# Print a per-layer summary for a single 3-channel image on CPU.
# The trailing ';' keeps the notebook from echoing the call's return value.
batch_size = 1
summary(
    model,
    input_size=[(batch_size, 3, img_h, img_w)],
    device='cpu',
    col_names=("input_size", "output_size", "num_params", "kernel_size", "mult_adds", "trainable"),
    verbose=2,
    depth=4,
    row_settings=("depth", "var_names"),
);

number of parameters: 138.37M
Layer (type (var_name):depth-idx)             Input Shape               Output Shape              Param #                   Kernel Shape              Mult-Adds                 Trainable
Vgg16 (Vgg16)                                 [1, 3, 224, 224]          [1, 1000]                 --                        --                        --                        True
├─VGG (model): 1-1                            [1, 3, 224, 224]          [1, 1000]                 --                        --                        --                        True
│    └─features.0.weight                                                                          ├─1,728                   [64, 3, 3, 3]
│    └─features.0.bias                                                                            ├─64                      [64]
│    └─features.1.weight                                                                          ├─64                      [64]
│    └─features.1.bias    

In [3]:
# Dummy batch used only to trace the model graph (and reused by the next cell).
imgs = torch.randn(batch_size, 3, img_h, img_w)
targets = torch.randint(0, n_class, (batch_size,))

# Use the writer as a context manager so it is closed (and its event file
# flushed) even if tracing the graph raises.
with SummaryWriter() as writer:
    writer.add_graph(model, [imgs, targets])

In [4]:
# Run one forward pass with targets and show the shape of each returned tensor.
logits, loss = model(imgs, targets)
for output in (logits, loss):
    print(output.shape)

torch.Size([1, 1000])
torch.Size([])


## Inspect state_dict

In [5]:
import os
import tempfile

# Round-trip the state_dict through a temporary file on disk.
# A path inside a TemporaryDirectory is used instead of NamedTemporaryFile:
# re-opening a still-open NamedTemporaryFile by name is platform dependent
# (it fails on Windows), whereas a plain path in a temp directory works everywhere.
with tempfile.TemporaryDirectory() as temp_dir:
    state_dict_path = os.path.join(temp_dir, 'vgg16_state_dict.pth')
    torch.save(model.state_dict(), state_dict_path)
    state_dict = torch.load(state_dict_path)

# Print the keys of the state_dict
for key in state_dict:
    print(key)


model.features.0.weight
model.features.0.bias
model.features.1.weight
model.features.1.bias
model.features.1.running_mean
model.features.1.running_var
model.features.1.num_batches_tracked
model.features.3.weight
model.features.3.bias
model.features.4.weight
model.features.4.bias
model.features.4.running_mean
model.features.4.running_var
model.features.4.num_batches_tracked
model.features.7.weight
model.features.7.bias
model.features.8.weight
model.features.8.bias
model.features.8.running_mean
model.features.8.running_var
model.features.8.num_batches_tracked
model.features.10.weight
model.features.10.bias
model.features.11.weight
model.features.11.bias
model.features.11.running_mean
model.features.11.running_var
model.features.11.num_batches_tracked
model.features.14.weight
model.features.14.bias
model.features.15.weight
model.features.15.bias
model.features.15.running_mean
model.features.15.running_var
model.features.15.num_batches_tracked
model.features.17.weight
model.features.17.bia

## Inspect gradients

In [6]:
class DebugDependencyVgg16(Vgg16):
    """Vgg16 variant whose loss is a dummy value, used to debug batch dependencies.

    The loss depends only on the logits of one chosen image, so backpropagating
    it reveals which inputs influence that image's output.
    """

    def __init__(self, loss_img_idx, *args, **kwargs):
        """
        Args:
            loss_img_idx: index (along the batch dimension) of the image whose
                logits form the dummy loss.
            *args, **kwargs: forwarded unchanged to ``Vgg16.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.loss_img_idx = loss_img_idx

    def _compute_loss(self, logits: Tensor, targets: Tensor) -> Tensor:
        """
        Return the dummy loss: the sum of all logits of image ``loss_img_idx``.
        Args:
            logits (Tensor): size(N, n_class)
            targets (Tensor): size(N,); accepted for interface compatibility, not used
        Returns:
            loss (Tensor): size(,)
        """
        selected_logits = logits[self.loss_img_idx]
        return selected_logits.sum()


To verify that the model does not inadvertently mix information across the batch dimension, run the backward pass all the way to the input and ensure that only the i-th input receives a non-zero gradient.

In [7]:
batch_size = 5
# Draw the debug batch from a dedicated, seeded generator so the inputs (and the
# loss values printed below) are reproducible without touching the global RNG state.
input_rng = torch.Generator().manual_seed(0)
# X requires grad so the backward pass can be run all the way to the input.
X = torch.randn(batch_size, 3, img_h, img_w, generator=input_rng, requires_grad=True)
Y = torch.randint(0, n_class, (batch_size,), generator=input_rng)

In [8]:
print("Setup:")
torch.manual_seed(1337)  # kaiming_normal_ weight init is random; seed it for reproducibility
loss_img_idx = 2
debug_dependency_model = DebugDependencyVgg16(loss_img_idx, model_config)
# Eval mode is required here: in train mode batchnorm would mix variables
# between the images of a batch, which would invalidate the check.
debug_dependency_model.eval()

optimizer_type = 'adam'
learning_rate = 3e-4
beta1 = 0.9
beta2 = 0.999
weight_decay = 5e-4
device_type = 'cpu'
use_fused = False
optimizer = debug_dependency_model.configure_optimizers(
    optimizer_type, learning_rate, (beta1, beta2), weight_decay, device_type, use_fused
)

print("\n\nForward pass:")
logits, loss = debug_dependency_model(X, Y)
print(f"{logits.shape=}")
print(f"{loss=}")

print("\n\nBackward pass & Gradients of X::")
optimizer.zero_grad(set_to_none=True)
X.grad = None  # zero out X gradients
loss.backward()

print(f"{X.grad.shape=}")
# One flag per image: True when every gradient entry of that image is exactly zero.
is_zero_grad = [bool(torch.all(X.grad[idx_img] == 0)) for idx_img in range(batch_size)]
for idx_img, is_zero in enumerate(is_zero_grad):
    print(f"Zero gradient of {idx_img}'th img: {is_zero}")

# Fail loudly instead of relying on a visual check of the printed flags:
# only the image that feeds the dummy loss may receive a non-zero gradient.
expected_zero_grad = [idx_img != loss_img_idx for idx_img in range(batch_size)]
assert is_zero_grad == expected_zero_grad, (
    f"gradient leaked across the batch dimension: zero-gradient flags {is_zero_grad}, "
    f"expected {expected_zero_grad}"
)

print("\n\nOptimizer step:")
optimizer.step()
# The loss is only reported here, so skip building an autograd graph for it.
with torch.no_grad():
    loss_after_step = debug_dependency_model(X, Y)[1]
print(f"Loss after optimized: {loss_after_step}")

Setup:
number of parameters: 138.37M
num decayed parameter tensors: 16, with 138,344,128 parameters
num non-decayed parameter tensors: 42, with 21,864 parameters
using fused Adam: False


Forward pass:
logits.shape=torch.Size([5, 1000])
loss=tensor(4.0809, grad_fn=<SumBackward0>)


Backward pass & Gradients of X::
X.grad.shape=torch.Size([5, 3, 224, 224])
Zero gradient of 0'th img: True
Zero gradient of 1'th img: True
Zero gradient of 2'th img: False
Zero gradient of 3'th img: True
Zero gradient of 4'th img: True


Optimizer step:
Loss after optimized: -708.4712524414062
