## Explore which features can be extracted from the DINOv2 backbone, and their dimensionality

In [1]:
import torch
# Keep everything torch.hub downloads (repo snapshot and checkpoints) in a project-local folder
PRETRAINED_WEIGHTS_DIR = "../pretrained_weights"
torch.hub.set_dir(PRETRAINED_WEIGHTS_DIR)

In [2]:
# Fetch the pretrained DINOv2 backbone through torch.hub, move it to the chosen device
# and switch it to eval mode.
# The device is pinned to CPU on purpose; to try a GPU, use
# `0 if torch.cuda.is_available() else "cpu"` instead.
device = "cpu"
dino_backbone = torch.hub.load(
    repo_or_dir="facebookresearch/dinov2",
    model="dinov2_vits14",
)
dino_backbone = dino_backbone.to(device).eval()

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to ../pretrained_weights/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to ../pretrained_weights/checkpoints/dinov2_vits14_pretrain.pth
100%|█████████████████████████████████████████████████████████████████████████████| 84.2M/84.2M [00:00<00:00, 104MB/s]


In [8]:
# Dump the module tree to inspect the patch embedding, the transformer blocks and their widths
architecture_summary = str(dino_backbone)
print(architecture_summary)

DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (n

In [6]:
# First, the default backbone.forward method returns the class token.
# Seed the global generator so the random probe batch is reproducible after a kernel restart.
torch.manual_seed(0)
# Create the batch directly on the target device instead of allocating and then copying.
in_tensor = torch.randn(8, 3, 224, 224, device=device)  # BxCxHxW
# Inference only: no autograd graph is needed for a shape/feature exploration.
with torch.no_grad():
    out = dino_backbone(in_tensor)
out.shape

torch.Size([8, 384])

In [7]:
# The backbone.forward_features method returns a dict with multiple outputs from the final DINO block.
# Entries may be None, so guard before reading .shape.
with torch.no_grad():  # inference only, skip autograd bookkeeping
    out_features = dino_backbone.forward_features(in_tensor)
for name, value in out_features.items():
    shape = None if value is None else value.shape
    print(f"{name}: shape:{shape}")

x_norm_clstoken: shape:torch.Size([8, 384])
x_norm_regtokens: shape:torch.Size([8, 0, 384])
x_norm_patchtokens: shape:torch.Size([8, 256, 384])
x_prenorm: shape:torch.Size([8, 257, 384])
masks: shape:None


In [9]:
# Element-wise check: the normalized class token from forward_features should match
# what the default forward method returned
torch.eq(out, out_features['x_norm_clstoken'])

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])

In [10]:
# The official DINOv2 repo has a function called forward_features_list, but it is not used in their experiments.
# Instead, they use get_intermediate_layers, so let's try that.
# The default configuration for this function is n=1, reshape=False, return_class_token=False, norm=True.
n_last_blocks = 3
with torch.no_grad():  # inference only, skip autograd bookkeeping
    out_layers = dino_backbone.get_intermediate_layers(
        in_tensor,
        n=n_last_blocks,
        reshape=False,
        return_class_token=True,
        norm=True,
    )

In [11]:
# How to read out_layers (one entry per requested block):
#   out_layers[i][0] -> patch tokens of the i-th returned layer
#   out_layers[i][1] -> class token of the i-th returned layer
# Entries are counted forwards within the last n_last_blocks blocks, e.g. with
# n_last_blocks=4, i=0 comes from the 4th-last block and i=3 from the last block.
# With reshape=True the patch tokens would instead be reshaped to the 16x16 patch grid.
first_patch_tokens, first_cls_token = out_layers[0][0], out_layers[0][1]
print(first_patch_tokens.shape)
print(first_cls_token.shape)

torch.Size([8, 256, 384])
torch.Size([8, 384])


In [12]:
# Class token equivalence: the last returned layer's class token should match the default forward output.
# Index -1 (rather than a hard-coded 2) keeps this pointing at the final block if n_last_blocks changes.
out_layers[-1][1] == out

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])

In [13]:
# Patch tokens equivalence: the last returned layer's patch tokens should match forward_features' patch tokens.
# Index -1 (rather than a hard-coded 2) keeps this pointing at the final block if n_last_blocks changes.
out_layers[-1][0] == out_features['x_norm_patchtokens']

tensor([[[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],

        [[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],

        [[True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         ...,
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True],
         [True, True, True,  ..., True, True, True]],