In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as T
import time
import einops as E

In [22]:
def tokens_to_output(output_type, dense_tokens, cls_token, feat_hw):
    """Convert transformer tokens into the requested output representation.

    Args:
        output_type: One of ``"cls"``, ``"gap"``, ``"dense"`` or ``"dense-cls"``.
        dense_tokens: Patch tokens, consumed as ``(b, h*w, c)`` by the
            ``mean(dim=1)`` and ``"b (h w) c"`` patterns below.
        cls_token: Class token, consumed as ``(b, c)`` by the
            ``[:, :, None, None]`` broadcast. Only required (non-``None``)
            for ``"cls"`` and ``"dense-cls"``.
        feat_hw: ``(h, w)`` size of the token grid. Only read for the
            ``"dense"`` and ``"dense-cls"`` variants.

    Returns:
        * ``"cls"``: ``cls_token`` itself, unchanged.
        * ``"gap"``: mean of ``dense_tokens`` over the token axis.
        * ``"dense"``: contiguous ``(b, c, h, w)`` feature map.
        * ``"dense-cls"``: the dense feature map with the class token
          broadcast to every spatial location and concatenated along the
          channel axis.

    Raises:
        AssertionError: If ``cls_token`` is ``None`` for an output type that
            needs it.
        ValueError: If ``output_type`` is not one of the supported names.
    """
    if output_type == "cls":
        assert cls_token is not None, "output_type 'cls' requires a cls_token"
        output = cls_token
    elif output_type == "gap":
        output = dense_tokens.mean(dim=1)
    elif output_type == "dense":
        h, w = feat_hw
        feat_map = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w)
        output = feat_map.contiguous()
    elif output_type == "dense-cls":
        assert cls_token is not None, "output_type 'dense-cls' requires a cls_token"
        h, w = feat_hw
        feat_map = E.rearrange(dense_tokens, "b (h w) c -> b c h w", h=h, w=w)
        # expand() is a broadcast view; torch.cat copies the data anyway, so
        # materialising the tiled tensor with repeat() first is wasted memory.
        cls_map = cls_token[:, :, None, None].expand(-1, -1, h, w)
        output = torch.cat((feat_map, cls_map), dim=1).contiguous()
    else:
        raise ValueError(
            f"Unknown output_type {output_type!r}; "
            "expected one of 'cls', 'gap', 'dense', 'dense-cls'"
        )

    return output
    

def debug_forward(input_hw=(512, 512), hub_dir='/home/yijing/workspace/torch_cache'):
    """Push a random image through DINOv2 ViT-S/14 block by block and print shapes.

    The image is center-padded to a multiple of the model's patch size,
    tokenised with ``model.prepare_tokens_with_masks`` and run through every
    entry of ``model.blocks``. Each block's output is then split into its
    class token and its trailing spatial tokens, and the spatial tokens are
    reshaped to a dense feature map via ``tokens_to_output``.

    Args:
        input_hw: ``(height, width)`` of the random input image. Defaults to
            the original 512x512 debug input.
        hub_dir: Directory handed to ``torch.hub.set_dir`` for cached
            repositories and weights.

    Returns:
        None. Results are only printed.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Create a mock input batch directly on the target device.
    image = torch.randn(1, 3, *input_hw, device=device)

    torch.hub.set_dir(hub_dir)
    # Load the model.
    model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14').to(device)
    model.eval()
    print(type(model))
    print(dir(model))

    # Prefer the model's own patch size over a hard-coded 14 so the padding
    # and token-grid arithmetic stay in sync with the loaded architecture.
    # NOTE(review): assumes `patch_size` is a plain int — confirm for other backbones.
    patch_size = getattr(model, 'patch_size', 14)

    # Mock of center_padding: pad symmetrically up to a multiple of patch_size.
    def mock_center_padding(x):
        _, _, h, w = x.shape
        pad_h = (patch_size - h % patch_size) % patch_size
        pad_w = (patch_size - w % patch_size) % patch_size

        pad_t = pad_h // 2
        pad_b = pad_h - pad_t
        pad_l = pad_w // 2
        pad_r = pad_w - pad_l

        return F.pad(x, (pad_l, pad_r, pad_t, pad_b))

    # Apply the mock padding.
    x_padded = mock_center_padding(image)
    print('Padded shape:', x_padded.shape)

    # Token grid derived from the padded image instead of a fixed 37x37
    # (which is only correct for a 518x518 padded input with patch size 14).
    grid_h = x_padded.shape[-2] // patch_size
    grid_w = x_padded.shape[-1] // patch_size
    num_spatial = grid_h * grid_w

    embeds = []
    # Inference only: without no_grad every stored block output would keep
    # the whole autograd graph alive.
    with torch.no_grad():
        tokens = model.prepare_tokens_with_masks(x_padded, None)
        for blk in model.blocks:
            tokens = blk(tokens)
            print('x.shape: ', tokens.shape)
            embeds.append(tokens)

    outputs = []
    for x_i in embeds:
        cls_tok = x_i[:, 0]
        print('cls_tok.shape: ', cls_tok.shape)
        # Take the trailing spatial tokens, ignoring cls/register tokens.
        spatial = x_i[:, -num_spatial:]
        x_i = tokens_to_output("dense", spatial, cls_tok, (grid_h, grid_w))
        print('x_i.shape: ', x_i.shape)
        outputs.append(x_i)

# Run the debug routine
debug_forward()

Using cache found in /home/yijing/workspace/torch_cache/facebookresearch_dinov2_main


<class 'dinov2.models.vision_transformer.DinoVisionTransformer'>
['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_compiled_call_impl', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_intermediate_layers_chunked', '_get_intermediate_layers_not_chunked', '_get_name', '_is_full_backward_hook', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe

In [10]:
# NOTE(review): this whole cell is commented out (FeatUp variant of
# debug_forward). The saved output below it ("Padded shape" / "Feature shape")
# presumably comes from an earlier run when the code was still active —
# re-run or clear the output so it matches the cell.
# def debug_forward():
#     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#     # Create a mock input
#     x = torch.randn(1, 3, 512, 512).to(device)

#     torch.hub.set_dir('/home/yijing/workspace/torch_cache')
#     # Load the model
#     model = torch.hub.load("mhamilton723/FeatUp", 'dinov2', trust_repo=True).to(device)

#     # Mock of center_padding
#     def mock_center_padding(x):
#         _, _, h, w = x.shape
#         pad_h = (14 - h % 14) % 14
#         pad_w = (14 - w % 14) % 14
        
#         pad_t = pad_h // 2
#         pad_b = pad_h - pad_t
#         pad_l = pad_w // 2
#         pad_r = pad_w - pad_l
        
#         return F.pad(x, (pad_l, pad_r, pad_t, pad_b))
    
#     # Apply the mock padding
#     x_padded = mock_center_padding(x)
#     print('Padded shape:', x_padded.shape)
    
#     # Forward pass
#     feat = model(x_padded)
#     print('Feature shape:', feat.shape)

# # Run the debug routine
# debug_forward()

Using cache found in /home/yijing/workspace/torch_cache/mhamilton723_FeatUp_main
Using cache found in /home/yijing/workspace/torch_cache/facebookresearch_dinov2_main


Padded shape: torch.Size([1, 3, 518, 518])
Feature shape: torch.Size([1, 384, 592, 592])


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


/home/yijing/.conda/envs/boosting3DOF/lib/python3.9/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: libtorch_cuda_cu.so: cannot open shared object file: No such file or directory
  warn(f"Failed to load image Python extension: {e}")
Using cache found in /home/yijing/workspace/torch_cache/mhamilton723_FeatUp_main
Using cache found in /home/yijing/workspace/torch_cache/facebookresearch_dinov2_main
Padded shape: torch.Size([1, 3, 518, 518])
Feature shape: torch.Size([1, 384, 592, 592])
/home/yijing/.conda/envs/boosting3DOF/lib/python3.9/site-packages/torch/functional.py:534: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3595.)
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]