In [None]:
# load libs 

from pathlib import Path
import torch 
from tqdm import tqdm
import timm 
import os
import csv
import pickle

from dinov2_ood_utilities.custom_datasets import CustomizedImageFolder, CustomizedImageFolderForImagenetV2 

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the general wnid-to-index mapping: the position of a wnid in the list is its class index.

class_to_index_mapping = []

# newline='' leaves newline handling to the csv module, as the csv documentation recommends.
with open('../resources/imagenet_train_class_to_index_mapping.csv', 'r', newline='') as class_index_table:
    class_index_reader = csv.reader(class_index_table, delimiter=';')
    for row in class_index_reader:
        # csv.reader yields an empty list for a blank line; skip it instead of
        # failing on the two-value unpacking below.
        if not row:
            continue
        # Every remaining row must hold exactly two ';'-separated fields;
        # only the first one (the wnid) is kept.
        inet_class, _ = row
        class_to_index_mapping.append(inet_class)


In [2]:
# prepare datasets and dataloaders 

# Build the evaluation transform from timm's data config, overriding only the input resolution.
# NOTE(review): the config is resolved from the plain model-name string; the recorded output
# shows 224 px / crop_pct 0.875 before the override - confirm these are the intended values.
timm_model = 'vit_small_patch14_dinov2'
timm_model_conf = timm.data.resolve_model_data_config(timm_model)
print(f'Before change: {timm_model_conf}')

# Switch the input size to 518x518; every other entry of the resolved config is kept.
timm_model_conf.update(input_size=(3, 518, 518))
timm_transform = timm.data.create_transform(is_training=False, **timm_model_conf)

print(f'Following transform will be applied: {timm_transform}')
print(timm_model_conf)


# Read the ImageNet-1k labels: the first whitespace-separated token of every line.
with open('../resources/imagenet_1k_label_order.txt', 'r') as label_order_file:
    # Blank lines (e.g. an empty line at the end of the file) are skipped;
    # `split()[0]` would raise an IndexError on them.
    inet_1k_labels = [label_order_line.split()[0]
                      for label_order_line in label_order_file
                      if label_order_line.strip()]

# datasets
#
# All datasets share the same evaluation transform. The class lists are forwarded as
# `not_processed_imagenet_classes` to the project-specific dataset classes from
# dinov2_ood_utilities.custom_datasets (see there for the exact meaning of that argument).

inet_v2_70 = CustomizedImageFolderForImagenetV2(not_processed_imagenet_classes=inet_1k_labels,
                                                root='/home/stud/afroehli/datasets/ImagenetV2/imagenetv2-threshold0.7-format-val', 
                                                transform=timm_transform)
inet_v2_mf = CustomizedImageFolderForImagenetV2(not_processed_imagenet_classes=inet_1k_labels,
                                                root='/home/stud/afroehli/datasets/ImagenetV2/imagenetv2-matched-frequency-format-val', 
                                                transform=timm_transform)
inet_v2_top = CustomizedImageFolderForImagenetV2(not_processed_imagenet_classes=inet_1k_labels,
                                                 root='/home/stud/afroehli/datasets/ImagenetV2/imagenetv2-top-images-format-val', 
                                                 transform=timm_transform)
inet_1k_val_resized = CustomizedImageFolder(not_processed_imagenet_classes=inet_1k_labels, 
                                            root='/home/stud/afroehli/datasets/ImageNet1k/imagenet1k/ILSVRC/Data/CLS-LOC/val_sorted', 
                                            transform=timm_transform)

inet_1k_train = CustomizedImageFolder(not_processed_imagenet_classes=inet_1k_labels, 
                                      root='/home/stud/afroehli/datasets/ImageNet1k/imagenet1k/ILSVRC/Data/CLS-LOC/train', 
                                      transform=timm_transform)

# Path.resolve() returns a new path object instead of modifying the path in place,
# so its result has to be assigned (the bare call was a no-op).
inet_r_path = Path('/home/stud/afroehli/datasets/ImagenetR_orig/imagenet-r').resolve()
# The ImageNet-R class list is taken from the entries of the dataset root.
# NOTE(review): os.listdir also returns plain files located in the root (if any) -
# confirm that CustomizedImageFolder tolerates non-directory entries.
inet_r_labels = os.listdir(inet_r_path)
inet_r = CustomizedImageFolder(not_processed_imagenet_classes=inet_r_labels, root=inet_r_path, transform=timm_transform)

# dataloaders
# Every loader uses the same settings; shuffling is disabled so samples keep the dataset order.
loader_settings = dict(shuffle=False, batch_size=128, num_workers=8, pin_memory=True)

inet_1k_val_loader = torch.utils.data.DataLoader(dataset=inet_1k_val_resized, **loader_settings)
inet_v2_70_loader = torch.utils.data.DataLoader(dataset=inet_v2_70, **loader_settings)
inet_v2_mf_loader = torch.utils.data.DataLoader(dataset=inet_v2_mf, **loader_settings)
inet_v2_top_loader = torch.utils.data.DataLoader(dataset=inet_v2_top, **loader_settings)
inet_1k_train_loader = torch.utils.data.DataLoader(dataset=inet_1k_train, **loader_settings)
inet_r_loader = torch.utils.data.DataLoader(dataset=inet_r, **loader_settings)

# Report the number of samples of a selection of the datasets.
for size_label, sized_dataset in (('inet-1k', inet_1k_val_resized),
                                  ('inet_v2_70', inet_v2_70),
                                  ('inet_1k_train', inet_1k_train),
                                  ('inet_r', inet_r)):
    print(f'{size_label}: {len(sized_dataset)}')


Before change: {'input_size': (3, 224, 224), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
Following transform will be applied: Compose(
    Resize(size=592, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(518, 518))
    MaybeToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)
{'input_size': (3, 518, 518), 'interpolation': 'bicubic', 'mean': (0.485, 0.456, 0.406), 'std': (0.229, 0.224, 0.225), 'crop_pct': 0.875, 'crop_mode': 'center'}
inet-1k: 50000
inet_v2_70: 10000
inet_1k_train: 1281167
inet_r: 30000


In [3]:
# Select the compute device and load the DINOv2 ViT-S/14 backbone.

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f'Device used: {device}')

# Fetch the model through torch.hub and switch it to evaluation mode right away
# (eval() returns the module itself).
vision_transformer = torch.hub.load('facebookresearch/dinov2', 'dinov2_vits14').eval()

# Move the model to the selected device; as the last expression of the cell this
# also displays the model architecture.
vision_transformer.to(device)



Device used: cuda


Using cache found in /home/stud/afroehli/.cache/torch/hub/facebookresearch_dinov2_main


DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (n

In [6]:
# compute new embeddings
#
# For every dataset, run the backbone over all images and pickle the result as
#     {wnid: [sample_0, sample_1, ...]}
# where each sample is a list holding, per requested layer, one tuple
#     (mean of the patch tokens, cls token)
# of NumPy arrays, each with a leading dimension of size 1.

one_layer_pth = '/home/stud/afroehli/coding/model_results/dinov2_vits14/cls_plus_patch_one_lay'

# Create the output directory up front, so that a missing folder is not discovered
# only after a complete pass over a dataset.
os.makedirs(one_layer_pth, exist_ok=True)

inet_1k_val_loader_tuple = (inet_1k_val_loader, f'{one_layer_pth}/inet_1k_val_cls_pt.pkl')
# inet_1k_train_loader_tuple = (inet_1k_train_loader, f'{one_layer_pth}/inet_1k_train_cls_pt.pkl')
inet_v2_70_loader_tuple = (inet_v2_70_loader, f'{one_layer_pth}/inet_v2_70_cls_pt.pkl')
inet_v2_mf_loader_tuple = (inet_v2_mf_loader, f'{one_layer_pth}/inet_v2_mf_cls_pt.pkl')
inet_v2_top_loader_tuple = (inet_v2_top_loader, f'{one_layer_pth}/inet_v2_top_cls_pt.pkl')
inet_r_loader_tuple = (inet_r_loader, f'{one_layer_pth}/inet_r_cls_pt.pkl')

dataloaders = [inet_1k_val_loader_tuple, inet_v2_70_loader_tuple, inet_v2_mf_loader_tuple, inet_v2_top_loader_tuple, inet_r_loader_tuple]

# number of last layers whose outputs are stored
max_layers = 1

# Number of images per forward pass. Each loader batch is processed in chunks of this size
# instead of one image at a time; lower the value if GPU memory is insufficient
# (1 corresponds to a separate forward pass per image).
# NOTE(review): batched forward passes may differ from single-image passes by
# floating-point rounding - keep that in mind when comparing with previously stored files.
forward_chunk_size = 32

with torch.no_grad():

    for loader, str_path in dataloaders:

        model_results = dict()

        # File name without the '.pkl' extension. Computed outside the f-string because
        # reusing the enclosing quote character inside an f-string is a SyntaxError
        # before Python 3.12.
        dataset_name = Path(str_path).stem
        print(f'Next calculate results for dataset: {dataset_name}')

        for samples, sample_indices in tqdm(loader, ncols=100):

            # One entry per sample: [(mean-patch-tokens, cls-token) for each returned layer].
            # NOTE(review): the previous comment described the order as "last layer first";
            # DINOv2's get_intermediate_layers appears to return the selected blocks in
            # forward order (last block at the end). Irrelevant for max_layers = 1, but
            # confirm before increasing it.
            model_out_converted = []
            for chunk in torch.split(samples, forward_chunk_size, dim=0):
                # get output of last max_layers: one (patch tokens, cls token) pair per layer
                layer_outs = vision_transformer.get_intermediate_layers(chunk.to(device), max_layers,
                                                                        return_class_token=True)
                # average the patch tokens over the token dimension and move everything to NumPy
                layer_arrays = [(torch.mean(pt_tokens, dim=1).cpu().numpy(), cls_token.cpu().numpy())
                                for pt_tokens, cls_token in layer_outs]
                for n in range(chunk.shape[0]):
                    # n:n + 1 keeps the leading dimension of size 1 that a single-image
                    # forward pass produces
                    model_out_converted.append([(pt_mean[n:n + 1], cls_arr[n:n + 1])
                                                for pt_mean, cls_arr in layer_arrays])

            # transform sample-index to wnid-string and collect the embeddings per class
            for sample_index, sample_out_conv in zip(sample_indices, model_out_converted):
                sample_item_wnid = class_to_index_mapping[int(sample_index)]
                model_results.setdefault(sample_item_wnid, []).append(sample_out_conv)

        with open(str_path, 'wb') as pkl_file:
            pickle.dump(model_results, pkl_file, pickle.HIGHEST_PROTOCOL)

Next calculate results for dataset: inet_1k_val_cls_pt


  2%|▉                                                              | 6/391 [00:17<18:22,  2.86s/it]


KeyboardInterrupt: 

In [None]:
# only intended for embeddings corresponding to ImageNet-1k training set
# check if all embeddings have been computed: for every class, the number of stored
# embeddings has to match the number of entries in the class' training folder
# NOTE(review): the paths below start with /home/afroehli, while other cells of this
# notebook use /home/stud/afroehli - confirm which location is correct.

with open('../resources/imagenet_1k_label_order.txt', 'r') as label_order_file:
    # first whitespace-separated token of every line; blank lines are skipped because
    # `split()[0]` would raise an IndexError on them
    inet_1k_labels = [label_order_line.split()[0]
                      for label_order_line in label_order_file
                      if label_order_line.strip()]

# NOTE: unpickling can execute arbitrary code - only load result files written by this project.
with open('/home/afroehli/coding/model_results/dinov2_vits14/inet_1k_train_timm_trans.pkl', 'rb') as pkl_file:
    inet_1k_train_model_results = pickle.load(pkl_file)

incomplete_classes = 0
for wnid in inet_1k_labels:
    expec_embeds = len(os.listdir(f'/home/afroehli/datasets/ImageNet1k/imagenet1k/ILSVRC/Data/CLS-LOC/train/{wnid}'))

    # A class without any stored embeddings is compared as having zero of them and thus
    # reported as incomplete, instead of aborting the whole check with a KeyError.
    if expec_embeds != len(inet_1k_train_model_results.get(wnid, ())):
        incomplete_classes += 1 

print(f'Number of incomplete classes: {incomplete_classes}')


Number of incomplete classes: 0
