## Prepare environment

In [None]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

## Import packages

In [None]:
import os
import clip
import torch
from PIL import Image
import torch.nn.functional as F

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

## SET ID to evaluate

In [4]:
# SET UP THE DIRECTORY
dir_to_eval = '/root/test'
id_to_eval = 1451 # give the ID HERE

In [5]:
full_path_for_id_gt = os.path.join(dir_to_eval, 'gt', str(id_to_eval))
full_path_for_id_pseudo = os.path.join(dir_to_eval, 'pseudo', str(id_to_eval))

## CLIP feature similarity averaging

In [8]:
# from locale import normalize
sim=0
with torch.no_grad():
    for idx in range(1, 6):
        tmp_gt = os.path.join(full_path_for_id_gt, str(idx)+'.png')
        tmp_ps = os.path.join(full_path_for_id_pseudo, str(idx)+'.png')
        im_gt = Image.open(tmp_gt)
        im_ps = Image.open(tmp_ps)
        
        # im_gt = im_gt / 256
        # im_ps = im_ps / 256
        im_gt = preprocess(im_gt).unsqueeze(0).to(device)
        im_ps = preprocess(im_ps).unsqueeze(0).to(device)
        # print(im_gt.shape, im_ps.shape)
        
        gt_ft = model.encode_image(im_gt)
        ps_ft = model.encode_image(im_ps)
        
        sim += F.cosine_similarity(gt_ft, ps_ft)
        
        
print('AVERAGE SIMILARITY: ', sim/5)

AVERAGE SIMILARITY:  tensor([0.8662], device='cuda:0', dtype=torch.float16)


## CHECK direction similarity in latent space

In [10]:
# from locale import normalize
sim=0
with torch.no_grad():
    for idx in range(1, 5):
        tmp_gt = os.path.join(full_path_for_id_gt, str(idx)+'.png')
        tmp_ps = os.path.join(full_path_for_id_pseudo, str(idx)+'.png')
        tmp_gt_next = os.path.join(full_path_for_id_gt, str(idx+1)+'.png')
        tmp_ps_next = os.path.join(full_path_for_id_pseudo, str(idx+1)+'.png')
        im_gt = Image.open(tmp_gt)
        im_ps = Image.open(tmp_ps)
        im_gt_n = Image.open(tmp_gt_next)
        im_ps_n = Image.open(tmp_ps_next)
        
        # im_gt = im_gt / 256
        # im_ps = im_ps / 256
        im_gt = preprocess(im_gt).unsqueeze(0).to(device)
        im_ps = preprocess(im_ps).unsqueeze(0).to(device)
        im_gt_n = preprocess(im_gt_n).unsqueeze(0).to(device)
        im_ps_n = preprocess(im_ps_n).unsqueeze(0).to(device)
        # print(im_gt.shape, im_ps.shape)
        
        gt_ft = model.encode_image(im_gt)
        ps_ft = model.encode_image(im_ps)
        
        gt_ft_n = model.encode_image(im_gt_n)
        ps_ft_n = model.encode_image(im_ps_n)
        
        gt_diff = gt_ft_n - gt_ft
        ps_diff = ps_ft_n - ps_ft
        
        sim += F.cosine_similarity(gt_diff, ps_diff)
        
        
print('AVERAGE DIFF_SIMILARITY: ', sim/5)

AVERAGE DIFF_SIMILARITY:  tensor([-0.0054], device='cuda:0', dtype=torch.float16)
