In [16]:
import numpy as np
import json
import torch
import torch.nn as nn
from networks import SCLIPNN
import clip
from PIL import Image
from sentence_transformers import SentenceTransformer
import torchvision.transforms.functional as fn

In [17]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Sentence encoder used for the SBERT side of the comparison.
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

# clip.load hands back two objects: the CLIP model (placed on `device`) and
# the callable used later to turn an image into a model-ready tensor.
clip_model, preprocess = clip.load("ViT-B/32", device=device)

In [18]:
# Read the image/caption pairs. The context manager guarantees the file handle
# is closed, even when JSON parsing raises.
with open('coco/pairs.json', mode='r', encoding='utf-8') as f_json:
    pairs_data = json.load(f_json)

# Parallel lists: images[i] is the image id that goes with captions[i].
# Note these are ids taken from the JSON, not loaded image objects.
images = [pair["image_id"] for pair in pairs_data]
captions = [pair["caption"] for pair in pairs_data]

In [11]:
def reshape(im):
    """Downscale an image by an integer factor chosen from its larger side.

    An image with any side above 1000 px is shrunk by 3, one with any side
    above 500 px by 2, and anything smaller keeps its size. Both sides are
    divided by the same factor, so the aspect ratio is preserved (up to
    integer rounding).

    Args:
        im: Image whose ``size`` attribute is ``(width, height)``, e.g. a
            ``PIL.Image.Image``.

    Returns:
        The resized image produced by ``fn.resize``.
    """
    print("This is size of original image:",im.size, "\n")
    width, height = im.size
    if width > 1000 or height > 1000:
        scale = 3
    elif width > 500 or height > 500:
        scale = 2
    else:
        scale = 1
    # Clamp to 1 px so a very thin image never ends up with a zero-sized side.
    new_width = max(1, width // scale)
    new_height = max(1, height // scale)
    # torchvision expects an explicit size as (height, width). A one-element
    # size means "match the smaller edge", which ignores new_height and can
    # even upscale landscape images.
    image = fn.resize(im, size=[new_height, new_width])
    print("This is size of resized image:",image.size, "\n")
    return image

In [13]:
def get_logits(image_features, text_features):
    """Compute scaled cosine-similarity logits between image and text features.

    Both inputs are L2-normalised along their last dimension, moved to the
    module-level ``device``, and multiplied together with a fixed scale of
    ``exp(log(1 / 0.07))``.

    Args:
        image_features: Tensor whose last dimension is the feature dimension.
        text_features: Tensor with the same feature dimension. Non-floating
            inputs are converted to float32 before normalisation, and the
            normalised result is cast to the dtype of ``image_features`` so
            the matrix product never fails on a dtype mismatch.

    Returns:
        Tuple ``(logits_per_image, logits_per_text)``; the second is the
        transpose of the first.
    """
    # norm() is undefined for integer tensors, so promote them first.
    if not image_features.is_floating_point():
        image_features = image_features.float()
    if not text_features.is_floating_point():
        text_features = text_features.float()

    # normalized features
    image_features = (image_features / image_features.norm(dim=-1, keepdim=True)).to(device)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    # Align dtype with the image side (e.g. float16 vs float32) after the
    # normalisation, so the norm is computed in the text's own precision.
    text_features = text_features.to(device=device, dtype=image_features.dtype)

    # cosine similarity as logits
    # The scale is a constant here, so a plain tensor is used instead of a new
    # nn.Parameter per call; that keeps the outputs out of the autograd graph.
    logit_scale = torch.tensor(np.log(1 / 0.07), dtype=torch.float32).exp().to(device)
    logits_per_image = logit_scale * image_features @ text_features.t()
    logits_per_text = logits_per_image.t()

    # shape = [number of image rows, number of text rows] and its transpose
    return logits_per_image, logits_per_text

In [None]:
def sbert_to_clip(sbert_features):
    """Pass SBERT sentence embeddings through the saved SCLIPNN model.

    A new ``SCLIPNN`` is built and its weights are read from
    ``models/best_model.pt`` on every call.

    Args:
        sbert_features: 2-D tensor; its second dimension is used as the
            network's input size.

    Returns:
        The output of the network's forward pass on ``sbert_features``.
    """
    input_size = sbert_features.shape[1]
    PATH = "models/best_model.pt"
    # 850 is presumably the hidden width the checkpoint was trained with;
    # confirm against networks.SCLIPNN.
    model = SCLIPNN(input_size,850)
    # The model is built on the CPU, so map the checkpoint there as well.
    # Without map_location, a checkpoint saved from a GPU fails to load on a
    # CPU-only machine.
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    model.load_state_dict(torch.load(PATH, map_location="cpu"))
    model.eval()
    output = model(sbert_features)
    return output

In [None]:
# For every image, compare image-to-text probabilities obtained from CLIP's own
# text encoder with those obtained from SBERT embeddings passed through
# sbert_to_clip, for both the Spanish and the English inputs.
# NOTE(review): spanish_sentences, english_sentences, sp_text and en_text are
# not defined in any visible cell -- presumably leftover kernel state. They must
# be defined above for this cell to survive Restart & Run All (sp_text/en_text
# look like inputs prepared for clip_model.encode_text -- confirm).
with torch.no_grad():
    print("Sentence in Spanish: {}".format(spanish_sentences))
    print("Sentence in English: {}".format(english_sentences))
    # SBERT embeddings -> sbert_to_clip projection, then cast to float16.
    # NOTE(review): the float16 cast presumably matches CLIP's dtype on GPU;
    # confirm it is also what is wanted when running on CPU.
    sp_sbert_features = torch.from_numpy(sbert_model.encode(spanish_sentences))
    sp_sbert_features = sbert_to_clip(sp_sbert_features).type(torch.float16)
    en_sbert_features = torch.from_numpy(sbert_model.encode(english_sentences))
    en_sbert_features = sbert_to_clip(en_sbert_features).type(torch.float16)
    # Reference text features from CLIP's own text encoder.
    sp_clip_features = clip_model.encode_text(sp_text)
    en_clip_features = clip_model.encode_text(en_text)
    # NOTE(review): the loading cell builds `images` as a list of image ids, and
    # a list has no .items(). This loop needs a mapping from a display name to an
    # image object that `preprocess` accepts -- confirm where it should come from.
    for name, im in images.items():
        # Add a leading batch dimension of 1 before encoding the image.
        image = preprocess(im).unsqueeze(0).to(device)
        image_features = clip_model.encode_image(image)
        # Only the per-image logits are used below; the per-text ones are unused.
        sp_logits_image_clip, sp_logits_text_clip = get_logits(image_features, sp_clip_features)
        sp_logits_image_sbert, sp_logits_text_sbert = get_logits(image_features, sp_sbert_features)
        # Softmax over the text axis turns the logits into per-sentence probabilities.
        sp_probs_clip = sp_logits_image_clip.softmax(dim=-1).cpu().numpy()
        sp_probs_sbert = sp_logits_image_sbert.softmax(dim=-1).cpu().numpy()
        en_logits_image_clip, en_logits_text_clip = get_logits(image_features, en_clip_features)
        en_logits_image_sbert, en_logits_text_sbert = get_logits(image_features, en_sbert_features)
        en_probs_clip = en_logits_image_clip.softmax(dim=-1).cpu().numpy()
        en_probs_sbert = en_logits_image_sbert.softmax(dim=-1).cpu().numpy()
        # Row 0 is the single image in the batch.
        print('-'*30, ' Image: {} '.format(name),'-'*30)
        print("Spanish CLIP probs:", sp_probs_clip[0]) 
        print("Spanish SBERT probs:", sp_probs_sbert[0]) 
        print("English CLIP probs:", en_probs_clip[0]) 
        print("English SBERT probs:", en_probs_sbert[0]) 