In [1]:
import torch
from transformers import AutoImageProcessor, AutoModelForZeroShotImageClassification, AutoTokenizer, ZeroShotImageClassificationPipeline, SiglipProcessor
from modeling_siglip import SiglipModel
from torch2trt import torch2trt, TRTModule
from PIL import Image, ImageDraw
import numpy as np
from modeling_siglip import SiglipModel

processor = SiglipProcessor.from_pretrained('siglip-large-epoch5-augv2-upscale_0.892_cont_5ep_0.905')

In [None]:
model = SiglipModel.from_pretrained('siglip-large-epoch5-augv2-upscale_0.892_cont_5ep_0.905', torch_dtype=torch.float16).cuda().eval()
model.logit_scale.exp().item(), model.logit_bias.item()

In [3]:
logit_scale_exp = torch.tensor([118.3125], device='cuda', dtype=torch.float16, requires_grad=False)
logit_bias = torch.tensor([-12.6640625], device='cuda', dtype=torch.float16, requires_grad=False)

In [4]:
vision_trt = TRTModule()
vision_trt.load_state_dict(torch.load('vision_trt.pth'))
text_trt = TRTModule()
text_trt.load_state_dict(torch.load('text_trt.pth'))

<All keys matched successfully>

In [5]:
image = Image.open("til_siglip_ds_x4v3_v2/image_0_0.jpg")
image = np.asarray(image)
image = torch.tensor(image, dtype=torch.float16, device='cuda').permute(2, 0, 1)
image.shape

torch.Size([3, 608, 192])

In [6]:
feats = processor(images=[image, image], text=['This is a photo of grey missile.'], padding=True, return_tensors='pt').to('cuda')

In [8]:
vision_input = feats['pixel_values'].type(torch.float16)
text_input = feats['input_ids'][:64]  # truncate text input to 64 tokens
image_feat = vision_trt(vision_input)['pooler_output']
text_feat = text_trt(text_input)['pooler_output']
image_feat /= image_feat.norm(p=2, dim=-1, keepdim=True)
text_feat /= text_feat.norm(p=2, dim=-1, keepdim=True)
similarity_score = image_feat @ text_feat.T * logit_scale_exp + logit_bias  # sigmoid is not needed here

tensor([[1.4305e-06],
        [1.4305e-06]], device='cuda:0', dtype=torch.float16)