In [3]:
import os
import shutil

import open_clip
import torch
from huggingface_hub import hf_hub_download

In [None]:
# Instantiate the SigLIP model and tokenizer from the Hub repo, then copy the
# raw checkpoint/tokenizer files into a local directory so they can be loaded
# later without the "hf-hub:" prefix.
repo_id = "timm/ViT-B-16-SigLIP-512"
open_clip_repo_id = f"hf-hub:{repo_id}"

model, preprocess = open_clip.create_model_from_pretrained(open_clip_repo_id)
tokenizer = open_clip.get_tokenizer(open_clip_repo_id)

# Files copied into the local model directory.
file_names = [
    "open_clip_config.json",
    "open_clip_pytorch_model.bin",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
]

model_dir = "model"
for file_name in file_names:
    # hf_hub_download returns the path used as the copy source below.
    src_path = hf_hub_download(repo_id, file_name)
    dst_path = os.path.join(model_dir, file_name)
    dst_dir = os.path.dirname(dst_path)
    # exist_ok=True makes the cell safe to re-run and avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(dst_dir, exist_ok=True)
    shutil.copy(src_path, dst_path)

In [4]:
# Rebuild the model and tokenizer from the files stored in the local "model"
# directory rather than going through the Hub.
model_name = "ViT-B-16-SigLIP-512"
model_dir = "model"
# The second element returned by create_model_and_transforms is not used here.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name,
    pretrained=os.path.join(model_dir, "open_clip_pytorch_model.bin"),
)
tokenizer = open_clip.tokenizer.HFTokenizer(model_dir)

# Smoke check: encode a short prompt and reduce the embedding to one scalar
# that can be compared between runs.
text = "a red shirt"
tokens = tokenizer(text, context_length=64)
value = model.encode_text(tokens).mean().item()
# Last expression of the cell, so the scalar is actually displayed.
value

In [17]:
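# Disabled alternative: load the model and tokenizer directly from the Hub
# instead of from the local "model" directory.
# model, preprocess = open_clip.create_model_from_pretrained("hf-hub:timm/ViT-B-16-SigLIP-512")
# tokenizer = open_clip.get_tokenizer("hf-hub:timm/ViT-B-16-SigLIP-512")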

In [7]:
# Tokenize a short prompt with the locally loaded tokenizer, requesting a
# context length of 64, and display the token ids together with their shape.
text = "a red shirt"
tokens = tokenizer(text, context_length=64)
tokens, tokens.shape

(tensor([[ 262,  266, 1226, 5089,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1]]),
 torch.Size([1, 64]))

In [8]:
# Shape of the text embedding the locally loaded model produces for `tokens`.
model.encode_text(tokens).shape

torch.Size([1, 768])

In [4]:
# Reference copy: load the same model and tokenizer straight from the Hub so
# the results can be compared against the locally loaded pair.
hub_model_id = "hf-hub:timm/ViT-B-16-SigLIP-512"
model2, preprocess2 = open_clip.create_model_from_pretrained(hub_model_id)
tokenizer2 = open_clip.get_tokenizer(hub_model_id)

# Tokenize the same prompt, this time without passing a context length.
text = "a red shirt"
tokens2 = tokenizer2(text)
(tokens2, tokens2.shape)

(tensor([[ 262,  266, 1226, 5089,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
             1,    1,    1,    1]]),
 torch.Size([1, 64]))

In [5]:
# Shape of the text embedding the Hub-loaded model (model2) produces for `tokens2`.
model2.encode_text(tokens2).shape

torch.Size([1, 768])

In [12]:
# Check that the locally loaded model and the Hub-loaded model produce exactly
# equal text embeddings for their respective token tensors.
# `torch` is imported in the notebook's import cell.
with torch.no_grad():  # comparison only; no gradients are needed
    embeddings_match = torch.equal(
        model.encode_text(tokens), model2.encode_text(tokens2)
    )
embeddings_match

True

In [15]:
# Build a prompt that is deliberately much longer than the tokenizer's context
# window so the displayed result shows how over-long input is handled. The
# exact number of repetitions is not significant as long as the prompt
# overflows the window.
text = " ".join(["a red shirt"] * 37)
tokens2 = tokenizer2(text)
tokens2, tokens2.shape

(tensor([[ 262,  266, 1226, 5089,  262,  266, 1226, 5089,  262,  266, 1226, 5089,
           262,  266, 1226, 5089,  262,  266, 1226, 5089,  262,  266, 1226, 5089,
           262,  266, 1226, 5089,  262,  266, 1226, 5089,  262,  266, 1226, 5089,
           262,  266, 1226, 5089,  262,  266, 1226, 5089,  262,  266, 1226, 5089,
           262,  266, 1226, 5089,  262,  266, 1226, 5089,  262,  266, 1226, 5089,
           262,  266, 1226,    1]]),
 torch.Size([1, 64]))

In [22]:
# Reduce the local model's embedding of a fixed prompt to a single scalar that
# serves as a quick fingerprint of the loaded weights.
fingerprint_tokens = tokenizer("a red shirt", context_length=64)
fingerprint_embedding = model.encode_text(fingerprint_tokens)
fingerprint_embedding.mean().item()

0.00645834906026721

In [9]:
# Show the string pieces that the tokenizer object held in `tokenizer.tokenizer`
# produces for a prompt.
tokenizer.tokenizer.tokenize("red shirt")

['▁red', '▁shirt']