In [1]:
import os

import torch
import torch.nn.functional as F
from PIL import Image
from transformers import AutoModel, AutoTokenizer


In [2]:
def weighted_mean_pooling(hidden, attention_mask):
    """Pool per-token states into one vector per sequence with position-increasing weights.

    The weight of each token is ``attention_mask * attention_mask.cumsum(dim=1)``.
    For a 0/1 mask this gives every unmasked token its 1-based rank among the
    unmasked tokens of its row (so later tokens count more) and gives masked
    tokens a weight of 0.

    Args:
        hidden: Token representations. Assumed to be (batch, seq_len, hidden_dim):
            the code broadcasts the weights over the last dim and reduces dim 1.
        attention_mask: Mask aligned with the first two dims of ``hidden``.
            Assumed to hold 1 for real tokens and 0 for padding.

    Returns:
        The weighted mean of ``hidden`` over dim 1. Rows whose weights are all
        zero (e.g. a fully masked sequence) yield zeros instead of NaN.
    """
    weights = attention_mask * attention_mask.cumsum(dim=1)
    # The weights are cast to float before multiplying and before dividing, so
    # an integer mask never drags the arithmetic into integer types.
    weighted_sum = torch.sum(hidden * weights.unsqueeze(-1).float(), dim=1)
    total_weight = weights.sum(dim=1, keepdim=True).float()
    # A fully masked row has a zero numerator and a zero denominator; divide by
    # 1 there so the result is 0 rather than 0/0 = NaN. Non-zero denominators
    # are left untouched.
    safe_total = torch.where(
        total_weight == 0, torch.ones_like(total_weight), total_weight
    )
    return weighted_sum / safe_total


In [3]:
@torch.no_grad()
def encode(text_or_image_list, model, tokenizer):
    """Embed a batch of texts or a batch of images and return unit-length vectors.

    The batch type is decided from the first element only: if it is a ``str``
    the whole list is sent as text (with ``None`` in every image slot);
    otherwise the whole list is sent as images (with ``""`` in every text slot).

    Args:
        text_or_image_list: Non-empty list of strings, or of image objects
            accepted by ``model``. Indexing element 0 of an empty list raises
            ``IndexError``.
        model: Callable invoked as ``model(text=..., image=..., tokenizer=...)``
            whose result exposes ``attention_mask`` and ``last_hidden_state``.
        tokenizer: Passed through to ``model`` unchanged.

    Returns:
        NumPy array of the pooled embeddings, L2-normalised along dim 1.
    """
    is_text_batch = isinstance(text_or_image_list[0], str)
    batch_size = len(text_or_image_list)

    if is_text_batch:
        texts, images = text_or_image_list, [None] * batch_size
    else:
        texts, images = [""] * batch_size, text_or_image_list

    model_out = model(text=texts, image=images, tokenizer=tokenizer)
    mask = model_out.attention_mask
    token_states = model_out.last_hidden_state

    pooled = weighted_mean_pooling(token_states, mask)
    unit_vectors = F.normalize(pooled, p=2, dim=1)
    # Move off the accelerator before handing the result to NumPy.
    return unit_vectors.detach().cpu().numpy()


In [6]:
model_name_or_path = "openbmb/VisRAG"
# Use the GPU when one is present, otherwise fall back to CPU so the cell does
# not crash on hosts without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE: trust_remote_code=True runs Python code shipped in the model repository.
# Review that code, and consider passing `revision=<commit>` to from_pretrained
# to pin it, as the download warning printed by this cell recommends.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
model = AutoModel.from_pretrained(
    model_name_or_path, torch_dtype=torch.bfloat16, trust_remote_code=True
).to(device)
# Switch to inference mode; as the cell's last expression this also displays the model.
model.eval()


A new version of the following files was downloaded from https://huggingface.co/openbmb/VisRAG:
- modeling_minicpmv.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

VisRAG_Ret(
  (llm): MiniCPMForCausalLM(
    (model): MiniCPMModel(
      (embed_tokens): Embedding(122753, 2304)
      (layers): ModuleList(
        (0-39): 40 x MiniCPMDecoderLayer(
          (self_attn): MiniCPMSdpaAttention(
            (q_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (k_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (v_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (o_proj): Linear(in_features=2304, out_features=2304, bias=False)
            (rotary_emb): MiniCPMRotaryEmbedding()
          )
          (mlp): MiniCPMMLP(
            (gate_proj): Linear(in_features=2304, out_features=5760, bias=False)
            (up_proj): Linear(in_features=2304, out_features=5760, bias=False)
            (down_proj): Linear(in_features=5760, out_features=2304, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): MiniCPMRMSNorm()
          (post_attention_layernorm): M

In [9]:
# Image files are looked up relative to the current working directory. In a
# plain script the equivalent would be os.path.dirname(os.path.realpath(__file__)).
script_dir = "."
queries = ["What does a dog look like?"]
# Candidate "documents" are images, each converted to RGB after opening.
passages = [
    Image.open(os.path.join(script_dir, image_name)).convert("RGB")
    for image_name in ("cat.jpeg", "dog.jpg")
]

# Only the queries get the instruction prefix; the image passages are encoded as-is.
INSTRUCTION = "Represent this query for retrieving relevant documents: "
queries = [INSTRUCTION + query for query in queries]

embeddings_query = encode(queries, model, tokenizer)
embeddings_doc = encode(passages, model, tokenizer)

# encode() L2-normalises its output, so each dot product is a cosine similarity.
# One row per query, one column per passage.
scores = embeddings_query @ embeddings_doc.transpose()
print(scores.tolist())

[[0.25723618268966675, 0.3377824127674103]]
