In [1]:
import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

In [49]:
# For encoder models only. Sets the index where the classification token is located.
# The value registered for a checkpoint is the position along the sequence axis
# that gets sliced out of `last_hidden_state` to form the embedding.
CLASSIFICATION_TOKEN_INDEX = {
    'BAAI/bge-en-icl': -1,
    # 'nvidia/NV-Embed-v2': 0
}

# Pick the best available accelerator: CUDA first, then Apple's MPS backend, then CPU.
# `torch.backends.mps.is_available()` is used for the MPS probe because the
# `torch.mps.is_available()` shortcut does not exist on older PyTorch releases.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'
print(f'Using device: {DEVICE}')

Using device: mps


In [50]:
# Checkpoint under test; it must have an entry in CLASSIFICATION_TOKEN_INDEX.
model_checkpoint = 'BAAI/bge-en-icl'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the weights, move them to the selected device and switch to eval mode.
model = AutoModel.from_pretrained(model_checkpoint)
model = model.to(DEVICE)
model.eval()

# Sequence position used for pooling with this checkpoint.
cls_index = CLASSIFICATION_TOKEN_INDEX[model_checkpoint]

print(f"Model loaded on {model.device}")
print(f"cls_index: {cls_index}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on mps:0
cls_index: -1


In [55]:
def vectorize(texts: list[str], tokenizer: AutoTokenizer, model: AutoModel, cls_idx: int) -> Tensor:
    """Embed ``texts`` by reading one sequence position of the last hidden state.

    Args:
        texts: Strings to embed; they are tokenized together as one padded,
            truncated batch.
        tokenizer: Tokenizer called as ``tokenizer(texts, return_tensors='pt', ...)``
            and expected to return a mapping of tensors.
        model: Model whose forward output exposes ``last_hidden_state`` and which
            reports its device via ``model.device``.
        cls_idx: Index along the sequence axis (axis 1) to extract for every text.

    Returns:
        ``last_hidden_state[:, cls_idx, :]`` computed with gradients disabled.
    """
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)
    # Follow the model's own device instead of the notebook-level DEVICE constant,
    # so the function also works for a model placed elsewhere.
    inputs = {key: value.to(model.device) for key, value in inputs.items()}
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state
    # Honour the `cls_idx` argument; the notebook-level `cls_index` global must not
    # leak in here, otherwise the parameter is silently ignored.
    # NOTE(review): with cls_idx=-1 and right-side padding, shorter texts would be
    # read at a padding position -- confirm the tokenizer's padding side.
    return hidden_states[:, cls_idx, :]

In [None]:
def embed_with_bge_en_icl(texts: list[str], tokenizer: AutoTokenizer, model: AutoModel) -> Tensor:
    """Embed ``texts`` with last-token pooling over the model's final hidden state.

    Args:
        texts: Strings to embed; tokenized together as one padded, truncated batch.
        tokenizer: Tokenizer called as ``tokenizer(texts, return_tensors='pt', ...)``
            and expected to return a mapping of tensors.
        model: Model whose forward output exposes ``last_hidden_state`` and which
            reports its device via ``model.device``. It is switched to eval mode.

    Returns:
        One vector per text: the hidden state at that text's last non-padding
        token, computed with gradients disabled.
    """
    inputs = tokenizer(texts, return_tensors='pt',
                       padding=True, truncation=True)
    # Move inputs to the model's compute device
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    # Set model to evaluation mode and run the forward pass without gradients
    model.eval()
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state

    attention_mask = inputs.get('attention_mask')
    # When the final column holds a real token for every row (left padding or no
    # padding at all), position -1 is the last token of each text.
    if attention_mask is None or bool((attention_mask[:, -1] != 0).all()):
        return hidden_states[:, -1, :]

    # Right-padded batch: position -1 is padding for the shorter texts, so pick
    # each row's last attended position instead.
    last_token = attention_mask.sum(dim=1) - 1
    rows = torch.arange(hidden_states.shape[0], device=hidden_states.device)
    return hidden_states[rows, last_token]
    
def embed_with_NV_Embed_v2(texts: list[str], tokenizer: AutoTokenizer, model: AutoModel,
                           instruction: str = '', max_length: int = 32768) -> Tensor:
    """Embed ``texts`` through the model's own ``encode`` method.

    Args:
        texts: Strings to embed.
        tokenizer: Unused; kept so the signature matches the other embedding
            helpers in this notebook.
        model: Object exposing ``encode(texts, instruction=..., max_length=...)``.
        instruction: Prefix forwarded to ``encode``. Queries should pass their
            task instruction here; the empty default matches how the notebook
            encodes passages.
        max_length: Maximum sequence length forwarded to ``encode``; the default
            is the value the notebook uses when calling ``encode`` directly.

    Returns:
        Whatever ``model.encode`` returns (previously the result was dropped and
        the function returned ``None``).
    """
    # The arguments are explicit parameters so the function no longer depends on
    # `query_prefix` / `max_length` globals that are defined in a later cell.
    return model.encode(texts, instruction=instruction, max_length=max_length)

In [59]:
# Display the device reported by the currently loaded model.
model.device

device(type='mps', index=0)

In [56]:
# Embed one short and one longer sample string with the loaded checkpoint,
# pooling at the position registered for it.
vectors = vectorize(['a', 'hello, world'], tokenizer, model, cls_index)

In [57]:
# Manual forward pass on the same two sample strings, to inspect the raw
# hidden states and the vector found at the final sequence position.
text = ['a', 'hello, world']
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
inputs = inputs.to(DEVICE)
with torch.no_grad():
    outputs = model(**inputs).last_hidden_state
print(outputs.shape)
print(outputs[:, -1, :])

torch.Size([2, 6, 4096])
tensor([[-1.5674,  0.2064,  1.0076,  ...,  2.1652, -3.0944,  0.3509],
        [ 2.3507, -1.0540, -0.8063,  ..., -0.2687,  0.7767,  0.5697]],
       device='mps:0')


In [46]:
# Every query must be paired with an instruction that describes the task.
task_name_to_instruct = {
    "example": "Given a question, retrieve passages that answer the question",
}

# The instruction is wrapped in the "Instruct: ...\nQuery: " template.
query_prefix = f'Instruct: {task_name_to_instruct["example"]}\nQuery: '
queries = [
    'are judo throws allowed in wrestling?',
    'how to become a radiology technician in michigan?',
]

# Retrieval passages are encoded without any instruction.
passage_prefix = ""
passages = [
    "Since you're reading this, you are probably someone from a judo background or someone who is just wondering how judo techniques can be applied under wrestling rules. So without further ado, let's get to the question. Are Judo throws allowed in wrestling? Yes, judo throws are allowed in freestyle and folkstyle wrestling. You only need to be careful to follow the slam rules when executing judo throws. In wrestling, a slam is lifting and returning an opponent to the mat with unnecessary force.",
    "Below are the basic steps to becoming a radiologic technologist in Michigan:Earn a high school diploma. As with most careers in health care, a high school education is the first step to finding entry-level employment. Taking classes in math and science, such as anatomy, biology, chemistry, physiology, and physics, can help prepare students for their college studies and future careers.Earn an associate degree. Entry-level radiologic positions typically require at least an Associate of Applied Science. Before enrolling in one of these degree programs, students should make sure it has been properly accredited by the Joint Review Committee on Education in Radiologic Technology (JRCERT).Get licensed or certified in the state of Michigan.",
]

# Load the model together with its tokenizer. trust_remote_code=True lets the
# loader run the modelling code shipped in the checkpoint repository.
# NOTE: this rebinds the notebook-level name `model`.
model = AutoModel.from_pretrained('nvidia/NV-Embed-v2', trust_remote_code=True)

# Encode each set of texts with its prefix, then L2-normalise every row.
max_length = 32768
query_embeddings = F.normalize(
    model.encode(queries, instruction=query_prefix, max_length=max_length),
    p=2, dim=1)
passage_embeddings = F.normalize(
    model.encode(passages, instruction=passage_prefix, max_length=max_length),
    p=2, dim=1)

# Alternative: get the embeddings with a DataLoader (splitting the datasets into multiple mini-batches)
# batch_size=2
# query_embeddings = model._do_encode(queries, batch_size=batch_size, instruction=query_prefix, max_length=max_length, num_workers=32, return_numpy=True)
# passage_embeddings = model._do_encode(passages, batch_size=batch_size, instruction=passage_prefix, max_length=max_length, num_workers=32, return_numpy=True)

# Query-by-passage dot products of the normalised vectors, scaled by 100.
scores = 100 * (query_embeddings @ passage_embeddings.T)
print(scores.tolist())

model.safetensors.index.json:   0%|          | 0.00/28.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/789M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

  'input_ids': torch.tensor(batch_dict.get('input_ids').to(batch_dict.get('input_ids')).long()),
  self.gen = func(*args, **kwds)


[[87.42694854736328, 0.46282508969306946], [0.9652770757675171, 86.03724670410156]]


In [48]:
query_embeddings.shape

torch.Size([2, 4096])

In [60]:
from sentence_transformers import SentenceTransformer

# Every query must be paired with an instruction that describes the task.
task_name_to_instruct = {
    "example": "Given a question, retrieve passages that answer the question",
}

# The instruction is wrapped in the "Instruct: ...\nQuery: " template.
query_prefix = f'Instruct: {task_name_to_instruct["example"]}\nQuery: '
queries = [
    'are judo throws allowed in wrestling?',
    'how to become a radiology technician in michigan?',
]

# Retrieval passages are encoded without any instruction.
passages = [
    "Since you're reading this, you are probably someone from a judo background or someone who is just wondering how judo techniques can be applied under wrestling rules. So without further ado, let's get to the question. Are Judo throws allowed in wrestling? Yes, judo throws are allowed in freestyle and folkstyle wrestling. You only need to be careful to follow the slam rules when executing judo throws. In wrestling, a slam is lifting and returning an opponent to the mat with unnecessary force.",
    "Below are the basic steps to becoming a radiologic technologist in Michigan:Earn a high school diploma. As with most careers in health care, a high school education is the first step to finding entry-level employment. Taking classes in math and science, such as anatomy, biology, chemistry, physiology, and physics, can help prepare students for their college studies and future careers.Earn an associate degree. Entry-level radiologic positions typically require at least an Associate of Applied Science. Before enrolling in one of these degree programs, students should make sure it has been properly accredited by the Joint Review Committee on Education in Radiologic Technology (JRCERT).Get licensed or certified in the state of Michigan.",
]

# load model with tokenizer
# NOTE: this rebinds the notebook-level name `model`, which other cells also assign.
model = SentenceTransformer('nvidia/NV-Embed-v2', trust_remote_code=True)
# Configure the loaded wrapper: sequence-length setting and right-side padding.
model.max_seq_length = 32768
model.tokenizer.padding_side = "right"


def add_eos(input_examples, eos_token=None):
    """Return a new list with the end-of-sequence token appended to every example.

    Args:
        input_examples: Iterable of strings.
        eos_token: Token to append. When omitted, the EOS token of the
            notebook-level ``model``'s tokenizer is used, as before.

    Returns:
        A list of strings, each suffixed with the EOS token; the input is not
        modified.
    """
    if eos_token is None:
        # Looked up once rather than per example. `model` is a notebook global
        # that several cells rebind, so pass `eos_token` explicitly to avoid
        # depending on which cell ran last.
        eos_token = model.tokenizer.eos_token
    return [input_example + eos_token for input_example in input_examples]


# get the embeddings
batch_size = 2
query_embeddings = model.encode(add_eos(
    queries), batch_size=batch_size, prompt=query_prefix, normalize_embeddings=True)
passage_embeddings = model.encode(
    add_eos(passages), batch_size=batch_size, normalize_embeddings=True)

scores = (query_embeddings @ passage_embeddings.T) * 100
print(scores.tolist())

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/60.1k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

1_Pooling/config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

  self.gen = func(*args, **kwds)


IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)

In [62]:
# Encode the raw passages with no EOS suffix, prompt or extra encode() arguments.
# NOTE: the recorded run of this cell raised
# "IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)".
model.encode(passages)

  self.gen = func(*args, **kwds)


IndexError: Dimension out of range (expected to be in range of [-3, 2], but got 3)