In [1]:
import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

In [34]:
# Maps a model checkpoint name to the index, along the sequence axis of the
# model's `last_hidden_state`, of the token whose hidden state is taken as the
# text embedding (-1 selects the last position).
# NOTE(review): originally described as "for encoder models only" — confirm
# that this holds for every checkpoint listed here.
CLASSIFICATION_TOKEN_INDEX = {
    'BAAI/bge-en-icl': -1,
}

# Pick the best available device: CUDA first, then Apple MPS, then CPU.
# `torch.backends.mps.is_available()` is the documented MPS availability check.
if torch.cuda.is_available():
    DEVICE = 'cuda'
elif torch.backends.mps.is_available():
    DEVICE = 'mps'
else:
    DEVICE = 'cpu'
print(f'Using device: {DEVICE}')

Using device: mps


In [35]:
model_checkpoint = 'BAAI/bge-en-icl'

# Load the tokenizer and the model for the checkpoint; the model is placed on DEVICE.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = (
    AutoModel
    .from_pretrained(model_checkpoint)
    .to(DEVICE)
)

# Sequence position used for pooling with this checkpoint.
cls_index = CLASSIFICATION_TOKEN_INDEX[model_checkpoint]

# The notebook only runs inference, so put the module in evaluation mode.
model.eval()

print(f"Model loaded on {model.device}")
print(f"cls_index: {cls_index}")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Model loaded on mps:0
cls_index: -1


In [31]:
def vectorize(texts: list[str], tokenizer: "AutoTokenizer", model: "AutoModel", cls_idx: int) -> Tensor:
    """Embed a batch of texts by selecting one token's final hidden state per text.

    Args:
        texts: Strings to embed; they are tokenized together as a single
            padded, truncated batch.
        tokenizer: Tokenizer matching ``model``.
        model: Model whose output exposes ``last_hidden_state``. The tokenized
            batch is moved to ``model.device`` before the forward pass.
        cls_idx: Index along the sequence axis of the token to pool
            (e.g. ``-1`` for the last position).

    Returns:
        ``last_hidden_state[:, cls_idx, :]`` — one vector per input text.
        The vectors are returned as-is (no normalization is applied).
    """
    # The tokenizer call has no `device` option; the returned batch is moved
    # explicitly so that its tensors live on the same device as the model.
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(model.device)
    with torch.no_grad():
        hidden_states = model(**inputs).last_hidden_state
    # Use the caller-supplied index rather than a notebook-level global.
    # NOTE(review): with padding=True a fixed index can land on a padding
    # position for shorter texts, depending on the tokenizer's padding side —
    # confirm for the checkpoint in use.
    return hidden_states[:, cls_idx, :]

In [32]:
# Embed two sample texts of different lengths as one batch.
vectors = vectorize(
    texts=['a', 'hello, world'],
    tokenizer=tokenizer,
    model=model,
    cls_idx=cls_index,
)

In [38]:
# Ad-hoc check: tokenize two texts, run them through the model directly and
# inspect the hidden state at the last sequence position.
text = ['a', 'hello, world']
inputs = tokenizer(
    text,
    return_tensors='pt',
    padding=True,
    truncation=True,
).to(DEVICE)
with torch.no_grad():
    outputs = model(**inputs).last_hidden_state
print(outputs.shape)
print(outputs[:, -1, :])

torch.Size([2, 6, 4096])
tensor([[-1.5674,  0.2064,  1.0076,  ...,  2.1652, -3.0944,  0.3509],
        [ 2.3507, -1.0540, -0.8063,  ..., -0.2687,  0.7767,  0.5697]],
       device='mps:0')


In [44]:
# L2 norm of the embedding of the second text; a value far from 1 means the
# vectors are not unit-length. `torch.linalg.vector_norm` replaces the
# deprecated `torch.norm`.
torch.linalg.vector_norm(vectors[1], ord=2)

tensor(228.9328)