### 1. Using HuggingFace Transformers

In [7]:
from transformers import AutoModel, AutoTokenizer
import torch

model = AutoModel.from_pretrained('infgrad/stella-large-zh-v2')
tokenizer = AutoTokenizer.from_pretrained('infgrad/stella-large-zh-v2')
sentences = ["数据1", "数据ABCDEFGH"]
batch_data = tokenizer(
    text=sentences,
    padding="longest",
    return_tensors="pt",
    max_length=1024,
    truncation=True,
)
attention_mask = batch_data["attention_mask"]
model_output = model(**batch_data)
last_hidden = model_output.last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
vectors = torch.nn.functional.normalize(vectors, p=2, dim=1)
# vectors = normalize(vectors, norm="l2", axis=1, )
print(vectors.shape)  # 2,768

config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/652M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/315 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/439k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

torch.Size([2, 1024])


In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(f'--device={device}')


# Load model from HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5')
tokenizer.padding_side = 'right'
model = AutoModel.from_pretrained('BAAI/bge-large-zh-v1.5').half()

model.to(device) 
model.requires_grad_(False)
model.eval()

--device=cuda:0
[2023-11-17 01:52:12,325] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inpl

In [7]:
sent_all = [sent[:512] for sent in sentences]

In [8]:
# Tokenize sentences
encoded_input = tokenizer(sent_all, padding=True, truncation=True, return_tensors='pt').to(device)
# for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
with torch.no_grad():
    model_output = model(**encoded_input)
    sentence_embeddings = model_output[0][:, 0]
    # Perform pooling. In this case, cls pooling

# Perform pooling. In this case, max pooling.
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1).cpu().numpy()
print("Sentence embeddings:", sentence_embeddings)

Sentence embeddings: [[ 0.002934   0.02063   -0.02824   ...  0.00651    0.01074   -0.03973  ]
 [ 0.0097    -0.0037    -0.03003   ... -0.001525  -0.0102    -0.04834  ]
 [-0.01348    0.02986   -0.03256   ...  0.0196     0.04263   -0.02225  ]
 [-0.01704    0.0004883 -0.00702   ... -0.01162    0.0344    -0.02483  ]
 [-0.00435    0.0079    -0.014854  ... -0.01718    0.02173   -0.0107   ]]


In [4]:
for sent in sentences:
    # Tokenize sentences
    encoded_input = tokenizer(sent[:512], padding=True, truncation=True, return_tensors='pt').to(device)
    # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
    # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
    
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
        sentence_embeddings = model_output[0][:, 0]
        # Perform pooling. In this case, cls pooling
    
    # Perform pooling. In this case, max pooling.
    sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1).cpu().numpy()
    print("Sentence embeddings:", sentence_embeddings)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Sentence embeddings: [[ 0.002947  0.02058  -0.02829  ...  0.006523  0.010765 -0.0397  ]]
Sentence embeddings: [[ 0.00975  -0.00364  -0.03001  ... -0.001495 -0.01018  -0.04834 ]]
Sentence embeddings: [[-0.01349  0.02988 -0.03265 ...  0.01965  0.0427  -0.02226]]
Sentence embeddings: [[-0.01706   0.000493 -0.00702  ... -0.0116    0.03436  -0.02483 ]]
Sentence embeddings: [[-0.00428   0.007904 -0.01481  ... -0.01714   0.02173  -0.010704]]
