In [1]:
# Load the tokenizer and encoder for the English BGE-large v1.5 checkpoint.
# Both module-level names (`tokenizer`, `model`) are read by later cells.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the English BGE-large v1.5 checkpoint from the HuggingFace Hub.
# (The Chinese variant 'BAAI/bge-large-zh-v1.5' can be loaded the same way.)
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")

# Inference only: put the module in evaluation mode.
model.eval()

# Sentences we want sentence embeddings for.
sentences = ["i like you", "i want to kill you"]

# Tokenize as one padded batch.
# For s2p (short query to long passage) retrieval, prepend an instruction to
# each query (but not to passages) before tokenizing.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Forward pass without gradient tracking.
with torch.no_grad():
    model_output = model(**encoded_input)

# CLS pooling (first token position of output index 0), then L2-normalise
# every row.
sentence_embeddings = torch.nn.functional.normalize(
    model_output[0][:, 0], p=2, dim=1
)
print("Sentence embeddings:", sentence_embeddings)


Sentence embeddings: tensor([[ 0.0356,  0.0118,  0.0048,  ..., -0.0115, -0.0170, -0.0047],
        [ 0.0287, -0.0219, -0.0046,  ..., -0.0400, -0.0092,  0.0066]])


In [3]:
import torch
import math
import numpy as np
def get_length(embedding_1d):
    """Return the Euclidean (L2) length of a 1-D sequence of numbers.

    Args:
        embedding_1d: iterable of numeric components.

    Returns:
        float: square root of the sum of squared components (0.0 when empty).
    """
    squared_total = 0
    for component in embedding_1d:
        squared_total = squared_total + component ** 2
    return math.sqrt(squared_total)
def normalise_embedding(embedding_1d):
    """Scale a mutable 1-D embedding to unit length, in place.

    Args:
        embedding_1d: mutable, indexable sequence of numbers; it is modified
            directly.

    Returns:
        None. The argument itself is updated.

    A zero-length vector has no direction, so it is left untouched instead of
    being divided by zero.
    """
    length = get_length(embedding_1d)
    if length == 0:
        # Nothing to scale; avoids ZeroDivisionError / NaN components.
        return
    for i in range(len(embedding_1d)):
        embedding_1d[i] /= length
def get_normalise_embedding(embedding_1d):
    """Return a unit-length copy of a 1-D embedding; the input is not modified.

    Args:
        embedding_1d: a torch tensor, or any object with a ``.copy()`` method
            that yields a mutable indexable sequence (e.g. list, numpy array).

    Returns:
        A numpy array when given a tensor, otherwise the same kind of object
        that ``embedding_1d.copy()`` produces. A zero-length vector is
        returned as an unscaled copy.
    """
    if isinstance(embedding_1d, torch.Tensor):
        # isinstance also accepts Tensor subclasses; .cpu() makes the
        # conversion work for tensors that live on another device; .copy()
        # ensures we never write through to the tensor's own memory.
        temp_embedding_1d = embedding_1d.detach().cpu().numpy().copy()
    else:
        temp_embedding_1d = embedding_1d.copy()
    length = get_length(temp_embedding_1d)
    if length == 0:
        # A zero vector cannot be normalised; avoid dividing by zero.
        return temp_embedding_1d
    for i in range(len(temp_embedding_1d)):
        temp_embedding_1d[i] /= length
    return temp_embedding_1d


def cosine_sim(embedding_1, embedding_2):
    """Cosine similarity between two 1-D embeddings.

    Both inputs are copied and normalised to unit length first, so neither
    argument is modified and they do not need to be pre-normalised.

    Args:
        embedding_1: first embedding (tensor, numpy array or list).
        embedding_2: second embedding, with the same number of components.

    Returns:
        The dot product of the two normalised vectors.

    Raises:
        ValueError: if the embeddings have different numbers of components
            (pairing them with ``zip`` would otherwise silently drop the
            surplus and return a meaningless score).
    """
    unit_1 = get_normalise_embedding(embedding_1)
    unit_2 = get_normalise_embedding(embedding_2)
    if len(unit_1) != len(unit_2):
        raise ValueError(
            f"embedding dimensions differ: {len(unit_1)} vs {len(unit_2)}"
        )
    # Once normalised, cosine similarity is just the dot product.
    return norm_ed_cosine_sim(unit_1, unit_2)
def norm_ed_cosine_sim(embedding_1, embedding_2):
    """Dot product of two embeddings, component by component.

    No normalisation is performed here, so the result is the cosine
    similarity only when both inputs already have unit length. Components are
    paired with ``zip``, so any surplus components of the longer input are
    ignored.
    """
    dot_product = 0
    paired_components = zip(embedding_1, embedding_2)
    for left, right in paired_components:
        dot_product = dot_product + left * right
    return dot_product

In [11]:
model_output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.6573,  0.2178,  0.0891,  ..., -0.2121, -0.3139, -0.0860],
         [ 0.3542,  0.2991,  0.4106,  ..., -0.1956, -0.4351, -0.1892],
         [ 0.2387,  0.2182,  0.5341,  ..., -0.0778, -0.8269, -0.8682],
         ...,
         [ 0.4592,  0.6778,  0.0939,  ..., -0.1172, -0.4387, -0.5490],
         [ 0.4478,  0.3215, -0.0040,  ..., -0.0741, -0.4161, -0.1557],
         [ 0.4373,  0.3175,  0.0090,  ..., -0.0785, -0.4132, -0.1504]],

        [[ 0.5374, -0.4100, -0.0864,  ..., -0.7484, -0.1726,  0.1235],
         [ 0.4699, -0.3668,  0.3440,  ..., -0.7859, -0.2531,  0.0738],
         [ 0.1904, -0.1502,  0.3137,  ..., -0.4904, -0.7493,  0.1013],
         ...,
         [ 0.4872, -0.3801, -0.0678,  ..., -0.6916, -0.1074,  0.1410],
         [ 0.5912, -0.4069,  0.1806,  ..., -0.6609, -0.0664,  0.3569],
         [ 0.2079, -0.2927, -0.0442,  ..., -1.2791, -0.5430, -0.1691]]]), pooler_output=tensor([[-0.9754, -0.6804, -0.6011,  .

In [13]:
model_output[0], model_output.last_hidden_state # last_hidden_state

(tensor([[[ 0.6573,  0.2178,  0.0891,  ..., -0.2121, -0.3139, -0.0860],
          [ 0.3542,  0.2991,  0.4106,  ..., -0.1956, -0.4351, -0.1892],
          [ 0.2387,  0.2182,  0.5341,  ..., -0.0778, -0.8269, -0.8682],
          ...,
          [ 0.4592,  0.6778,  0.0939,  ..., -0.1172, -0.4387, -0.5490],
          [ 0.4478,  0.3215, -0.0040,  ..., -0.0741, -0.4161, -0.1557],
          [ 0.4373,  0.3175,  0.0090,  ..., -0.0785, -0.4132, -0.1504]],
 
         [[ 0.5374, -0.4100, -0.0864,  ..., -0.7484, -0.1726,  0.1235],
          [ 0.4699, -0.3668,  0.3440,  ..., -0.7859, -0.2531,  0.0738],
          [ 0.1904, -0.1502,  0.3137,  ..., -0.4904, -0.7493,  0.1013],
          ...,
          [ 0.4872, -0.3801, -0.0678,  ..., -0.6916, -0.1074,  0.1410],
          [ 0.5912, -0.4069,  0.1806,  ..., -0.6609, -0.0664,  0.3569],
          [ 0.2079, -0.2927, -0.0442,  ..., -1.2791, -0.5430, -0.1691]]]),
 tensor([[[ 0.6573,  0.2178,  0.0891,  ..., -0.2121, -0.3139, -0.0860],
          [ 0.3542,  0.2991,

In [12]:
model_output[1], model_output.pooler_output # pooler_output

(tensor([[-0.9754, -0.6804, -0.6011,  ..., -0.2353,  0.9816, -0.8564],
         [-0.8543, -0.5125, -0.8204,  ..., -0.0988,  0.9674, -0.7822]]),
 tensor([[-0.9754, -0.6804, -0.6011,  ..., -0.2353,  0.9816, -0.8564],
         [-0.8543, -0.5125, -0.8204,  ..., -0.0988,  0.9674, -0.7822]]))

In [36]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the English BGE-large v1.5 checkpoint from the HuggingFace Hub.
# This rebinds `tokenizer` and `model` to the same checkpoint id used by the
# earlier loading cells. To use the Chinese variant instead, load
# 'BAAI/bge-large-zh-v1.5' with the same two calls.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")

# Inference only: put the module in evaluation mode.
model.eval()

def get_sentenceS_embedding(sentenceS):
    """Embed one sentence or a list of sentences with the module-level model.

    The input is tokenized as a padded batch, run through `model` without
    gradient tracking, CLS-pooled (first token position of output index 0)
    and L2-normalised along dim 1.

    Args:
        sentenceS: a single string or a list of strings.

    Returns:
        A 2-D tensor with one unit-length row per input sentence; a single
        string still yields a one-row tensor, so index with ``[0]`` to get
        the vector itself.

    For s2p (short query to long passage) retrieval, prepend an instruction
    to each query (but not to passages) before calling this function.
    """
    batch = tokenizer(sentenceS, padding=True, truncation=True, return_tensors='pt')

    # Forward pass only; no gradients are needed.
    with torch.no_grad():
        hidden_states = model(**batch)[0]

    # CLS pooling: keep the first token's vector for every sentence.
    cls_vectors = hidden_states[:, 0]
    return torch.nn.functional.normalize(cls_vectors, p=2, dim=1)

In [23]:
encoded_input, model_output.last_hidden_state

({'input_ids': tensor([[ 101, 1045, 2066, 2017,  102,    0,    0],
         [ 101, 1045, 2215, 2000, 3102, 2017,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
         [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0],
         [1, 1, 1, 1, 1, 1, 1]])},
 tensor([[[ 0.6573,  0.2178,  0.0891,  ..., -0.2121, -0.3139, -0.0860],
          [ 0.3542,  0.2991,  0.4106,  ..., -0.1956, -0.4351, -0.1892],
          [ 0.2387,  0.2182,  0.5341,  ..., -0.0778, -0.8269, -0.8682],
          ...,
          [ 0.4592,  0.6778,  0.0939,  ..., -0.1172, -0.4387, -0.5490],
          [ 0.4478,  0.3215, -0.0040,  ..., -0.0741, -0.4161, -0.1557],
          [ 0.4373,  0.3175,  0.0090,  ..., -0.0785, -0.4132, -0.1504]],
 
         [[ 0.5374, -0.4100, -0.0864,  ..., -0.7484, -0.1726,  0.1235],
          [ 0.4699, -0.3668,  0.3440,  ..., -0.7859, -0.2531,  0.0738],
          [ 0.1904, -0.1502,  0.3137,  ..., -0.4904, -0.7493,  0.1013],
          ...,
          [ 0.4872, -0.3801, -

In [22]:
len(model_output.last_hidden_state), len(model_output.last_hidden_state[0]), len(model_output.last_hidden_state[1]), len(model_output.last_hidden_state[0][0]), len(model_output.last_hidden_state[1][0])

(2, 7, 7, 1024, 1024)

In [28]:
model_output.last_hidden_state[0]

tensor([[ 0.6573,  0.2178,  0.0891,  ..., -0.2121, -0.3139, -0.0860],
        [ 0.3542,  0.2991,  0.4106,  ..., -0.1956, -0.4351, -0.1892],
        [ 0.2387,  0.2182,  0.5341,  ..., -0.0778, -0.8269, -0.8682],
        ...,
        [ 0.4592,  0.6778,  0.0939,  ..., -0.1172, -0.4387, -0.5490],
        [ 0.4478,  0.3215, -0.0040,  ..., -0.0741, -0.4161, -0.1557],
        [ 0.4373,  0.3175,  0.0090,  ..., -0.0785, -0.4132, -0.1504]])

In [29]:
model_output.last_hidden_state[1]

tensor([[ 0.5374, -0.4100, -0.0864,  ..., -0.7484, -0.1726,  0.1235],
        [ 0.4699, -0.3668,  0.3440,  ..., -0.7859, -0.2531,  0.0738],
        [ 0.1904, -0.1502,  0.3137,  ..., -0.4904, -0.7493,  0.1013],
        ...,
        [ 0.4872, -0.3801, -0.0678,  ..., -0.6916, -0.1074,  0.1410],
        [ 0.5912, -0.4069,  0.1806,  ..., -0.6609, -0.0664,  0.3569],
        [ 0.2079, -0.2927, -0.0442,  ..., -1.2791, -0.5430, -0.1691]])

In [26]:
model_output.last_hidden_state[:, 0]

tensor([[ 0.6573,  0.2178,  0.0891,  ..., -0.2121, -0.3139, -0.0860],
        [ 0.5374, -0.4100, -0.0864,  ..., -0.7484, -0.1726,  0.1235]])

In [37]:
get_sentenceS_embedding("hi there hero")

tensor([[ 0.0619,  0.0241, -0.0185,  ...,  0.0011, -0.0292, -0.0021]])

In [38]:
get_sentenceS_embedding(["hi there hero", "yolo is the name"]) ## if accept list, then 2 output, but if not then yea!

tensor([[ 0.0619,  0.0241, -0.0185,  ...,  0.0011, -0.0292, -0.0021],
        [ 0.0253,  0.0182, -0.0108,  ..., -0.0258,  0.0063, -0.0036]])

In [39]:
def sent_cos_sim(t1, t2):
    """Cosine similarity of two sentences, each embedded in its own model call."""
    first_vector = get_sentenceS_embedding(t1)[0]
    second_vector = get_sentenceS_embedding(t2)[0]
    return cosine_sim(first_vector, second_vector)

In [40]:
sent_cos_sim("tax relief", "employee benefit")

0.6938344566627492

In [41]:
sent_cos_sim("tax relief for employees", "employee benefit")

0.801148301470235

In [42]:
sent_cos_sim("the world is green", "i hate soda")

0.4368856924218676

In [49]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [48]:
model.base_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 1024, padding_idx=0)
    (position_embeddings): Embedding(512, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-23): 24 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, 

In [54]:
def sent_cos_sim_v2_add_padding(t1, t2):
    """Cosine similarity of two sentences embedded together as one batch.

    Both sentences go through a single tokenizer/model call, so the shorter
    one is padded to the length of the longer one.
    """
    pair_embeddings = get_sentenceS_embedding([t1, t2])
    first_vector = pair_embeddings[0]
    second_vector = pair_embeddings[1]
    return cosine_sim(first_vector, second_vector)

In [55]:
sent_cos_sim_v2_add_padding("tax relief", "employee benefit")

0.6938344566627492

In [56]:
sent_cos_sim_v2_add_padding("tax relief for employees", "employee benefit")

0.801148303952786

In [57]:
sent_cos_sim_v2_add_padding("the world is green", "i hate soda")

0.43688567712712256

In [None]:
def sent_cos_sim(t1, t2):
    """Cosine similarity of two sentences, each embedded in its own model call.

    NOTE: this re-declares `sent_cos_sim`, which is already defined in an
    earlier cell with the same behaviour.
    """
    first_vector = get_sentenceS_embedding(t1)[0]
    second_vector = get_sentenceS_embedding(t2)[0]
    return cosine_sim(first_vector, second_vector)

In [85]:
def generic_sent_cos_sim(model_emb_func, t1, t2, additional_nesting=False):
    """Cosine similarity of two texts using an arbitrary embedding function.

    Args:
        model_emb_func: callable mapping one text to its embedding.
        t1: first text.
        t2: second text.
        additional_nesting: set to True when `model_emb_func` wraps the
            vector in an extra outer dimension, so that element ``[0]`` of
            its result is the actual embedding.

    Returns:
        Whatever `cosine_sim` returns for the two embeddings.
    """
    def _embed(text):
        embedding = model_emb_func(text)
        return embedding[0] if additional_nesting else embedding

    return cosine_sim(_embed(t1), _embed(t2))

def test_emb_model(model_emb_func, sent_pair_comparison_list, sorting=False, additional_nesting=False):
    """Score every sentence pair with the given embedding function.

    Args:
        model_emb_func: callable mapping one text to its embedding.
        sent_pair_comparison_list: iterable of ``(text_1, text_2)`` pairs.
        sorting: when True, order the result from highest to lowest score.
        additional_nesting: forwarded to `generic_sent_cos_sim`.

    Returns:
        dict mapping ``(text_1, text_2)`` to its similarity score, in input
        order unless `sorting` is set.
    """
    scores = {
        (comp1, comp2): generic_sent_cos_sim(model_emb_func, comp1, comp2, additional_nesting)
        for comp1, comp2 in sent_pair_comparison_list
    }
    if not sorting:
        return scores
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)
def test_emb_model_results(ending_dict, sorting = False):
    print("Similarity level:")
    #res_sum = 0
    if sorting:
        for comps, res in (sorted(ending_dict.items(), key= lambda dict_item: dict_item[1], reverse=True)):
            ## print(f"{comps[0]:20.5}-{comps[1]:5.20}: {res:.5}") # "{:min_pad.max_pad}", max pad is essentially also the max number of chars permitted!!
            print(f"{comps[0]:20.20} /-/ {comps[1]:20.20} : {res:.5}")
            #res_sum += res
    else:
        for comps, res in ending_dict.items():
            ## print(f"{comps[0]:20.5}-{comps[1]:5.20}: {res:.5}") # "{:min_pad.max_pad}", max pad is essentially also the max number of chars permitted!!
            print(f"{comps[0]:20.20} /-/ {comps[1]:20.20} : {res:.5}")
            #res_sum += res
        
    ## res_sum no purpose and stuff yet since no measure of accuracy present, like it should/should not match, dont know so cannot say the sum is good or not, etc or avg, but later on can try with these and maybe weighted based on certain comparisons more impt?
        

In [86]:
# Sentence pairs used to sanity-check the embedding similarity scores.
comparison_pair_list = [
    ("tax relief", "employee benefit"),
    ("tax relief for employees", "employee benefit"),
    ("the world is green", "i hate soda"),
    ("yearly flight tickets home for employees", "home passage"),
    ("yearly flight tickets home for employees", "subsidised flights to employee home country"),
    ("night shift", "overtime pay"),
]

In [108]:
 ## Largely redundant: probing with "get_tok_emb" shows that padding does not change the CLS token embedding, so this produces the same scores as the unbatched version.
def test_emb_model_for_v2_only(sent_pair_comparison_list, sorting=False):
    """Score every sentence pair with `sent_cos_sim_v2_add_padding`.

    Args:
        sent_pair_comparison_list: iterable of ``(text_1, text_2)`` pairs.
        sorting: when True, order the result from highest to lowest score.

    Returns:
        dict mapping ``(text_1, text_2)`` to its similarity score, in input
        order unless `sorting` is set.
    """
    scores = {
        (comp1, comp2): sent_cos_sim_v2_add_padding(comp1, comp2)
        for comp1, comp2 in sent_pair_comparison_list
    }
    if not sorting:
        return scores
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)

In [93]:
res_dict = test_emb_model(get_sentenceS_embedding, comparison_pair_list, sorting=False , additional_nesting=True)
test_emb_model_results(res_dict, sorting=True)

Similarity level:
tax relief for emplo /-/ employee benefit     : 0.80115
yearly flight ticket /-/ subsidised flights t : 0.78594
tax relief           /-/ employee benefit     : 0.69383
yearly flight ticket /-/ home passage         : 0.60292
night shift          /-/ overtime pay         : 0.59808
the world is green   /-/ i hate soda          : 0.43689


In [94]:
res_dict_2 = test_emb_model_for_v2_only(comparison_pair_list, sorting=False)
test_emb_model_results(res_dict_2, sorting=True)

Similarity level:
tax relief for emplo /-/ employee benefit     : 0.80115
yearly flight ticket /-/ subsidised flights t : 0.78594
tax relief           /-/ employee benefit     : 0.69383
yearly flight ticket /-/ home passage         : 0.60292
night shift          /-/ overtime pay         : 0.59808
the world is green   /-/ i hate soda          : 0.43689


In [95]:
get_sentenceS_embedding("hi there")

tensor([[ 0.0332,  0.0097,  0.0034,  ..., -0.0026, -0.0248, -0.0165]])

In [96]:
get_sentenceS_embedding("hey there vixen")

tensor([[ 0.0479,  0.0263,  0.0269,  ..., -0.0055, -0.0038, -0.0152]])

In [98]:
get_sentenceS_embedding(["hi there", "hey there vixen"])

tensor([[ 0.0332,  0.0097,  0.0034,  ..., -0.0026, -0.0248, -0.0165],
        [ 0.0479,  0.0263,  0.0269,  ..., -0.0055, -0.0038, -0.0152]])

In [99]:
def get_tok_emb(sentenceS):
    """Return the per-token outputs of the module-level model for the input.

    The input is tokenized as a padded batch and run through `model` without
    gradient tracking. Unlike `get_sentenceS_embedding`, no pooling and no
    normalisation are applied: element 0 of the model output is returned
    as is.

    Args:
        sentenceS: a single string or a list of strings.

    Returns:
        Tensor holding one vector per token for every input sentence
        (padding positions included).
    """
    batch = tokenizer(sentenceS, padding=True, truncation=True, return_tensors='pt')

    # Forward pass only; no gradients are needed.
    with torch.no_grad():
        token_states = model(**batch)[0]
    return token_states

In [100]:
get_tok_emb("hi there")

tensor([[[ 0.6008,  0.1745,  0.0613,  ..., -0.0468, -0.4488, -0.2984],
         [ 0.5065,  0.3489,  0.3199,  ..., -0.1757, -0.3878, -0.2594],
         [ 0.3713,  0.2677,  0.2782,  ..., -0.0365, -0.7608, -0.1022],
         [ 0.6859,  0.3338,  0.4754,  ...,  0.1192, -0.8447, -0.4623]]])

In [101]:
get_tok_emb("hey there vixen")

tensor([[[ 0.8593,  0.4729,  0.4823,  ..., -0.0984, -0.0689, -0.2731],
         [ 0.6250,  0.7473,  0.8183,  ..., -0.1553, -0.0921, -0.4358],
         [ 0.7799,  0.5088,  0.6466,  ..., -0.3468, -0.1345, -0.1214],
         ...,
         [ 0.6439,  0.3157,  0.6030,  ..., -0.3038,  0.5707, -0.6640],
         [ 0.4212,  0.4156,  0.6218,  ..., -0.4362,  0.3738, -0.2459],
         [ 0.4339,  0.2644,  0.6481,  ..., -0.3074,  0.0161, -0.4605]]])

In [107]:
get_tok_emb(["hi there", "hey there vixen"])[:,:5]

tensor([[[ 0.6008,  0.1745,  0.0613,  ..., -0.0468, -0.4488, -0.2984],
         [ 0.5065,  0.3489,  0.3199,  ..., -0.1757, -0.3878, -0.2594],
         [ 0.3713,  0.2677,  0.2782,  ..., -0.0365, -0.7608, -0.1022],
         [ 0.6859,  0.3338,  0.4754,  ...,  0.1192, -0.8447, -0.4623],
         [ 0.6471,  0.2915, -0.1407,  ...,  0.0160, -0.7138, -0.1993]],

        [[ 0.8593,  0.4729,  0.4823,  ..., -0.0984, -0.0689, -0.2731],
         [ 0.6250,  0.7473,  0.8183,  ..., -0.1553, -0.0921, -0.4358],
         [ 0.7799,  0.5088,  0.6466,  ..., -0.3468, -0.1345, -0.1214],
         [ 0.2075,  0.4856,  0.5409,  ..., -0.0751,  0.4361, -0.2125],
         [ 0.6439,  0.3157,  0.6030,  ..., -0.3038,  0.5707, -0.6640]]])