https://osanseviero.github.io/hackerllama/blog/posts/sentence_embeddings/ (ref)

In [2]:
!pip install sentence_transformers

Installing collected packages: nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, nvidia-cusparse-cu12, nvidia-cudnn-cu12, nvidia-cusolver-cu12, sentence_transformers
Successfully installed nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 sentence_transformers-2.6.0


In [8]:
from sentence_transformers import SentenceTransformer, util

In [15]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [5]:
sentences = ["The weather today is beautiful", "It's raining!", "Dogs are awesome"]
embeddings = model.encode(sentences)
embeddings.shape

(3, 384)

In [12]:
# Embed the query once, then print its cosine similarity to every
# pre-computed sentence embedding alongside the sentence itself.
first_embedding = model.encode('today is sunny day')
scored_pairs = zip(embeddings, sentences)
for candidate_vector, candidate_text in scored_pairs:
  print(f"score:{util.pytorch_cos_sim(first_embedding, candidate_vector)} ({candidate_text})")

score:tensor([[0.7190]]) (The weather today is beautiful)
score:tensor([[0.3898]]) (It's raining!)
score:tensor([[0.1043]]) (Dogs are awesome)


In [10]:
faq = {
    "How do I get a replacement Medicare card?": "If your Medicare card was lost, stolen, or destroyed, you can request a replacement online at Medicare.gov.",
    "How do I sign up for Medicare?": "If you already get Social Security benefits, you do not need to sign up for Medicare. We will automatically enroll you in Original Medicare (Part A and Part B) when you become eligible. We will mail you the information a few months before you become eligible.",
    "What are Medicare late enrollment penalties?": "In most cases, if you don’t sign up for Medicare when you’re first eligible, you may have to pay a higher monthly premium. Find more information at https://faq.ssa.gov/en-us/Topic/article/KA-02995",
    "Will my Medicare premiums be higher because of my higher income?": "Some people with higher income may pay a larger percentage of their monthly Medicare Part B and prescription drug costs based on their income. We call the additional amount the income-related monthly adjustment amount.",
    "What is Medicare and who can get it?": "Medicare is a health insurance program for people age 65 or older. Some younger people are eligible for Medicare including people with disabilities, permanent kidney failure and amyotrophic lateral sclerosis (Lou Gehrig’s disease or ALS). Medicare helps with the cost of health care, but it does not cover all medical expenses or the cost of most long-term care.",
}

In [11]:
corpus_embeddings = model.encode(list(faq.keys()))
print(corpus_embeddings.shape)

(5, 384)


In [13]:
user_question = "Do I need to pay more after a raise?"
user_question_embedding = model.encode(user_question)

In [14]:
similarities = util.semantic_search(user_question_embedding, corpus_embeddings, top_k=5)
similarities

[[{'corpus_id': 3, 'score': 0.4642062783241272},
  {'corpus_id': 4, 'score': 0.11628524214029312},
  {'corpus_id': 2, 'score': 0.09916316717863083},
  {'corpus_id': 1, 'score': 0.09463591873645782},
  {'corpus_id': 0, 'score': 0.07962210476398468}]]

In [19]:
# Materialise the FAQ questions and answers once so each hit is a plain index
# lookup. Dicts keep insertion order, so position k here is the same question
# that was encoded as row k of corpus_embeddings.
faq_questions = list(faq.keys())
faq_answers = list(faq.values())
for result in similarities[0]:
  corpus_id = result['corpus_id']
  question = faq_questions[corpus_id]
  answer = faq_answers[corpus_id]
  score = result['score']
  print(f"score:{score}\nquestion:{question}\nanswer:{answer}\n")

score:0.4642062783241272
question:Will my Medicare premiums be higher because of my higher income?
answer:Some people with higher income may pay a larger percentage of their monthly Medicare Part B and prescription drug costs based on their income. We call the additional amount the income-related monthly adjustment amount.

score:0.11628524214029312
question:What is Medicare and who can get it?
answer:Medicare is a health insurance program for people age 65 or older. Some younger people are eligible for Medicare including people with disabilities, permanent kidney failure and amyotrophic lateral sclerosis (Lou Gehrig’s disease or ALS). Medicare helps with the cost of health care, but it does not cover all medical expenses or the cost of most long-term care.

score:0.09916316717863083
question:What are Medicare late enrollment penalties?
answer:In most cases, if you don’t sign up for Medicare when you’re first eligible, you may have to pay a higher monthly premium. Find more information

In [20]:
from transformers import AutoModel, AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

In [39]:
text = "The king and the queen are happy."
t = tokenizer.tokenize(text,add_special_tokens=True)
print(len(t))
t

10


['[CLS]', 'the', 'king', 'and', 'the', 'queen', 'are', 'happy', '.', '[SEP]']

In [41]:
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)
output['last_hidden_state'].shape

torch.Size([1, 10, 768])

In [42]:
king_embedding = output['last_hidden_state'][0][2]
queen_embedding = output['last_hidden_state'][0][5]
print(f'score:{util.pytorch_cos_sim(king_embedding, queen_embedding)} ')

score:tensor([[0.7921]], grad_fn=<MmBackward0>) 


In [43]:
happy_embedding = output.last_hidden_state[0][7]  # happy
util.pytorch_cos_sim(king_embedding, happy_embedding)

tensor([[0.5239]], grad_fn=<MmBackward0>)

In [47]:
text = "The angry and unhappy king"
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)
output["last_hidden_state"].shape


torch.Size([1, 7, 768])

In [48]:
tokenizer.tokenize(text, add_special_tokens=True)

['[CLS]', 'the', 'angry', 'and', 'unhappy', 'king', '[SEP]']

In [49]:
king_embedding_2 = output["last_hidden_state"][0][5]
util.pytorch_cos_sim(king_embedding, king_embedding_2)

tensor([[0.5740]], grad_fn=<MmBackward0>)

In [51]:
tokenizer.tokenize('tokenization')

['token', '##ization']

In [52]:
text = "this is about tokenization"
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)

In [53]:
tokenizer.tokenize(text, add_special_tokens=True)

['[CLS]', 'this', 'is', 'about', 'token', '##ization', '[SEP]']

In [54]:
word_token_indices = [4, 5]
word_embeddings = output["last_hidden_state"][0, word_token_indices]
word_embeddings.shape

torch.Size([2, 768])

In [56]:
import torch
torch.mean(word_embeddings, dim=0).shape

torch.Size([768])

In [65]:
def get_word_embedding(text, word):
    """Return the contextual embedding of `word` as it occurs inside `text`.

    `text` is run through the global `tokenizer` and `model`; the hidden states
    of the tokens that make up `word` are averaged into a single vector. When
    `word` occurs several times, all occurrences contribute to the average.

    Args:
        text: Sentence that provides the context.
        word: Word whose embedding is wanted; it may split into several tokens.

    Returns:
        1-D tensor: the mean of the last-layer hidden states of the word's tokens.

    Raises:
        ValueError: if the token sequence of `word` does not occur in `text`.
    """
    encoded_input = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        output = model(**encoded_input)

    word_ids = tokenizer.encode(word, add_special_tokens=False)
    input_ids = encoded_input["input_ids"][0].tolist()

    # Match the word's tokens as a contiguous run. A plain membership test would
    # also pick up unrelated tokens that merely share an id with one piece of the
    # word (e.g. a standalone "token" when looking for "token" + "##ization").
    span = len(word_ids)
    word_token_indices = sorted(
        {
            start + offset
            for start in range(len(input_ids) - span + 1)
            if input_ids[start:start + span] == word_ids
            for offset in range(span)
        }
    )
    if not word_token_indices:
        # Averaging an empty selection would silently yield a vector of NaNs.
        raise ValueError(f"{word!r} was not found in the tokenized text {text!r}")

    word_embeddings = output["last_hidden_state"][0, word_token_indices]
    return torch.mean(word_embeddings, dim=0)

In [63]:
text = "this is about tokenization"
get_word_embedding(text, 'tokenization').shape

[19204, 3989]


torch.Size([768])

In [61]:
torch.allclose(torch.mean(word_embeddings, dim=0), get_word_embedding(text, 'tokenization'))

True

In [66]:
util.pytorch_cos_sim(
    get_word_embedding("The king is angry", "king"),
    get_word_embedding("The queen is angry", "queen"),
)

tensor([[0.8564]])

In [67]:
util.pytorch_cos_sim(
    get_word_embedding("The king is happy", "king"),
    get_word_embedding("The queen is angry", "queen"),
)

tensor([[0.8273]])

In [69]:
util.pytorch_cos_sim(
    get_word_embedding("The king and the queen are happy.", "king"),
    get_word_embedding("The angry and unhappy king", "king"),
)

tensor([[0.5740]])

In [70]:
util.pytorch_cos_sim(
    get_word_embedding("The river bank", "bank"),
    get_word_embedding("The savings bank", "bank"),
)

tensor([[0.7587]])

In [71]:
# The tokenizer output must be bound to the same name that is unpacked into the
# model call, otherwise the forward pass runs on whatever an earlier cell left
# in `encoded_input`.
encoded_input = tokenizer('Thisn is an  example sentence', return_tensors='pt')
model_output = model(**encoded_input)
# Position 0 holds the [CLS] token; its hidden state serves as the sentence vector.
sentence_embedding = model_output['last_hidden_state'][:, 0, :]
sentence_embedding.shape

torch.Size([1, 768])

In [73]:
def cls_pooling(model_output):
    """Pick the hidden state of the first token of every sequence in the batch."""
    hidden_states = model_output["last_hidden_state"]
    return hidden_states[:, 0]

def get_sentence_embedding(text):
    """Encode `text` with the global tokenizer/model and pool it with cls_pooling.

    The forward pass runs under torch.no_grad(), so no autograd graph is built
    for the model call.
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    pooled = cls_pooling(outputs)
    return pooled

In [75]:
embeddings = [get_sentence_embedding(sentence) for sentence in sentences]
query_embedding = get_sentence_embedding("Today is a sunny day")
for embedding, sentence in zip(embeddings, sentences):
    similarity = util.pytorch_cos_sim(query_embedding, embedding)
    print(similarity, sentence)

tensor([[0.9261]]) The weather today is beautiful
tensor([[0.8903]]) It's raining!
tensor([[0.9317]]) Dogs are awesome


In [77]:
def cls_pooling(model_output):
    """Return the model's pooled first-token representation for each sequence.

    BERT's forward pass already applies its pooler head to the last hidden state
    and exposes the result as `pooler_output`. Reading it from `model_output`
    gives the same values as calling `model.pooler(...)` again, without depending
    on the global `model` and without running the pooler a second time outside
    the caller's `torch.no_grad()` block (which would attach a grad_fn).

    Args:
        model_output: Output of a BERT-style model that has a pooling layer.

    Returns:
        The `pooler_output` tensor stored in `model_output`.
    """
    return model_output["pooler_output"]  # we changed this


embeddings = [get_sentence_embedding(sentence) for sentence in sentences]
query_embedding = get_sentence_embedding("Today is a sunny day")
for embedding, sentence in zip(embeddings, sentences):
    similarity = util.pytorch_cos_sim(query_embedding, embedding)
    print(similarity, sentence)

tensor([[0.9673]], grad_fn=<MmBackward0>) The weather today is beautiful
tensor([[0.9029]], grad_fn=<MmBackward0>) It's raining!
tensor([[0.8930]], grad_fn=<MmBackward0>) Dogs are awesome


In [78]:
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
encoded_input = tokenizer("Today is a sunny day", return_tensors="pt")
model_output = model(**encoded_input)

In [79]:
token_embeddings = model_output["last_hidden_state"]
token_embeddings.shape

torch.Size([1, 7, 384])

In [80]:
mean_embedding = torch.mean(token_embeddings, dim=1)
mean_embedding.shape

torch.Size([1, 384])

In [82]:
import torch.nn.functional as F
normalized_embedding = F.normalize(mean_embedding)
normalized_embedding.shape

torch.Size([1, 384])

In [84]:
def mean_pooling(model_output):
    """Average the token embeddings along dim 1 (the token axis)."""
    token_embeddings = model_output["last_hidden_state"]
    return token_embeddings.mean(dim=1)

def get_sentence_embedding(text):
    """Return the normalized, mean-pooled embedding of `text`.

    Tokenizes with the global `tokenizer`, runs the global `model` without
    gradient tracking, mean-pools the token embeddings and passes the result
    through F.normalize.
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    return F.normalize(mean_pooling(outputs))

get_sentence_embedding("Today is a sunny day").shape

torch.Size([1, 384])

In [99]:
def mean_pooling(model_output, attention_mask):
  """Average token embeddings over the positions marked in `attention_mask`.

  Args:
      model_output: Mapping with a 'last_hidden_state' tensor.
      attention_mask: Tensor with one entry per token position; it is broadcast
          over the embedding dimension. Non-zero marks a real token, zero marks
          padding.

  Returns:
      One vector per sequence: the masked sum of token embeddings divided by
      the number of unmasked tokens.
  """
  token_embeddings = model_output['last_hidden_state']
  input_mask_expanded = (
      attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
  )
  # Zero out padded positions before summing; without the multiplication the
  # padding embeddings leak into the numerator while the denominator only
  # counts real tokens.
  masked_sum = torch.sum(token_embeddings * input_mask_expanded, 1)
  # The clamp guards against division by zero for an all-padding row.
  token_counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
  return masked_sum / token_counts

In [100]:
def get_sentence_embedding(sentences):
  """Return normalized, mask-aware mean-pooled embeddings for `sentences`.

  The input is tokenized with padding and truncation enabled, run through the
  global `model` without gradient tracking, pooled with `mean_pooling` using
  the tokenizer's attention mask, and passed through F.normalize.
  """
  batch = tokenizer(
      sentences, padding=True, truncation=True, return_tensors="pt"
  )
  with torch.no_grad():
    outputs = model(**batch)
  pooled = mean_pooling(outputs, batch['attention_mask'])
  return F.normalize(pooled)

In [102]:
query_embedding = get_sentence_embedding("Today is a sunny day")[0]
query_embedding.shape

torch.Size([384])

In [103]:
embeddings = [get_sentence_embedding(sentence) for sentence in sentences]
for embedding, sentence in zip(embeddings, sentences):
    similarity = util.pytorch_cos_sim(query_embedding, embedding)
    print(similarity, sentence)

tensor([[0.7344]]) The weather today is beautiful
tensor([[0.4180]]) It's raining!
tensor([[0.1060]]) Dogs are awesome


In [105]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
query_embedding = model.encode("Today is a sunny day")
embeddings = model.encode(sentences)
for embedding, sentence in zip(embeddings, sentences):
    similarity = util.pytorch_cos_sim(query_embedding, embedding)
    print(similarity, sentence)

tensor([[0.7344]]) The weather today is beautiful
tensor([[0.4180]]) It's raining!
tensor([[0.1060]]) Dogs are awesome


In [None]:
!pip install datasets

In [108]:
from datasets import load_dataset

In [None]:
dataset = load_dataset('quora', split='train')
dataset

In [110]:
dataset.to_pandas().head(2)

Unnamed: 0,questions,is_duplicate
0,"{'id': [1, 2], 'text': ['What is the step by s...",False
1,"{'id': [3, 4], 'text': ['What is the story of ...",False


In [111]:
# Flatten every question pair into one list of question strings.
corpus_questions = []
for d in dataset:
    question_texts = d["questions"]["text"]
    corpus_questions.extend((question_texts[0], question_texts[1]))
# De-duplicate while keeping first-seen order. A set of strings iterates in an
# order that can differ between interpreter runs, which would make the
# corpus_questions[:N] slice embedded later pick different questions each time.
corpus_questions = list(dict.fromkeys(corpus_questions))
len(corpus_questions)

537362

In [114]:
# Only the first `questions_to_embed` corpus entries are encoded; the result is
# returned as a tensor rather than a NumPy array.
questions_to_embed = 100_000
model = SentenceTransformer("quora-distilbert-multilingual")
questions_subset = corpus_questions[:questions_to_embed]
corpus_embeddings = model.encode(
    questions_subset, convert_to_tensor=True, show_progress_bar=True
)

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

Batches:   0%|          | 0/3125 [00:00<?, ?it/s]

In [117]:
corpus_embeddings.shape

torch.Size([100000, 768])

In [118]:
import time

def search(query, top_k=5):
    """Print the corpus questions most similar to `query`, with timing.

    Uses the global `model` to embed the query and the global
    `corpus_embeddings` / `corpus_questions` as the search index.

    Args:
        query: Free-text question to look up.
        top_k: Number of hits to retrieve and print (defaults to 5).
    """
    # perf_counter is a monotonic clock, so the elapsed time cannot be skewed
    # by system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    query_embedding = model.encode(query, convert_to_tensor=True)
    # Ask for exactly the hits that will be printed instead of fetching the
    # default number and slicing afterwards.
    results = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    elapsed = time.perf_counter() - start_time

    print("Results (after {:.3f} seconds):".format(elapsed))
    for result in results[0]:
        print(
            "{:.3f}\t{}".format(result["score"], corpus_questions[result["corpus_id"]])
        )

In [123]:
search("How can I learn javascript online?")

Results (after 0.963 seconds):
0.975	What are some of the good sites to learn javascript?
0.955	What is the easiest way to learn java programming?
0.946	What is the best way to learn advance java?
0.942	How do I learn java programming as a freshman?
0.941	What are the best websites for learning java?


In [125]:
search("elonmusk?")

Results (after 1.788 seconds):
0.964	????
0.962	Teeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeest la la?
0.956	Que es permanente parcial?
0.954	What is SHIFT?
0.954	U visa in usa?
