In [1]:
import torch
from sentence_transformers import SentenceTransformer, util

In [2]:
# Load the 'all-MiniLM-L6-v2' checkpoint; the next cells call `model.encode` on it.
checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(checkpoint)

In [3]:
# Score three example sentences against a single query sentence.
sentences = [
    "The weather today is beautiful",
    "It's raining!",
    "Dogs are awesome",
]
embeddings = model.encode(sentences)
query_embedding = model.encode("today is sunny day!")

# One cosine-similarity call per sentence, so every score is printed next to its text.
for position, sentence in enumerate(sentences):
    score = util.pytorch_cos_sim(query_embedding, embeddings[position])
    print(f"score: {score}", sentence)

score: tensor([[0.6973]]) The weather today is beautiful
score: tensor([[0.5133]]) It's raining!
score: tensor([[0.1889]]) Dogs are awesome


In [4]:
# FAQ lookup table: each key is a question and each value is its answer text.
# The questions (keys) are what the search cell below embeds and ranks;
# the answers are only looked up afterwards for display.
faq = {
    "How do I get a replacement Medicare card?": "If your Medicare card was lost, stolen, or destroyed, you can request a replacement online at Medicare.gov.",
    "How do I sign up for Medicare?": "If you already get Social Security benefits, you do not need to sign up for Medicare. We will automatically enroll you in Original Medicare (Part A and Part B) when you become eligible. We will mail you the information a few months before you become eligible.",
    "What are Medicare late enrollment penalties?": "In most cases, if you don’t sign up for Medicare when you’re first eligible, you may have to pay a higher monthly premium. Find more information at https://faq.ssa.gov/en-us/Topic/article/KA-02995",
    "Will my Medicare premiums be higher because of my higher income?": "Some people with higher income may pay a larger percentage of their monthly Medicare Part B and prescription drug costs based on their income. We call the additional amount the income-related monthly adjustment amount.",
    "What is Medicare and who can get it?": "Medicare is a health insurance program for people age 65 or older. Some younger people are eligible for Medicare including people with disabilities, permanent kidney failure and amyotrophic lateral sclerosis (Lou Gehrig’s disease or ALS). Medicare helps with the cost of health care, but it does not cover all medical expenses or the cost of most long-term care.",
}

In [5]:
# Embed the FAQ questions once and rank them against a free-text user query.
questions = list(faq.keys())  # built once; each hit's corpus_id indexes into this list
corpus_embeddings = model.encode(questions)
query_emb = model.encode('do i have to pay more after my raise?')
similarities = util.semantic_search(query_emb, corpus_embeddings, top_k=3)

# Only one query was passed in, so its hits are the first (and only) entry.
for hit in similarities[0]:
    # Named `corpus_id`, not `id`, so the builtin id() stays usable in later cells.
    corpus_id, score = hit['corpus_id'], hit['score']
    question = questions[corpus_id]
    answer = faq[question]
    print(f"score:{score:.2f}, query: {question}, answer: {answer}\n")

score:0.46, query: Will my Medicare premiums be higher because of my higher income?, answer: Some people with higher income may pay a larger percentage of their monthly Medicare Part B and prescription drug costs based on their income. We call the additional amount the income-related monthly adjustment amount.

score:0.14, query: What is Medicare and who can get it?, answer: Medicare is a health insurance program for people age 65 or older. Some younger people are eligible for Medicare including people with disabilities, permanent kidney failure and amyotrophic lateral sclerosis (Lou Gehrig’s disease or ALS). Medicare helps with the cost of health care, but it does not cover all medical expenses or the cost of most long-term care.

score:0.13, query: What are Medicare late enrollment penalties?, answer: In most cases, if you don’t sign up for Medicare when you’re first eligible, you may have to pay a higher monthly premium. Find more information at https://faq.ssa.gov/en-us/Topic/article/KA-02995

In [7]:
from transformers import AutoTokenizer, AutoModel

In [9]:
# Load tokenizer and encoder from the same checkpoint id so the two always match.
# NOTE: this rebinds `model`, which In [2] bound to a SentenceTransformer; the
# earlier cells that call `model.encode(...)` expect that first object.
bert_checkpoint = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_checkpoint)
model = AutoModel.from_pretrained(bert_checkpoint)

In [16]:
# Tokenize one sentence. add_special_tokens=True puts the [CLS]/[SEP] markers in
# the printed list, so a token's list position can be compared with the rows of
# the model output produced for the same text.
text = 'The king and the queen are happy.'
tokens = tokenizer.tokenize(text, add_special_tokens=True)
print(tokens)

['[CLS]', 'the', 'king', 'and', 'the', 'queen', 'are', 'happy', '.', '[SEP]']


In [17]:
# Run the sentence through the model and inspect the per-token hidden states.
encoded_input = tokenizer(text, return_tensors='pt')
# Inference only: skip autograd bookkeeping, as get_word_embedding further down does.
with torch.no_grad():
    output = model(**encoded_input)
# One row per token (special tokens included) for the single input sentence.
print(output['last_hidden_state'].shape)

torch.Size([1, 10, 768])


In [22]:
# Rows of the first (only) sentence; positions follow the token list printed above:
# 2 -> 'king', 5 -> 'queen', 7 -> 'happy'.
token_states = output['last_hidden_state'][0]
king_emb, queen_emb, happy_emb = (token_states[pos] for pos in (2, 5, 7))

# pytorch_cos_sim returns a 1x1 matrix here; [0][0] pulls out the single value.
print(f'similarity (king vs queen): {util.pytorch_cos_sim(king_emb, queen_emb)[0][0]}')
print(f'similarity (king vs happy): {util.pytorch_cos_sim(king_emb, happy_emb)[0][0]}')

similarity (king vs queen): 0.7920713424682617
similarity (king vs happy): 0.5239201784133911


In [23]:
# Same word, different context: encode a second sentence that also contains 'king'.
text = "The angry and unhappy king"
encoded_input = tokenizer(text, return_tensors='pt')
# Inference only: skip autograd bookkeeping, as get_word_embedding further down does.
with torch.no_grad():
    output = model(**encoded_input)
print(output['last_hidden_state'].shape)

torch.Size([1, 7, 768])


In [24]:
# Show the token list for the current `text`, [CLS]/[SEP] included, to find the
# position of 'king' used in the next cell.
tokenizer.tokenize(text, add_special_tokens=True)

['[CLS]', 'the', 'angry', 'and', 'unhappy', 'king', '[SEP]']

In [25]:
# 'king' sits at position 5 in this sentence's token list; compare it with the
# 'king' vector taken from the first sentence.
king2_emb = output['last_hidden_state'][0, 5]
king_similarity = util.pytorch_cos_sim(king_emb, king2_emb)[0][0]
print(f'similarity (king1 vs king2): {king_similarity}')

similarity (king1 vs king2): 0.5740004777908325


In [26]:
# A single word can come back as several pieces; the continuation piece is
# printed with a '##' prefix (see the output below).
print(tokenizer.tokenize('tokenization'))

['token', '##ization']


In [28]:
# Markup is tokenized as well: angle brackets and the slash come out as separate
# tokens, and the newlines inside the literal produce no tokens (see the output below).
text ="""
<html>
</html>
"""
tokenizer.tokenize(text, add_special_tokens=True)

['[CLS]', '<', 'html', '>', '<', '/', 'html', '>', '[SEP]']

In [66]:
# Encode a sentence whose last word is split into several word pieces.
text = 'this is about tokenization'
encoded_input = tokenizer(text, return_tensors='pt')
# Inference only: skip autograd bookkeeping, as get_word_embedding further down does.
with torch.no_grad():
    output = model(**encoded_input)

In [67]:
# Token list for the current `text`, [CLS]/[SEP] included, to locate the pieces
# of 'tokenization' by position.
tokenizer.tokenize(text, add_special_tokens=True)

['[CLS]', 'this', 'is', 'about', 'token', '##ization', '[SEP]']

In [68]:
# 'token' and '##ization' occupy positions 4 and 5 of the token list shown above.
word_token_indices = [4, 5]
last_hidden = output['last_hidden_state']
# Take the first (only) sentence, then keep just the rows for those two pieces.
word_embeddings = last_hidden[0][word_token_indices]
word_embeddings.shape

torch.Size([2, 768])

In [69]:
import torch
# Average the word-piece vectors into one vector for the whole word.
# Recomputed from `output` instead of from `word_embeddings` itself, so running
# this cell a second time cannot average an already-averaged tensor.
word_embeddings = torch.mean(output['last_hidden_state'][0, word_token_indices], dim=0)

In [70]:
# Vocabulary ids for the current `text`, without the special tokens.
tokenizer.encode(text, add_special_tokens=False)

[2023, 2003, 2055, 19204, 3989]

In [71]:
# Ids of the word encoded on its own; per the outputs they equal the last two ids
# of the sentence encoded in the previous cell.
tokenizer.encode('tokenization', add_special_tokens=False)

[19204, 3989]

In [72]:
def get_word_embedding(text, word):
    """Return the contextual embedding of `word` as it occurs in `text`.

    `word` is encoded on its own (no special tokens) and its id sequence is
    searched for as a contiguous run inside the ids of `text`. The rows of
    `last_hidden_state` at the matched positions are averaged into one vector.
    If the run occurs several times, all occurrences are averaged together.

    Uses the notebook-level `tokenizer` and `model`.

    Args:
        text: Sentence to run through the model.
        word: Word whose embedding is wanted; it must occur in `text`.

    Returns:
        1-D tensor: the mean over the matched token rows.

    Raises:
        ValueError: if the id sequence of `word` does not occur in `text`
            (previously this produced a NaN vector from an empty mean).
    """
    encoded_input = tokenizer(text, return_tensors='pt')
    input_ids = encoded_input['input_ids'][0].tolist()
    word_ids = list(tokenizer.encode(word, add_special_tokens=False))
    span = len(word_ids)

    # Match the whole id sequence rather than single ids, so an unrelated token
    # that merely shares one word-piece id with `word` is not averaged in.
    matched = set()
    if span:
        for start in range(len(input_ids) - span + 1):
            if input_ids[start:start + span] == word_ids:
                matched.update(range(start, start + span))
    if not matched:
        raise ValueError(f"{word!r} was not found in the tokenized text {text!r}")
    word_indices = sorted(matched)

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        output = model(**encoded_input)
    word_embeddings = output['last_hidden_state'][0, word_indices]
    return torch.mean(word_embeddings, dim=0)

In [73]:
# The helper should reproduce the vector that was averaged by hand above.
word_embeddings2 = get_word_embedding(text, 'tokenization')
print(word_embeddings2.shape)
matches_manual = torch.allclose(word_embeddings, word_embeddings2)
print(matches_manual)

torch.Size([768])
True


In [74]:
# 'king' vs 'queen' when the rest of the sentence is identical.
king_vec = get_word_embedding('the king is angry', 'king')
queen_vec = get_word_embedding('the queen is angry', 'queen')
util.pytorch_cos_sim(king_vec, queen_vec)

tensor([[0.8564]])

In [75]:
# 'king' vs 'queen' when the surrounding sentences differ in their last word.
king_vec = get_word_embedding('the king is angry', 'king')
queen_vec = get_word_embedding('the queen is happy', 'queen')
util.pytorch_cos_sim(king_vec, queen_vec)

tensor([[0.8059]])