In [1]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

For the tokenizer to work properly we need to add some tokens of our own.  One being "[CLS]" which marks the beginning of a text entry and the other being "[SEP]" which is the token that we put in between sentences within the same text entry.  So lets put together the following text.  "We are learning something very cool right now. Pay attention". After we put the tokens in the right places we get the following:

In [14]:
sample_text = "[CLS] We are learning something very cool right now about embedding [SEP] Pay attention"

Now we can tokenize the text above.

In [15]:
tokens = tokenizer.tokenize(sample_text)
print(tokens)

['[CLS]', 'we', 'are', 'learning', 'something', 'very', 'cool', 'right', 'now', 'about', 'em', '##bed', '##ding', '[SEP]', 'pay', 'attention']


In [21]:
input_ids = tokenizer.convert_tokens_to_ids(tokens)
for i in range(0, 16):
    print(tuple((tokens[i], input_ids[i])))

('[CLS]', 101)
('we', 2057)
('are', 2024)
('learning', 4083)
('something', 2242)
('very', 2200)
('cool', 4658)
('right', 2157)
('now', 2085)
('about', 2055)
('em', 7861)
('##bed', 8270)
('##ding', 4667)
('[SEP]', 102)
('pay', 3477)
('attention', 3086)


In [25]:
with torch.no_grad():
    # ids -> hidden state vectors
    input_tensor = torch.LongTensor(input_ids).cuda().view(-1,1)
    input_mask = torch.LongTensor(input_mask).cuda().view(-1,1)
    input_type_ids = torch.LongTensor(input_type_ids).cuda().view(-1,1)
    
    encoded_layers, _ = model(input_tensor, token_type_ids=input_type_ids, attention_mask=input_mask)

NameError: name 'input_mask' is not defined

In [22]:
# Define example sentence with multiple meanings of the word "bank"
text = "After stealing money from the bank vault, the bank robber was seen " \
       "fishing on the Mississippi river bank."

# Add the special tokens.
marked_text = "[CLS] " + text + " [SEP]"

# Split the sentence into tokens.
tokenized_text = tokenizer.tokenize(marked_text)

indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(tokenized_text)

In [24]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

In [1]:
from bert_serving.client import BertClient
bc = BertClient()

In [17]:
sample = ["We are learning something very cool right now", "Pay attention"]
embedding = bc.encode(sample)

In [22]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
print(cosine_similarity([embedding[0]], [embedding[1]])[0][0])
print(cosine_similarity([embedding[0]], [embedding[0]])[0][0])

0.5538108
1.0000001


In [15]:
embedding[0]

array([ 7.76866972e-02, -3.24376851e-01,  1.08688269e-02,  2.16468170e-01,
        4.34082925e-01, -3.97152722e-01, -1.02522179e-01,  8.51855278e-01,
       -2.14468688e-01, -5.11398852e-01,  2.00513691e-01, -4.32051234e-02,
        8.13675672e-02,  4.80618179e-01, -2.45117947e-01,  7.56960034e-01,
       -4.63516146e-01, -1.32480785e-01,  1.58990890e-01,  4.52834427e-01,
       -8.32417309e-01,  1.62431151e-02, -2.33785823e-01,  5.35807550e-01,
        1.13558635e-01, -5.77926040e-01,  4.47058588e-01, -4.40508783e-01,
       -4.12556261e-01,  4.50727642e-02,  2.32749432e-01,  2.53855996e-02,
        8.56867582e-02, -1.30302384e-01,  1.32174954e-01, -4.30623114e-01,
       -1.20438099e-01,  8.52979794e-02, -4.44109827e-01, -2.57052600e-01,
       -3.92818362e-01, -3.79355818e-01,  1.59340680e-01, -2.10484535e-01,
       -5.07567823e-02, -1.72329098e-01, -2.14047760e-01, -1.54110730e-01,
        3.34607661e-01, -2.60638744e-01, -5.91038585e-01,  6.46616817e-02,
       -1.30034178e-01,  

In [30]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('bert-base-nli-mean-tokens')

In [32]:
sample = ["We are learning something very cool right now", "Pay attention"]
sentence_embeddings = model.encode(sample)

In [33]:
sample2 = ["We are learning something very cool right now. Pay attention"]
sentence_embeddings2 = model.encode(sample2)

In [38]:
average = np.mean([sentence_embeddings[0], sentence_embeddings[1]], axis = 0)

In [39]:
big_sentence = sentence_embeddings2

In [45]:
cosine_similarity(big_sentence, [average])

array([[0.91829956]], dtype=float32)

In [29]:
print(cosine_similarity([sentence_embeddings[0]], [sentence_embeddings[1]])[0][0])
print(cosine_similarity([sentence_embeddings[0]], [sentence_embeddings[0]])[0][0])

0.64304817
1.0000001
