In [1]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 23.3.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# importing libraries
import random
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# One seed value shared by every random number generator used in this notebook
random_seed = 42

# Seed Python's built-in RNG first, then PyTorch
for seed_fn in (random.seed, torch.manual_seed):
    seed_fn(random_seed)

# When a GPU is visible, seed all CUDA devices too
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

In [4]:
# Both the tokenizer and the model are loaded from the same pretrained checkpoint
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [5]:

# Input text, wrapped in a list because the tokenizer is called on a batch
text = "GeeksforGeeks is a computer science portal"
textList = [text]

# Tokenize and encode the batch by calling the tokenizer directly
# (batch_encode_plus is deprecated in favour of tokenizer.__call__).
# The result is a dict-like object holding the token IDs and attention masks.
encoding = tokenizer(
    textList,                  # List of input texts
    padding=True,              # Pad every text to the longest one in the batch
    truncation=True,           # Truncate to the maximum accepted length if necessary
    return_tensors='pt',       # Return PyTorch tensors
    add_special_tokens=True    # Add the special tokens [CLS] and [SEP]
)

# Token IDs, one row per input text
input_ids = encoding['input_ids']
print(f"Input ID: {input_ids}")

# Attention mask: marks which positions the model should attend to
attention_mask = encoding['attention_mask']
print(f"Attention mask: {attention_mask}")

Input ID: tensor([[  101, 29294, 22747, 21759,  4402,  5705,  2003,  1037,  3274,  2671,
          9445,   102]])
Attention mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [6]:
# Run the encoded text through BERT; no gradients are tracked for this forward pass
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# The last hidden state holds one embedding vector per input token
word_embeddings = outputs.last_hidden_state

# Report the dimensions of the embedding tensor
print(f"Shape of Word Embeddings: {word_embeddings.shape}")

Shape of Word Embeddings: torch.Size([1, 12, 768])


[1, 12, 768]
1 = Number of sentences in the batch
12 = Number of tokens in the input (including the special [CLS] and [SEP] tokens)
768 = Each token is represented by a 768-dimensional vector

In [8]:
# Token IDs -> text, leaving out the special tokens
decoded_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
# Decoded text -> sub-word tokens, kept for reference
tokenized_text = tokenizer.tokenize(decoded_text)
# Original text -> token IDs again, returned as a PyTorch tensor
encoded_text = tokenizer.encode(text, return_tensors='pt')

# Show the three results in the order they were computed
print(f"Decoded Text: {decoded_text}")
print(f"tokenized Text: {tokenized_text}")
print(f"Encoded Text: {encoded_text}")

Decoded Text: geeksforgeeks is a computer science portal
tokenized Text: ['geek', '##sf', '##org', '##ee', '##ks', 'is', 'a', 'computer', 'science', 'portal']
Encoded Text: tensor([[  101, 29294, 22747, 21759,  4402,  5705,  2003,  1037,  3274,  2671,
          9445,   102]])


In [10]:


# Print the embedding of every token, paired with the token that produced it.
#
# The token strings are recovered directly from input_ids so that they line up
# with the rows of word_embeddings[0]: that tensor has one row per input ID,
# including the special [CLS] and [SEP] tokens. The list returned by
# tokenizer.tokenize() omits those special tokens, so zipping against it
# would pair each token with the wrong row and silently drop the last two.
tokens_with_special = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
for token, embedding in zip(tokens_with_special, word_embeddings[0]):
    print(f"Token: {token}")
    print(f"Embedding: {embedding}")
    print("\n")

Embedding: tensor([-2.4299e-01, -2.2849e-01,  5.8441e-02,  5.7859e-03, -4.3398e-01,
        -3.4388e-01,  9.6974e-02,  3.6446e-01, -6.3829e-02, -2.3413e-01,
        -3.2477e-01, -4.9730e-01, -3.0048e-01,  3.5098e-01, -4.8904e-01,
        -1.2836e-01, -5.5042e-01,  4.0802e-02, -3.2041e-01, -1.6057e-01,
        -5.1553e-01, -6.8781e-01, -3.1180e-02, -3.0534e-01, -5.4216e-02,
        -3.0744e-03,  1.3321e-01, -1.6943e-01,  8.4898e-02,  9.0043e-02,
        -2.7585e-01,  2.1080e-01,  1.8501e-01,  1.0148e-01, -4.6670e-02,
        -6.0020e-02, -3.8475e-01, -1.5766e-01,  1.5135e-01,  4.8828e-01,
        -8.5848e-02,  1.5162e-01, -5.1325e-02,  3.3067e-01, -3.7622e-01,
         1.4607e-02, -2.5663e+00,  1.3022e-02, -4.0821e-01, -3.8914e-01,
        -2.0393e-01,  2.3422e-01, -1.2840e-01,  2.7005e-01,  2.8026e-01,
         4.2757e-01,  9.1298e-02,  4.1925e-01, -4.9036e-01, -5.7249e-02,
        -9.7080e-02,  4.0584e-01,  1.1193e-02, -1.3692e-01, -1.0951e-01,
        -1.2563e-01, -1.2500e-01,  2.897

In [12]:
# Sentence embedding = mean of the token embeddings.
# dim=1 is the sequence-length axis, so the token dimension is averaged away.
# NOTE(review): a plain mean would also average padding positions if a padded
# batch were used; the attention mask printed earlier is all ones, so no
# padding is involved here.
sentence_embedding = torch.mean(word_embeddings, dim=1)

# Show the embedding under its heading
print("Sentence Embedding:", sentence_embedding, sep="\n")

# Report the dimensions of the sentence embedding
print(f"Shape of Sentence Embedding: {sentence_embedding.shape}")

Sentence Embedding:
tensor([[-1.2731e-01,  2.3766e-01,  1.6280e-01,  1.7505e-01,  2.1393e-01,
         -7.2085e-01, -1.1638e-01,  5.5303e-01, -2.4897e-01, -3.5928e-02,
         -9.9867e-02, -5.9745e-01, -1.2874e-02,  4.0385e-01, -4.7625e-01,
          9.3285e-02, -3.1485e-01,  1.4257e-02, -3.1248e-01, -1.5662e-01,
         -1.8107e-01, -2.4591e-01, -9.8348e-02,  5.4759e-01,  1.2483e-01,
         -1.4749e-01,  1.3483e-01, -7.2541e-02, -1.9855e-01,  1.4454e-01,
          2.8442e-01,  5.6379e-01,  5.4474e-02,  2.2264e-02, -5.7517e-01,
         -1.2750e-01, -3.6029e-01,  3.5858e-02, -4.9745e-02,  8.4347e-01,
          1.1492e-02, -2.9697e-01, -5.3173e-02,  2.8290e-01, -1.8621e-01,
         -2.4351e-01, -2.6526e-02,  9.3529e-02,  4.0381e-03,  7.7375e-02,
         -7.1318e-01,  3.2485e-01, -3.3769e-01, -4.2121e-02,  3.3158e-01,
          6.6863e-01,  2.6681e-01, -3.5177e-01, -2.5513e-01, -5.6468e-02,
          1.4842e-01,  2.8006e-01, -3.6460e-02, -2.7066e-01,  1.2288e-01,
          5.4925e-

In [13]:
# Second sentence, to be compared against the original one
example_sentence = "GeeksforGeeks is a technology website"

# Tokenize and encode the example sentence by calling the tokenizer directly
# (batch_encode_plus is deprecated in favour of tokenizer.__call__)
example_encoding = tokenizer(
    [example_sentence],
    padding=True,
    truncation=True,
    return_tensors='pt',
    add_special_tokens=True
)
example_input_ids = example_encoding['input_ids']
example_attention_mask = example_encoding['attention_mask']

# Generate the sentence embedding the same way as for the original text:
# forward pass without gradient tracking, then mean over the token axis
with torch.no_grad():
    example_outputs = model(example_input_ids, attention_mask=example_attention_mask)
    example_sentence_embedding = example_outputs.last_hidden_state.mean(dim=1)

# Cosine similarity between the original and the example sentence embeddings;
# the result is a matrix, and entry [0][0] is the score for this single pair
similarity_score = cosine_similarity(sentence_embedding, example_sentence_embedding)

# Print the similarity score
print("Cosine Similarity Score:", similarity_score[0][0])

Cosine Similarity Score: 0.9561722
