In [1]:
import torch
from transformers import BertTokenizer, BertModel

In [2]:
# Checkpoint identifier shared by the tokenizer and the encoder so the
# vocabulary and the weights always come from the same release.
model_name = 'bert-base-uncased'

# Instantiate the tokenizer first, then the bare encoder (no task head).
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
def embed_text(input_text, tokenizer, model, max_length=10, add_special_tokens=False):
    """Return one contextual embedding per token position of ``input_text``.

    The text is tokenized, truncated or right-padded to exactly
    ``max_length`` positions, and run through ``model`` as a batch of one
    with gradients disabled.

    Args:
        input_text: The string to embed.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
            ``pad_token_id`` is used for padding when available; ``cls_token_id``
            and ``sep_token_id`` are required if ``add_special_tokens`` is true.
        model: Callable invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes ``last_hidden_state``.
        max_length: Fixed number of positions fed to the model. This is a
            notebook-level choice, not BERT's architectural limit.
        add_special_tokens: When true, wrap the (truncated) tokens in the
            tokenizer's CLS/SEP ids. Defaults to ``False`` so that row ``i`` of
            the result lines up with ``tokenizer.tokenize(input_text)[i]``.

    Returns:
        ``last_hidden_state`` with the leading batch dimension squeezed out;
        for a BERT encoder this is presumably ``(max_length, hidden_size)``.
        Rows at padded positions are still returned.

    Raises:
        ValueError: If ``max_length`` is too small to hold any token (or the
            two special tokens plus one token when ``add_special_tokens``).
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if add_special_tokens and max_length < 3:
        raise ValueError("max_length must be at least 3 when add_special_tokens=True")

    # Tokenization
    tokens = tokenizer.tokenize(input_text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Truncation (reserving two slots for CLS/SEP when they are requested)
    if add_special_tokens:
        kept_ids = token_ids[:max_length - 2]
        token_ids = [tokenizer.cls_token_id] + kept_ids + [tokenizer.sep_token_id]
    else:
        token_ids = token_ids[:max_length]

    # Ask the tokenizer for its pad id instead of assuming it is 0.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # Right-pad to max_length; the mask marks real tokens with 1, padding with 0.
    num_real = len(token_ids)
    num_pad = max_length - num_real
    padded_token_ids = token_ids + [pad_id] * num_pad
    attention_mask = [1] * num_real + [0] * num_pad

    # Convert to PyTorch tensors with a batch dimension of one.
    input_ids = torch.tensor([padded_token_ids])
    attention_mask = torch.tensor([attention_mask])

    # Model inference (no autograd bookkeeping needed for feature extraction)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    return outputs.last_hidden_state.squeeze(0)

In [34]:
# Two sentences that share vocabulary ("walk"/"walking", "park"/"parks")
# but use it in different contexts.
input_text1, input_text2 = (
    "I love to go for a walk in the park.",
    "parks are made for walking",
)

# One row of contextual embeddings per token position, per sentence.
contextual_embeddings1 = embed_text(input_text=input_text1, tokenizer=tokenizer, model=model)
contextual_embeddings2 = embed_text(input_text=input_text2, tokenizer=tokenizer, model=model)


In [37]:
# 0-based token positions of the words to compare, counted in the token
# sequence that embed_text feeds to the model.
# NOTE(review): these assume each word maps to a single WordPiece token;
# confirm with tokenizer.tokenize(...) before trusting the labels below.
word_index1 = 6  # "walk" in "I love to go for a walk in the park."
word_index2 = 4  # presumably "walking" in "parks are made for walking" (an earlier note said "park")

# Pull out the contextualized vector at each chosen position.
word_embedding1 = contextual_embeddings1[word_index1]
word_embedding2 = contextual_embeddings2[word_index2]

In [38]:
from scipy.spatial.distance import cosine
# SciPy operates on NumPy arrays, so convert both word vectors first.
rep_1_np, rep_2_np = word_embedding1.numpy(), word_embedding2.numpy()

# `cosine` returns a distance (1 - similarity); subtract from 1 to get
# the similarity itself.
cosine_distance = cosine(rep_1_np, rep_2_np)
similarity = 1 - cosine_distance

# Report the score
print("Cosine similarity:", similarity)

Cosine similarity: 0.5662438869476318
