In [None]:
import torch
from transformers import BertTokenizer, BertModel

In [None]:
# Load the pre-trained BERT tokenizer and encoder; later cells pass these two
# objects to embed_text and context_words.
model_name = 'bert-base-uncased'  # checkpoint identifier given to from_pretrained
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [None]:
def embed_text(input_text, tokenizer, model, max_length=20):
    """Return one contextual embedding per token position of ``input_text``.

    The text is split with ``tokenizer.tokenize`` and mapped to ids with
    ``tokenizer.convert_tokens_to_ids``. No special tokens (such as
    ``[CLS]``/``[SEP]``) are added, so row ``i`` of the result corresponds to
    the ``i``-th entry of ``tokenizer.tokenize(input_text)``. The id sequence
    is truncated or right-padded to exactly ``max_length`` positions and run
    through ``model`` with gradient tracking disabled.

    Args:
        input_text: Raw text to embed.
        tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
            Its ``pad_token_id`` is used for padding when it is set; otherwise
            padding falls back to id 0.
        model: Callable accepting ``input_ids`` and an ``attention_mask``
            keyword, returning an object with a ``last_hidden_state`` attribute.
        max_length: Number of positions fed to the model (a notebook-chosen
            window, not a model limit). Defaults to 20.

    Returns:
        ``outputs.last_hidden_state`` with the leading batch dimension squeezed
        out: one row per position, padded positions included.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length}")

    # Tokenize and keep at most max_length ids.
    tokens = tokenizer.tokenize(input_text)
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))[:max_length]

    # Pad with the tokenizer's own pad id instead of assuming it is 0.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # Real tokens get mask 1, padding gets mask 0 so the model ignores it.
    n_real = len(token_ids)
    n_pad = max_length - n_real
    padded_token_ids = token_ids + [pad_id] * n_pad
    mask = [1] * n_real + [0] * n_pad

    # Batch of size one.
    input_ids = torch.tensor([padded_token_ids], dtype=torch.long)
    attention_mask = torch.tensor([mask], dtype=torch.long)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # Drop the batch dimension so callers can index by token position.
    return outputs.last_hidden_state.squeeze(0)

In [None]:
def context_words(input_text, tokenizer, window_size, target_word="walk"):
    """Return the tokens surrounding the first occurrence of ``target_word``.

    Args:
        input_text: Raw text to tokenize.
        tokenizer: Object providing ``tokenize(text) -> list of tokens``.
        window_size: Number of tokens to include on each side of the target.
            The window is clipped at the start and end of the token list.
        target_word: Token to centre the window on. Defaults to ``"walk"``.
            It must match a token exactly as produced by ``tokenizer.tokenize``.

    Returns:
        The slice of tokens from ``window_size`` before the target up to and
        including ``window_size`` after it (the target itself included).

    Raises:
        ValueError: If ``window_size`` is negative, or if ``target_word`` is
            not among the tokens (raised by ``list.index``).
    """
    if window_size < 0:
        raise ValueError(f"window_size must be >= 0, got {window_size}")

    tokens = tokenizer.tokenize(input_text)

    # First occurrence only; list.index raises ValueError when it is absent.
    word_index = tokens.index(target_word)

    # Clip the window to the token list; the slice end is exclusive, hence +1.
    start_pos = max(0, word_index - window_size)
    end_pos = min(len(tokens), word_index + window_size + 1)

    return tokens[start_pos:end_pos]


In [None]:
# Input texts to compare: a two-sentence passage and a single short sentence.
input_text1 = "I love to go for a walk in the park. Parks are made for walking."
input_text2 = "parks are made for walking"

# Embed each text; embed_text returns one embedding row per (padded) token position.
contextual_embeddings1 = embed_text(input_text1, tokenizer, model)
contextual_embeddings2 = embed_text(input_text2, tokenizer, model)


In [None]:
# Positions of the tokens to compare. embed_text adds no [CLS]/[SEP] tokens, so
# these are plain 0-based offsets into tokenizer.tokenize(text).
# NOTE(review): the indices are hard-coded and assume every word before the
# target is a single token — confirm with tokenizer.tokenize(...).
word_index1 = 6  # presumably "walk" in input_text1 ("i love to go for a walk ...")

# Get the contextualized representation for the word
word_embedding1 = contextual_embeddings1[word_index1]

word_index2 = 4 # presumably "walking" (fifth word of input_text2), not "park" — verify
word_embedding2 = contextual_embeddings2[word_index2]

# Uncomment to inspect a raw contextualized representation:
# print("Contextualized representation of the word:", word_embedding1)

In [None]:
from scipy.spatial.distance import cosine
# Convert the two token embeddings to NumPy arrays so SciPy can consume them.
rep_1_np = word_embedding1.numpy()
rep_2_np = word_embedding2.numpy()

# SciPy's `cosine` is a cosine *distance*; subtracting it from 1 gives the
# cosine similarity.
similarity = 1 - cosine(rep_1_np, rep_2_np)

# Print similarity score
print("Cosine similarity:", similarity)

In [None]:
# Show the tokens around the target word in input_text1 (window argument: 2).
words = context_words(input_text1, tokenizer, 2)
print(words)