In [1]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# Load the tokenizer and the encoder from the same pretrained checkpoint so
# that the token ids produced by one are valid inputs for the other.
_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_CHECKPOINT)
model = BertModel.from_pretrained(_CHECKPOINT)

def get_word_embedding(sentence, word):
    """Return the contextual embedding of *word* as it is used in *sentence*.

    The sentence is run through the module-level ``tokenizer`` and ``model``
    and the vector for the first occurrence of ``word`` is read from the
    model's last hidden state.

    A word may be split into several wordpieces by the tokenizer. The whole
    wordpiece sequence is matched (not just its first piece), and the hidden
    states of all matched pieces are averaged. For a word that maps to a
    single token this is exactly that token's hidden state.

    Args:
        sentence: Text that contains ``word``.
        word: The word whose in-context embedding is wanted.

    Returns:
        A 1-D NumPy array holding the (mean-pooled) last-hidden-state vector.

    Raises:
        ValueError: If ``word`` yields no tokens, or if its token sequence
            does not occur in the tokenized sentence.
    """
    word_tokens = tokenizer.tokenize(word)
    if not word_tokens:
        raise ValueError(f"{word!r} does not produce any tokens")

    inputs = tokenizer(sentence, return_tensors='pt')
    input_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Locate the first position where the full wordpiece sequence appears.
    # Matching only the first piece could land on an unrelated word that
    # merely shares that piece.
    span = len(word_tokens)
    start = next(
        (
            pos
            for pos in range(len(input_tokens) - span + 1)
            if input_tokens[pos:pos + span] == word_tokens
        ),
        None,
    )
    if start is None:
        raise ValueError(f"{word!r} was not found in sentence {sentence!r}")

    # Inference only: the forward pass runs after validation, so a bad
    # request fails before any model work is done.
    with torch.no_grad():
        outputs = model(**inputs)

    # Index 0 selects the single sentence in the batch.
    last_hidden_state = outputs.last_hidden_state[0]

    # Average over the matched wordpieces (a no-op for a single piece).
    return last_hidden_state[start:start + span].mean(dim=0).numpy()

# Sentences that each use the word 'bank' in a different context
sentences = [
    "I need to go to the bank to deposit some money.",
    "The river bank was overgrown with wildflowers.",
    "The pilot had to bank the airplane to avoid turbulence.",
    "You can bank on me to finish the project on time.",
    "The food bank is collecting donations for the homeless.",
    "The central bank announced new interest rates to stabilize the economy."
]

# One contextual vector for 'bank' per sentence, in the same order as `sentences`
embeddings = [
    get_word_embedding(text, 'bank')
    for text in sentences
]

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between vectors *v1* and *v2*.

    Args:
        v1: First vector (any 1-D array-like accepted by NumPy).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``. If either vector has zero length
        the cosine is undefined; ``0.0`` is returned in that case instead of
        dividing by zero and producing ``nan`` with a runtime warning.
    """
    denominator = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denominator == 0:
        return 0.0
    return np.dot(v1, v2) / denominator

# Build the square matrix whose (row, col) entry is the cosine similarity
# between embedding `row` and embedding `col`
count = len(embeddings)
similarities = np.zeros((count, count))
for row, left in enumerate(embeddings):
    for col, right in enumerate(embeddings):
        similarities[row, col] = cosine_similarity(left, right)

# Show every sentence together with a short preview of its 'bank' vector
for number, (text, vector) in enumerate(zip(sentences, embeddings), start=1):
    print(f"Sentence {number}: {text}")
    print(f"Embedding for 'bank' (first 5 dimensions): {vector[:5]}")
    print()

# Dump the similarity matrix, one line per row, each value followed by a tab
print("Cosine Similarities:")
for matrix_row in similarities:
    print("".join(f"{value:.4f}\t" for value in matrix_row))

# Find the most similar pair of distinct sentences.
# The search starts from -inf rather than 0: cosine similarity can be zero or
# negative, and a 0 start would then never be beaten, leaving the meaningless
# default pair (0, 0) to be reported. Only j > i is visited, so a sentence is
# never compared with itself and each pair is considered once.
max_similarity = -np.inf
max_pair = None
pair_count = len(similarities)
for i in range(pair_count):
    for j in range(i + 1, pair_count):
        if similarities[i][j] > max_similarity:
            max_similarity = similarities[i][j]
            max_pair = (i, j)

if max_pair is None:
    # Fewer than two sentences (or no comparable values): nothing to report.
    print("\nNo pair of sentences available to compare.")
else:
    first, second = max_pair
    print("\nMost similar pair of sentences for 'bank':")
    print(f"Sentence {first+1} and Sentence {second+1}")
    print(f"Similarity: {max_similarity:.4f}")
    print(f"\nSentence {first+1}: {sentences[first]}")
    print(f"Sentence {second+1}: {sentences[second]}")



Sentence 1: I need to go to the bank to deposit some money.
Embedding for 'bank' (first 5 dimensions): [ 0.64034605 -0.18108568  0.11862013 -0.2020568   1.3855611 ]

Sentence 2: The river bank was overgrown with wildflowers.
Embedding for 'bank' (first 5 dimensions): [-0.17355233 -0.5053988  -0.19030745 -0.51391476 -0.2884438 ]

Sentence 3: The pilot had to bank the airplane to avoid turbulence.
Embedding for 'bank' (first 5 dimensions): [ 0.23946215 -0.21487379 -0.16276157  0.3818572   0.22607988]

Sentence 4: You can bank on me to finish the project on time.
Embedding for 'bank' (first 5 dimensions): [ 1.0760132  -0.6041027   0.3984626  -0.35869265  1.0072378 ]

Sentence 5: The food bank is collecting donations for the homeless.
Embedding for 'bank' (first 5 dimensions): [ 0.21331732 -0.6920833  -0.1706988   0.0951876   1.011391  ]

Sentence 6: The central bank announced new interest rates to stabilize the economy.
Embedding for 'bank' (first 5 dimensions): [-0.04606444 -0.13416706 -