In [1]:
from transformers import AutoTokenizer

# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
# `tokenizer` is a notebook-level name; tokenize_and_print() reads it as a global.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Show each stage of the tokenizer round trip for one piece of text
def tokenize_and_print(text):
    """Print the tokens, token IDs and decoded form of ``text``.

    Uses the notebook-level ``tokenizer``. Four labelled lines are written to
    stdout (original text, tokens, token IDs, decoded IDs), followed by one
    blank line so consecutive reports are visually separated.

    Args:
        text: The string to tokenize.

    Returns:
        None. The function only prints.
    """
    pieces = tokenizer.tokenize(text)
    piece_ids = tokenizer.convert_tokens_to_ids(pieces)

    print(f"Text: {text}")
    for label, value in (("Tokens:", pieces), ("Token IDs:", piece_ids)):
        print(label, value)
    # Decode only after the lines above are printed, so output order is unchanged.
    print("Decoded:", tokenizer.decode(piece_ids))
    print()

# Ten sentences that each contain the string "bank" in a different context.
# Most use it as a standalone word; the "Banksy" sentence contains it only as
# the start of a longer word.
examples = [
    "I need to go to the bank to deposit some money.",
    "The river bank was overgrown with wildflowers.",
    "The pilot had to bank the airplane to avoid turbulence.",
    "You can bank on me to finish the project on time.",
    "The food bank is collecting donations for the homeless.",
    "They're going to bank the fires for the night.",
    "The pool player tried to bank the shot off the cushion.",
    "The company has a large piggy bank of cash reserves.",
    "Banksy is a famous street artist.",
    "The bank holiday means all offices will be closed on Monday."
]

# Run every example through tokenize_and_print, which prints one report
# (text, tokens, token IDs, decoded text, blank line) per sentence.
for example in examples:
    tokenize_and_print(example)

Text: I need to go to the bank to deposit some money.
Tokens: ['i', 'need', 'to', 'go', 'to', 'the', 'bank', 'to', 'deposit', 'some', 'money', '.']
Token IDs: [1045, 2342, 2000, 2175, 2000, 1996, 2924, 2000, 12816, 2070, 2769, 1012]
Decoded: i need to go to the bank to deposit some money.

Text: The river bank was overgrown with wildflowers.
Tokens: ['the', 'river', 'bank', 'was', 'overgrown', 'with', 'wild', '##flower', '##s', '.']
Token IDs: [1996, 2314, 2924, 2001, 26433, 2007, 3748, 14156, 2015, 1012]
Decoded: the river bank was overgrown with wildflowers.

Text: The pilot had to bank the airplane to avoid turbulence.
Tokens: ['the', 'pilot', 'had', 'to', 'bank', 'the', 'airplane', 'to', 'avoid', 'turbulence', '.']
Token IDs: [1996, 4405, 2018, 2000, 2924, 1996, 13297, 2000, 4468, 29083, 1012]
Decoded: the pilot had to bank the airplane to avoid turbulence.

Text: You can bank on me to finish the project on time.
Tokens: ['you', 'can', 'bank', 'on', 'me', 'to', 'finish', 'the', 'pr

In [2]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np

# Load the tokenizer and the model from the same 'bert-base-uncased' checkpoint.
# NOTE: this rebinds the notebook-level name `tokenizer` (the first cell bound
# it via AutoTokenizer); code run after this cell sees the BertTokenizer instance.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get contextual embedding for a specific word
def get_word_embedding(sentence, word):
    """Return the contextual embedding of ``word`` as it occurs in ``sentence``.

    The sentence is run through the notebook-level ``tokenizer`` and ``model``.
    ``word`` is tokenized on its own, and the first place where its complete
    token sequence appears in the sentence's tokens is located. The rows of the
    last hidden state at that span are averaged into a single vector.

    For a word that the tokenizer keeps as one token (e.g. 'bank'), the span
    has length one and the result is exactly that token's hidden-state row.

    Args:
        sentence: Text that contains ``word``.
        word: The word whose in-context embedding is wanted.

    Returns:
        A 1-D NumPy array taken from the model's last hidden state.

    Raises:
        ValueError: If ``word`` yields no tokens, or if its token sequence does
            not occur in the tokenized sentence.
    """
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors='pt')

    # Get model output (inference only, so no gradients are tracked)
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the last hidden state for the single sentence in the batch
    last_hidden_state = outputs.last_hidden_state[0]

    # Tokenize the target word the same way the sentence was tokenized
    word_tokens = tokenizer.tokenize(word)
    if not word_tokens:
        raise ValueError(f"word {word!r} produced no tokens")
    input_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

    # Match the WHOLE token sequence, not just its first piece: matching only
    # word_tokens[0] could select an unrelated token that happens to share the
    # first sub-word piece of a multi-piece word.
    span = len(word_tokens)
    for start in range(len(input_tokens) - span + 1):
        if input_tokens[start:start + span] == word_tokens:
            break
    else:
        raise ValueError(
            f"word {word!r} (tokens {word_tokens}) not found in sentence {sentence!r}"
        )

    # Average over the span so multi-piece words are represented by all of
    # their pieces; for a one-token span the mean is that row unchanged.
    word_embedding = last_hidden_state[start:start + span].mean(dim=0).numpy()

    return word_embedding

# Example sentences: the same five strings as the first five entries of the
# `examples` list in the earlier cell, each using the word "bank".
sentences = [
    "I need to go to the bank to deposit some money.",
    "The river bank was overgrown with wildflowers.",
    "The pilot had to bank the airplane to avoid turbulence.",
    "You can bank on me to finish the project on time.",
    "The food bank is collecting donations for the homeless."
]

# One embedding per sentence, in the same order as `sentences`:
# embeddings[k] is get_word_embedding(sentences[k], 'bank').
embeddings = [get_word_embedding(sentence, 'bank') for sentence in sentences]

# Function to compute cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(v1, v2) / (norm(v1) * norm(v2))``.

    Args:
        v1: First vector (array-like accepted by NumPy).
        v2: Second vector, same length as ``v1``.

    Returns:
        The cosine similarity as a scalar. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of dividing by zero
        (which would produce NaN and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # At least one vector is all zeros, so it has no direction to compare.
        return 0.0
    return np.dot(v1, v2) / norm_product

# Compute pairwise similarities.
# `similarities` is a square matrix with one row and one column per embedding;
# entry [i][j] is the cosine similarity between embeddings[i] and embeddings[j].
# Every ordered pair is evaluated, including i == j (an embedding with itself).
similarities = np.zeros((len(embeddings), len(embeddings)))
for i in range(len(embeddings)):
    for j in range(len(embeddings)):
        similarities[i][j] = cosine_similarity(embeddings[i], embeddings[j])

# Print results: for each sentence, its 1-based number, its text, and the first
# five components of the corresponding 'bank' embedding, then a blank line.
for i, sentence in enumerate(sentences):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Embedding for 'bank' (first 5 dimensions): {embeddings[i][:5]}")
    print()

# Print the similarity matrix one row per line. Each value is formatted to four
# decimal places and followed by a tab (so every row ends with a trailing tab);
# the bare print() ends the row with a newline.
print("Cosine Similarities:")
for i in range(len(similarities)):
    for j in range(len(similarities)):
        print(f"{similarities[i][j]:.4f}", end="\t")
    print()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]



Sentence 1: I need to go to the bank to deposit some money.
Embedding for 'bank' (first 5 dimensions): [ 0.64034605 -0.18108568  0.11862013 -0.2020568   1.3855611 ]

Sentence 2: The river bank was overgrown with wildflowers.
Embedding for 'bank' (first 5 dimensions): [-0.17355233 -0.5053988  -0.19030745 -0.51391476 -0.2884438 ]

Sentence 3: The pilot had to bank the airplane to avoid turbulence.
Embedding for 'bank' (first 5 dimensions): [ 0.23946215 -0.21487379 -0.16276157  0.3818572   0.22607988]

Sentence 4: You can bank on me to finish the project on time.
Embedding for 'bank' (first 5 dimensions): [ 1.0760132  -0.6041027   0.3984626  -0.35869265  1.0072378 ]

Sentence 5: The food bank is collecting donations for the homeless.
Embedding for 'bank' (first 5 dimensions): [ 0.21331732 -0.6920833  -0.1706988   0.0951876   1.011391  ]

Cosine Similarities:
1.0000	0.4407	0.3670	0.5065	0.5533	
0.4407	1.0000	0.3886	0.4106	0.4724	
0.3670	0.3886	1.0000	0.4654	0.3232	
0.5065	0.4106	0.4654	1.0