In [2]:
# Fetch the GoogleNews word2vec archive into the working directory
# (-L follows redirects, -O saves under the remote file name).
# NOTE(review): the recorded output for this cell reports only 135 bytes
# transferred, far too small for a word-vector archive — presumably a
# pointer/placeholder file was saved instead of the model; verify the
# downloaded file before relying on it.
! curl -LO https://raw.githubusercontent.com/mmihaltz/word2vec-GoogleNews-vectors/master/GoogleNews-vectors-negative300.bin.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   135  100   135    0     0    365      0 --:--:-- --:--:-- --:--:--   365


In [6]:
import numpy as np
from gensim.models import KeyedVectors
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK 'punkt' resource if it is not already installed
nltk.download("punkt")

# Read the pre-trained Word2Vec vectors from the gzipped binary file.
# The file has to be present in the working directory beforehand; one way to
# obtain it:
#   !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
model = KeyedVectors.load_word2vec_format(
    "GoogleNews-vectors-negative300.bin.gz",
    binary=True,
)

# Look up the embedding of a word as it occurs in a sentence
def get_word_embedding(sentence, target_word):
    """Return the vector `model` stores for `target_word`, if it occurs in `sentence`.

    Both the sentence and the target word are lowercased before matching, and
    the sentence is split with `word_tokenize`.

    Args:
        sentence: Text to search.
        target_word: Word whose vector is wanted.

    Returns:
        The vector stored in `model` for the lowercased token, or None when
        the word is not among the sentence's tokens or is missing from the
        model's vocabulary (a message is printed in either case).
    """
    lowered_tokens = word_tokenize(sentence.lower())
    needle = target_word.lower()

    # Guard: the word has to appear as a token of its own
    if needle not in lowered_tokens:
        print(f"'{target_word}' not found in the sentence.")
        return None

    matched_token = lowered_tokens[lowered_tokens.index(needle)]

    # The lookup depends only on the token itself, not on the other words
    # in the sentence.
    try:
        return model[matched_token]
    except KeyError:
        print(f"'{matched_token}' not in vocabulary.")
        return None

# Example sentences showing several uses of the word 'bank'
sentences = [
    "I need to go to the bank to deposit some money.",
    "The river bank was overgrown with wildflowers.",
    "The pilot had to bank the airplane to avoid turbulence.",
    "You can bank on me to finish the project on time.",
    "The food bank is collecting donations for the homeless.",
    "The central bank announced new interest rates to stabilize the economy."
]

# Get embeddings for 'bank' in each context
embeddings = [get_word_embedding(sentence, 'bank') for sentence in sentences]

# Drop every sentence whose lookup returned None, filtering `sentences` and
# `embeddings` together. Removing entries from `embeddings` alone would shift
# its indices, so embeddings[i] would no longer belong to sentences[i] in the
# code that later pairs the two lists by position.
found_pairs = [
    (sentence, embedding)
    for sentence, embedding in zip(sentences, embeddings)
    if embedding is not None
]
sentences = [sentence for sentence, _ in found_pairs]
embeddings = [embedding for _, embedding in found_pairs]

# Function to compute cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as `v1`.

    Returns:
        dot(v1, v2) / (||v1|| * ||v2||). When either vector has zero norm the
        ratio is undefined (0/0), so 0.0 is returned instead of NaN.
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    # Guard the division: a zero-length vector has no direction to compare
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

# Compute pairwise cosine similarities between all retrieved embeddings
n_embeddings = len(embeddings)
similarities = np.zeros((n_embeddings, n_embeddings))
for i in range(n_embeddings):
    for j in range(n_embeddings):
        similarities[i][j] = cosine_similarity(embeddings[i], embeddings[j])

# Print each sentence with the first few components of its embedding.
# zip stops at the shorter list, so no index past the end of `embeddings`
# is ever read.
for i, (sentence, embedding) in enumerate(zip(sentences, embeddings)):
    print(f"Sentence {i+1}: {sentence}")
    print(f"Embedding for 'bank' (first 5 dimensions): {embedding[:5]}")
    print()

# Print the similarity matrix, one tab-separated row per embedding
print("Cosine Similarities:")
for row in similarities:
    for value in row:
        print(f"{value:.4f}", end="\t")
    print()

# Find the most similar pair among distinct embeddings (upper triangle only).
# The search starts from -inf rather than 0: cosine similarity can be
# negative, and a 0 start would never select a pair when every similarity
# is <= 0.
max_similarity = float("-inf")
max_pair = None
for i in range(n_embeddings):
    for j in range(i + 1, n_embeddings):
        if similarities[i][j] > max_similarity:
            max_similarity = similarities[i][j]
            max_pair = (i, j)

if max_pair is None:
    # No pair was selected (e.g. fewer than two embeddings): say so instead
    # of reporting a sentence as most similar to itself.
    print("\nNo pair of sentences to compare for 'bank'.")
else:
    print(f"\nMost similar pair of sentences for 'bank':")
    print(f"Sentence {max_pair[0]+1} and Sentence {max_pair[1]+1}")
    print(f"Similarity: {max_similarity:.4f}")
    print(f"\nSentence {max_pair[0]+1}: {sentences[max_pair[0]]}")
    print(f"\nSentence {max_pair[1]+1}: {sentences[max_pair[1]]}")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/grizzlystudio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Sentence 1: I need to go to the bank to deposit some money.
Embedding for 'bank' (first 5 dimensions): [ 0.02197266  0.13476562 -0.05786133  0.05566406  0.09912109]

Sentence 2: The river bank was overgrown with wildflowers.
Embedding for 'bank' (first 5 dimensions): [ 0.02197266  0.13476562 -0.05786133  0.05566406  0.09912109]

Sentence 3: The pilot had to bank the airplane to avoid turbulence.
Embedding for 'bank' (first 5 dimensions): [ 0.02197266  0.13476562 -0.05786133  0.05566406  0.09912109]

Sentence 4: You can bank on me to finish the project on time.
Embedding for 'bank' (first 5 dimensions): [ 0.02197266  0.13476562 -0.05786133  0.05566406  0.09912109]

Sentence 5: The food bank is collecting donations for the homeless.
Embedding for 'bank' (first 5 dimensions): [ 0.02197266  0.13476562 -0.05786133  0.05566406  0.09912109]

Sentence 6: The central bank announced new interest rates to stabilize the economy.
Embedding for 'bank' (first 5 dimensions): [ 0.02197266  0.13476562 -