In [1]:
import gensim.downloader as api

# Fetch the pre-trained Google News word2vec vectors through gensim's
# downloader and bind them to `model`, which the following cells query.
# Heads-up: this is a large download (~1.6GB), so the cell can be slow to run.
model = api.load('word2vec-google-news-300')

In [2]:
# Raw embedding vector stored for the token 'king'.
print(model['king'])
# Nearest neighbours of 'king' together with their similarity scores. This is
# a plain similarity lookup, not an analogy: an analogy query passes explicit
# positive/negative word lists to most_similar instead of a single word.
print(model.most_similar('king'))

[ 1.25976562e-01  2.97851562e-02  8.60595703e-03  1.39648438e-01
 -2.56347656e-02 -3.61328125e-02  1.11816406e-01 -1.98242188e-01
  5.12695312e-02  3.63281250e-01 -2.42187500e-01 -3.02734375e-01
 -1.77734375e-01 -2.49023438e-02 -1.67968750e-01 -1.69921875e-01
  3.46679688e-02  5.21850586e-03  4.63867188e-02  1.28906250e-01
  1.36718750e-01  1.12792969e-01  5.95703125e-02  1.36718750e-01
  1.01074219e-01 -1.76757812e-01 -2.51953125e-01  5.98144531e-02
  3.41796875e-01 -3.11279297e-02  1.04492188e-01  6.17675781e-02
  1.24511719e-01  4.00390625e-01 -3.22265625e-01  8.39843750e-02
  3.90625000e-02  5.85937500e-03  7.03125000e-02  1.72851562e-01
  1.38671875e-01 -2.31445312e-01  2.83203125e-01  1.42578125e-01
  3.41796875e-01 -2.39257812e-02 -1.09863281e-01  3.32031250e-02
 -5.46875000e-02  1.53198242e-02 -1.62109375e-01  1.58203125e-01
 -2.59765625e-01  2.01416016e-02 -1.63085938e-01  1.35803223e-03
 -1.44531250e-01 -5.68847656e-02  4.29687500e-02 -2.46582031e-02
  1.85546875e-01  4.47265

In [3]:
from gensim.models import KeyedVectors

# Alternative: load the pre-trained vectors from a local copy of the binary
# file. Left disabled because `model` has probably been loaded already.
# model_path = "/content/GoogleNews-vectors-negative300.bin.gz"
# model = KeyedVectors.load_word2vec_format(model_path, binary=True)

# 1. Most similar words: neighbours of 'dog' as (word, similarity) pairs.
print("üìç Most similar to 'dog':")
print(model.most_similar('dog'))

# 2. Word analogy: king - man + woman = ?
#    Words in `positive` are added and words in `negative` subtracted;
#    topn=1 keeps only the single best match.
print("\nüëë Word analogy: king - man + woman = ?")
print(model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))

# 3. Word similarity: one similarity score for the pair of words.
print("\nüìè Similarity between 'coffee' and 'tea':")
print(model.similarity('coffee', 'tea'))

# 4. Check if a word exists: membership test on the vocabulary's
#    key -> index mapping, which prints True or False.
print("\nüîé Is 'dragon' in vocabulary?")
print('dragon' in model.key_to_index)

üìç Most similar to 'dog':
[('dogs', 0.8680489659309387), ('puppy', 0.8106428384780884), ('pit_bull', 0.780396044254303), ('pooch', 0.7627376914024353), ('cat', 0.7609457969665527), ('golden_retriever', 0.7500901818275452), ('German_shepherd', 0.7465174198150635), ('Rottweiler', 0.7437615394592285), ('beagle', 0.7418621778488159), ('pup', 0.740691065788269)]

üëë Word analogy: king - man + woman = ?
[('queen', 0.7118193507194519)]

üìè Similarity between 'coffee' and 'tea':
0.5635292

üîé Is 'dragon' in vocabulary?
True


In [5]:
import numpy as np
from gensim.models import KeyedVectors

# # Load Google News vectors
# model_path = "/content/GoogleNews-vectors-negative300.bin.gz"
# model = KeyedVectors.load_word2vec_format(model_path, binary=True)

def avg_word2vec(sentence: str, model, lowercase: bool = True):
    """Embed a sentence as the mean of its in-vocabulary word vectors.

    The sentence is split on whitespace and every token that ``model`` does
    not contain is ignored.

    Args:
        sentence: Text to embed.
        model: Word-vector lookup supporting ``word in model``,
            ``model[word]`` and a ``vector_size`` attribute.
        lowercase: When True (the default, which keeps the original
            behaviour) the sentence is lower-cased before lookup. Pass False
            for vocabularies whose keys are case-sensitive (for example
            ``'German_shepherd'``), since lower-casing makes such keys
            impossible to match.

    Returns:
        The element-wise mean of the matched vectors, or a zero vector of
        length ``model.vector_size`` when no token matched.
    """
    text = sentence.lower() if lowercase else sentence
    valid_vectors = [model[word] for word in text.split() if word in model]

    if not valid_vectors:
        # Empty sentence or every token out of vocabulary: return a neutral
        # vector instead of averaging an empty list (which would yield NaN).
        return np.zeros(model.vector_size)

    return np.mean(valid_vectors, axis=0)

# Demo: embed one sentence with avg_word2vec and inspect the result.
sentence = "I love machine learning"
vector = avg_word2vec(sentence, model)

# Show the vector's shape and a short slice rather than dumping every value.
print("üî¢ Sentence Vector (shape):", vector.shape)
print("üìà First 5 dimensions:", vector[:5])

üî¢ Sentence Vector (shape): (300,)
üìà First 5 dimensions: [ 0.01123047 -0.01138306  0.02069092  0.14361572 -0.03967285]


In [1]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer.
# NOTE: this rebinds the global `model`. Any cell that expects the word2vec
# vectors under that name must have them reloaded before it is re-run.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

word_pairs = [('learn', 'learning'), ('india', 'indian'), ('fame', 'famous')]

# Compute similarity for each pair of words
for pair in word_pairs:
    # Encode the two words as one batch of two sequences. padding=True is
    # needed so that words which split into different numbers of WordPiece
    # tokens can still be stacked into a single tensor; without it the call
    # only succeeds when both words happen to tokenize to the same length.
    # The attention mask returned alongside keeps padding out of the encoding.
    tokens = tokenizer(list(pair), padding=True, return_tensors='pt')
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**tokens)

    # Extract embeddings for the [CLS] token (position 0 of each sequence),
    # giving one row per word.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    similarity = torch.nn.functional.cosine_similarity(cls_embedding[0], cls_embedding[1], dim=0)
    print(f"Similarity between '{pair[0]}' and '{pair[1]}' using BERT: {similarity.item():.3f}")

Similarity between 'learn' and 'learning' using BERT: 0.930
Similarity between 'india' and 'indian' using BERT: 0.957
Similarity between 'fame' and 'famous' using BERT: 0.956
