In [5]:
import gensim.downloader as api
gmodel = api.load('word2vec-google-news-300') # load pre-trained word2vec model
result = gmodel.most_similar(positive=['woman', 'king'], negative=['man']) # find the most similar word to woman + king - man
print(result[0]) # print the first result


('queen', 0.7118192911148071)


In [43]:
import torch
from pprint import pprint
from tqdm import tqdm 
from transformers import AutoTokenizer, AutoModel

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if (device == 'cpu'):
    # ok we are using CPU but we could use apple metal instead
    if torch.backends.mps.is_available():
        print ("Using MPS")
        # device = torch.device('mps')  # use M1 chip if available
    else:
        print ("Using CPU")

# Load model and tokenizer
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)

def cosine_similarity(a, b):
    return torch.nn.functional.cosine_similarity(a, b).item()


Using MPS


In [44]:

# Compute analogy vector
# get embeddings
inputs = tokenizer(["woman", "king", "man"], return_tensors="pt", padding=True, truncation=True)
outputs = model(**inputs).last_hidden_state

woman = outputs[0].mean(dim=0) # average the embeddings for tokens of "woman"
king = outputs[1].mean(dim=0)  # average the embeddings for tokens of "king"
man = outputs[2].mean(dim=0)   # average the embeddings for tokens of "man"

# compute analogy vector
analogy_vector = woman + king - man
pprint(analogy_vector[:10])

tensor([ 1.8528e-02,  1.2465e-01,  1.2447e-01,  1.0001e-02, -6.8873e-02,
         4.6805e-02, -1.0980e-02, -5.6704e-04,  1.2278e-04,  7.0740e-02],
       grad_fn=<SliceBackward0>)


In [48]:
import numpy as np

# Load word list (assuming you've saved it as a newline-separated text file)
with open("/Users/gianmariaricci/Downloads/true.txt", "r") as f:
    words = f.readlines()
words = [word.strip() for word in words]

# # Compute embeddings
# word_embeddings = []
# for word in tqdm(words[:1000]):
#     inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True)
#     output = model(**inputs).last_hidden_state
#     word_embedding = output[0].mean(dim=0)
#     word_embeddings.append(word_embedding)

word_embeddings = []

# Create batches
BATCH_SIZE = 128
num_batches = int(np.ceil(len(words) / BATCH_SIZE))

for i in tqdm(range(num_batches)):
    batch = words[i*BATCH_SIZE: (i+1)*BATCH_SIZE]
    
    # Tokenizing in batch
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=32).to(device)
    
    # Passing through the model
    outputs = model(**inputs).last_hidden_state
    
    # Extract embeddings for each word in the batch
    for j in range(len(batch)):
        word_embedding = outputs[j].mean(dim=0)
        word_embeddings.append(word_embedding)

# Move embeddings back to CPU if needed
word_embeddings = [emb.cpu() for emb in word_embeddings]

pprint (word_embeddings[0])

# Find most similar word
similarities = {}
for word, embedding in zip(words, word_embeddings):
    similarity = cosine_similarity(analogy_vector.unsqueeze(0), embedding.unsqueeze(0))
    similarities[word] = similarity

sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
print(sorted_similarities[0])  # This should print the most similar word and its similarity score.


  2%|▏         | 60/3645 [00:20<20:02,  2.98it/s]


KeyboardInterrupt: 