In [None]:
import gensim.downloader as api
from pprint import pprint
# Identifier of the pre-trained embedding set in the gensim-data catalogue.
MODEL_NAME = 'word2vec-google-news-300'

gmodel = api.load(MODEL_NAME) # load pre-trained word2vec model

# Catalogue metadata for the model (description, file name, checksum, ...).
model_info = api.info(MODEL_NAME)
# model_info['file_name'] is only a bare file name, not a location, so ask the
# downloader itself for the on-disk path of the (already fetched) model file.
model_path = api.load(MODEL_NAME, return_path=True)
pprint(model_info)
print(f'Model path is = {model_path}')


In [None]:

# Classic word2vec analogy: king - man + woman.
result = gmodel.most_similar(
    negative=['man'],
    positive=['woman', 'king'],
)
print(result[0]) # only the single best match


In [None]:
# Capital -> country analogy: Tokyo - Paris + France.
# The Google News vocabulary is case-sensitive, so all three proper nouns are
# written in the same capitalised form as 'Tokyo' instead of mixing in the
# lower-case 'france' / 'paris' entries.
result = gmodel.most_similar(positive=['Tokyo', 'France'], negative=['Paris'])
pprint(result[:5]) # show the five closest matches

In [None]:
# Query the embedding space for rome + paris - italy.
# NOTE(review): a capital/country analogy would normally be written as
# italy + paris - rome (expecting something like "france") — confirm that
# the combination below is the one intended.
result = gmodel.most_similar(positive=['rome', 'paris'], negative=['italy']) # nearest words to rome + paris - italy
pprint(result[:5]) # show the five closest matches

The idea now is that if we take a `wolf` (a feral dog), remove the `dog` part, and add a `cat` part, we *will end up with some sort of feral cat*.

In [None]:
# Analogy query: wolf - dog + cat.
result = gmodel.most_similar(
    negative=['dog'],
    positive=['cat', 'wolf'],
)
pprint(result[:5]) # the five closest matches

In [None]:
# Pure addition this time: weapon + wood, with nothing subtracted.
result = gmodel.most_similar(
    negative=[],
    positive=['weapon', 'wood'],
)
pprint(result[:5]) # the five closest matches

In [None]:
import torch
from pprint import pprint
from tqdm import tqdm 
from transformers import AutoTokenizer, AutoModel

# Report whether the Apple-Metal (MPS) backend can be used on this machine.
# These two prints are informational only; they do not select the device.
print(torch.backends.mps.is_available())

# this ensures that the current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

# Prefer CUDA when it is present, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device == 'cpu':
    if torch.backends.mps.is_available():
        # MPS is deliberately left disabled here, so report what is really
        # being used instead of claiming MPS is active.
        # To opt in, set: device = 'mps'
        print("MPS is available but not enabled - using CPU")
    else:
        print("Using CPU")
else:
    print("Using CUDA")

# Load model and tokenizer
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

def cosine_similarity(a, b):
    """Return the cosine similarity between two tensors as a plain Python float.

    Delegates to ``torch.nn.functional.cosine_similarity`` with its default
    arguments and unwraps the result with ``.item()``, so the inputs must
    yield exactly one similarity value (e.g. two tensors of shape ``(1, dim)``).
    """
    score = torch.nn.functional.cosine_similarity(a, b)
    return score.item()


In [None]:
import numpy as np
import requests

# Download the English word list (one word per line), keep a local copy in
# words.txt, then embed every word with the transformer model in batches.

url = 'https://raw.githubusercontent.com/dwyl/english-words/master/words.txt'
response = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of saving the error page as the word list.
response.raise_for_status()

with open('words.txt', 'w', encoding='utf-8') as f:
    f.write(response.text)

with open("words.txt", "r", encoding='utf-8') as f:
    words = f.readlines()

# Strip line endings and drop blank lines so every entry is an actual word.
words = [word.strip() for word in words if word.strip()]

# One 1-D CPU tensor per entry of `words`, in the same order.
word_embeddings = []

# Create batches
BATCH_SIZE = 512
num_batches = int(np.ceil(len(words) / BATCH_SIZE))

# Inference only: without no_grad() every forward pass would also build an
# autograd graph that is never used.
with torch.no_grad():
    for i in tqdm(range(num_batches)):
        batch = words[i*BATCH_SIZE: (i+1)*BATCH_SIZE]

        # Tokenizing in batch
        inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=32)
        inputs = inputs.to(device)
        # Passing through the model
        outputs = model(**inputs).last_hidden_state

        # Average over real tokens only. Padding positions are masked out so
        # that a word's embedding does not depend on how long the other words
        # in its batch happen to be.
        mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.dtype)
        pooled = (outputs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

        # Move each batch to the CPU right away so the accelerator never has
        # to hold the embeddings of the whole vocabulary.
        word_embeddings.extend(pooled.cpu().unbind(0))

        del inputs, outputs, mask, pooled

# Release cached accelerator memory once, after the last batch.
torch.cuda.empty_cache()

pprint (word_embeddings[0])

In [None]:

# Compute analogy vector
# get embeddings
query_words = ["woman", "king", "man"]
inputs = tokenizer(query_words, return_tensors="pt", padding=True, truncation=True)
inputs = inputs.to(device) # move to GPU if available

# Inference only, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(**inputs).last_hidden_state

# Average the token embeddings of each word, ignoring padding positions.
mask = inputs["attention_mask"].unsqueeze(-1).to(outputs.dtype)
pooled = (outputs * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

woman = pooled[0] # averaged token embeddings of "woman"
king = pooled[1]  # averaged token embeddings of "king"
man = pooled[2]   # averaged token embeddings of "man"

del inputs
torch.cuda.empty_cache()
# compute analogy vector
analogy_vector = woman + king - man
pprint(f"analogy_vcetor device = {analogy_vector.device}")
# Tensor.to() returns a new tensor rather than moving in place, so the result
# has to be kept; otherwise the comparison against the CPU word embeddings
# below fails whenever the model ran on another device.
analogy_vector = analogy_vector.to("cpu")

# Find most similar word
similarities = {}
query = analogy_vector.unsqueeze(0)  # loop-invariant, computed once
for word, embedding in zip(words, word_embeddings):
    similarities[word] = cosine_similarity(query, embedding.unsqueeze(0))

# Leave the query words themselves out of the ranking (gensim's most_similar
# does the same); otherwise the top hit is typically one of the inputs.
excluded = set(query_words)
sorted_similarities = sorted(
    (item for item in similarities.items() if item[0] not in excluded),
    key=lambda x: x[1],
    reverse=True,
)
print(sorted_similarities[0])  # the most similar word and its similarity score