In [1]:
import gensim.downloader as api
from pprint import pprint
gmodel = api.load('word2vec-google-news-300') # load pre-trained word2vec model

model_info = api.info('word2vec-google-news-300')
model_path = model_info['file_name']
pprint(model_info)
print(f'Model path is = {model_path}')


{'base_dataset': 'Google News (about 100 billion words)',
 'checksum': 'a5e5354d40acb95f9ec66d5977d140ef',
 'description': 'Pre-trained vectors trained on a part of the Google News '
                'dataset (about 100 billion words). The model contains '
                '300-dimensional vectors for 3 million words and phrases. The '
                'phrases were obtained using a simple data-driven approach '
                "described in 'Distributed Representations of Words and "
                "Phrases and their Compositionality' "
                '(https://code.google.com/archive/p/word2vec/).',
 'file_name': 'word2vec-google-news-300.gz',
 'file_size': 1743563840,
 'license': 'not found',
 'num_records': 3000000,
 'parameters': {'dimension': 300},
 'parts': 1,
 'read_more': ['https://code.google.com/archive/p/word2vec/',
               'https://arxiv.org/abs/1301.3781',
               'https://arxiv.org/abs/1310.4546',
               'https://www.microsoft.com/en-us/research/publ

In [2]:

result = gmodel.most_similar(positive=['woman', 'king'], negative=['man']) # find the most similar word to woman + king - man
pprint(result[:5]) # print the first result


[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581)]


In [3]:
result = gmodel.most_similar(positive=['Tokyo', 'france'], negative=['paris']) # find the most similar word to france - paris + Tokyo
pprint(result[:5]) # print the first result

[('Japan', 0.6101208925247192),
 ('Osaka', 0.5625056624412537),
 ('Japanese', 0.5529288649559021),
 ('Nagoya', 0.552433431148529),
 ('Seoul', 0.5309796929359436)]


In [4]:
result = gmodel.most_similar(positive=['rome', 'france'], negative=['paris']) 
pprint(result[:5]) # print the first result

[('italy', 0.519952118396759),
 ('european', 0.5075845718383789),
 ('italian', 0.5057743191719055),
 ('epl', 0.490744411945343),
 ('spain', 0.4888668656349182)]


Now the idea is that taking a `wolf` (feral dog) removing the `dog` part and add a `cat` part we *will end with some sort of feral cat*.

In [5]:
result = gmodel.most_similar(positive=['cat', 'wolf'], negative=['dog']) # find the most similar word to woman + king - man
pprint(result[:5]) # print the first result

[('wolves', 0.6855551600456238),
 ('lynx', 0.6411743760108948),
 ('gray_wolf', 0.6279042363166809),
 ('gray_wolves', 0.5749408006668091),
 ('wolverine', 0.5593723058700562)]


In [6]:
result = gmodel.most_similar(positive=['weapon', 'wood'], negative=[]) # find the most similar word to woman + king - man
pprint(result[:5]) # print the first result

[('splitting_maul', 0.631394624710083),
 ('double_barreled_shotgun', 0.5669865608215332),
 ('weapons', 0.5578944683074951),
 ('muzzle_loading_rifle', 0.5393639802932739),
 ('jacketed_bullets', 0.538790225982666)]


In [8]:
import torch
from pprint import pprint
from tqdm import tqdm 
from transformers import AutoTokenizer, AutoModel

print(torch.backends.mps.is_available())

# this ensures that the current current PyTorch installation was built with MPS activated.
print(torch.backends.mps.is_built())

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if (device == 'cpu'):
    # ok we are using CPU but we could use apple metal instead
    if torch.backends.mps.is_available():
        print ("Using MPS")
        # device = torch.device('mps')  # use M1 chip if available
    else:
        print ("Using CPU")

print(f'Using device: {device}')


False
False
Using device: cuda


In [9]:


# Load model and tokenizer
model_name = "sentence-transformers/distiluse-base-multilingual-cased-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

def cosine_similarity(a, b):
    return torch.nn.functional.cosine_similarity(a, b).item()


In [10]:
import numpy as np
import requests

# Load word list (assuming you've saved it as a newline-separated text file)

url = 'https://raw.githubusercontent.com/dwyl/english-words/master/words.txt'
response = requests.get(url)

with open('words.txt', 'w') as f:
    f.write(response.text)

with open("words.txt", "r") as f:
    words = f.readlines()
    
words = [word.strip() for word in words]
pprint(f'Number of words: {len(words)}')

word_embeddings = []


'Number of words: 466550'


In [11]:

# Create batches
BATCH_SIZE = 512
num_batches = int(np.ceil(len(words) / BATCH_SIZE))

for i in tqdm(range(num_batches)):
    batch = words[i*BATCH_SIZE: (i+1)*BATCH_SIZE]
    
    # Tokenizing in batch
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=32)
    inputs.to(device)
    # Passing through the model
    outputs = model(**inputs).last_hidden_state
    
    # Extract embeddings for each word in the batch
    for j in range(len(batch)):
        word_embedding = outputs[j].mean(dim=0).detach()
        word_embeddings.append(word_embedding)
    
    del inputs, outputs
    torch.cuda.empty_cache()

# Move embeddings back to CPU if needed
word_embeddings = [emb.cpu() for emb in word_embeddings]


100%|██████████| 912/912 [01:01<00:00, 14.85it/s]


In [12]:

pprint (word_embeddings[0])

tensor([ 0.0517, -0.0343,  0.0778,  0.0019,  0.0024, -0.0077,  0.0327,  0.0210,
        -0.0436,  0.0353, -0.0041,  0.1117, -0.0267,  0.0047,  0.0114, -0.0065,
         0.0367,  0.0344,  0.0537, -0.0209,  0.0631,  0.0037,  0.0315, -0.0138,
         0.0600,  0.0095,  0.0313,  0.1338,  0.0926, -0.0573, -0.0919,  0.0053,
        -0.0204,  0.0067,  0.0179,  0.1597, -0.0904,  0.0343,  0.0243,  0.0141,
        -0.0225, -0.0650,  0.0827, -0.0232, -0.0026, -0.0793, -0.0171, -0.0397,
         0.0730,  0.0091,  0.0205, -0.0574, -0.0126,  0.0094, -0.0509, -0.0643,
        -0.0481,  0.0694,  0.0808,  0.0363,  0.0066,  0.0393,  0.0196,  0.0173,
        -0.1359,  0.0173,  0.0638,  0.0540,  0.0068, -0.0518, -0.0224,  0.0717,
         0.0062, -0.0082,  0.0596,  0.0338, -0.0106,  0.0206,  0.0546, -0.0179,
         0.0127, -0.0339,  0.0689, -0.0053,  0.0164, -0.1020,  0.0288, -0.0183,
         0.0498,  0.1885,  0.0189, -0.0025, -0.0225, -0.0240, -0.0098,  0.0206,
         0.0132, -0.0240,  0.1074, -0.02

In [13]:
from pprint import pprint
def find_similar_words(source, subtract, add):

    # Compute analogy vector
    # get embeddings
    inputs = tokenizer([source, subtract, add], return_tensors="pt", padding=True, truncation=True)
    inputs.to(device) # move to GPU if available
    outputs = model(**inputs).last_hidden_state

    source_tensor = outputs[0].mean(dim=0).detach() # average the embeddings for tokens of "woman"
    subtract_tensor = outputs[1].mean(dim=0).detach()  # average the embeddings for tokens of "king"
    add_tensor = outputs[2].mean(dim=0) .detach()  # average the embeddings for tokens of "man"

    del inputs, outputs
    torch.cuda.empty_cache()

    # compute analogy vector
    analogy_vector = source_tensor - subtract_tensor + add_tensor
    # pprint(f"analogy_vector device = {analogy_vector.device}")
    analogy_vector = analogy_vector.to("cpu")
    # pprint(f"analogy_vector device = {analogy_vector.device}")

    # Find most similar word
    similarities = {}

    pprint(f"Brute forcing on a total of {len(words)} words")

    # Assuming word_embeddings is a 2D tensor where each row is the embedding of a word
    word_embeddings_tensor = torch.stack(word_embeddings)

    # Unsqueeze analogy_vector to match the dimensions of word_embeddings_tensor
    analogy_vector_unsqueezed = analogy_vector.unsqueeze(0)

    # Calculate cosine similarity for all words at once
    similarities = torch.nn.functional.cosine_similarity(analogy_vector_unsqueezed, word_embeddings_tensor).tolist()

    word_similarities = list(zip(words, similarities))

    sorted_similarities = sorted(word_similarities, key=lambda x: x[1], reverse=True)
    pprint(sorted_similarities[:10])  # This should print the most similar word and its similarity score.

In [14]:
find_similar_words("king", "man", "woman")

'Brute forcing on a total of 466550 words'
[('Queen', 0.6610273122787476),
 ('reginas', 0.6440814733505249),
 ('Queena', 0.6367193460464478),
 ('king-ridden', 0.6314623951911926),
 ('queenhood', 0.6295172572135925),
 ('king-whiting', 0.6291139125823975),
 ('reginae', 0.6278669238090515),
 ('rei', 0.6261332035064697),
 ('kral', 0.6235274076461792),
 ("queen's", 0.6195412874221802)]


In [15]:
find_similar_words("France", "Paris", "Tokyo")

'Brute forcing on a total of 466550 words'
[('Japan', 0.677340030670166),
 ('Tokio', 0.6668616533279419),
 ('Japanee', 0.6527673006057739),
 ('Japans', 0.6449138522148132),
 ('Tokyo', 0.6312811970710754),
 ('France', 0.6312498450279236),
 ('Japanesy', 0.6257008910179138),
 ('Nippon', 0.6213622093200684),
 ('Tokyoite', 0.6124879717826843),
 ('Japanesque', 0.6107927560806274)]
