In [1]:
# !pip install datasets
import torch    
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm
from word2vec.data_setup import load_cbow_data, create_data_loaders
from word2vec.model import CBOW
from word2vec.utils import save_model

In [2]:
# hyperparameters
vocab_size = 1000     # vocabulary size requested from load_cbow_data
embedding_dim = 100   # width of each word vector (also baked into the saved file name)
context_size = 5      # context window parameter forwarded to load_cbow_data

epochs = 5
batch_size = 32

# reproducibility: seed torch's global RNG before any model or data loader is created,
# so that a fresh "Restart & Run All" produces the same initial weights.
# NOTE(review): randomness drawn outside torch (e.g. inside the data pipeline) is not
# covered by this seed — confirm whether word2vec.data_setup needs its own.
seed = 42
torch.manual_seed(seed)

# device agnostic code
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
# Build the CBOW dataset, pull out its vocabulary, and wrap the dataset in a batched loader.
# NOTE(review): load_cbow_data / create_data_loaders come from word2vec.data_setup; what
# amount_of_articles=200 selects and whether batches are shuffled is not visible here — confirm there.
dataset = load_cbow_data(vocab_size, context_size, amount_of_articles=200)
# `vocab` is indexed by word to obtain an integer id in the cells below.
vocab = dataset.vocab
train_dataloader = create_data_loaders(dataset, batch_size=batch_size)
# Dataset length — presumably the number of training samples; verify against the dataset class.
len(dataset)

[nltk_data] Downloading package punkt to /Users/aspisov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Found cached dataset wikipedia (/Users/aspisov/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/d41137e149b2ea90eead07e7e3f805119a8c22dd1d5b61651af8e3e3ee736001)


  0%|          | 0/1 [00:00<?, ?it/s]

146901

In [4]:
# Hierarchical softmax (Huffman tree) is disabled in this run: build_tree is not imported
# in the import cell, and training below uses CrossEntropyLoss on the model's output.
# root = build_tree(vocab)

# Size the model from the vocabulary that was actually built (len(vocab)) rather than
# from the requested vocab_size hyperparameter.
model = CBOW(vocab_size=len(vocab), embedding_dim=embedding_dim).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Applied to the model's output (`logits`) and the targets `y` in the training loop.
criterion = nn.CrossEntropyLoss()

number of parameters: 0.20M


In [5]:
# training
for epoch in tqdm(range(epochs)):
    model.train()
    # Running sum of per-batch losses, kept as a plain Python float. Adding the loss
    # *tensors* together would accumulate autograd history across the whole epoch
    # (each sum keeps a reference to every batch's graph) and steadily grow memory.
    train_loss = 0.0
    for X, y in train_dataloader:
        X, y = X.to(device), y.to(device)

        optimizer.zero_grad()

        logits = model(X)
        loss = criterion(logits, y)

        loss.backward()
        optimizer.step()

        # .item() detaches the scalar from the graph before it is accumulated.
        train_loss += loss.item()

    # Mean loss per batch for this epoch.
    train_loss = train_loss / len(train_dataloader)

    print(f"epoch {epoch+1} loss: {train_loss:.2f}")

  0%|          | 0/5 [00:00<?, ?it/s]

epoch 1 loss: 4.53
epoch 2 loss: 4.14
epoch 3 loss: 3.97
epoch 4 loss: 3.85
epoch 5 loss: 3.76


In [6]:
# Save the trained model; the embedding size is part of the file name
# (cbow100.pth for embedding_dim = 100). The target directory is chosen by save_model.
save_model(model, f"cbow{embedding_dim}.pth")

[INFO] Saving model to: Efficient Estimation of Word Representations in Vector Space/models/cbow100.pth


In [7]:
def get_embedding(word, model, vocab):
    """Return the embedding vector of ``word`` as stored in ``model.embeddings``.

    Args:
        word: Token to look up; it is used as a key into ``vocab``.
        model: Object exposing an ``embeddings`` layer with a ``weight`` tensor.
        vocab: Mapping from token to integer index.

    Returns:
        The output of ``model.embeddings`` for a one-element index tensor,
        i.e. a tensor with a leading dimension of size 1.
    """
    embedding_layer = model.embeddings
    # Build the index on the same device as the embedding table so the lookup
    # works regardless of where the model lives.
    index = torch.tensor([vocab[word]], device=embedding_layer.weight.device)
    return embedding_layer(index)

# Show the vocabulary indices of the four words used in the analogy test below.
tuple(vocab[word] for word in ('men', 'women', 'king', 'queen'))

(563, 446, 220, 545)

In [8]:
# Look up the four analogy words in one pass; order matches the names on the left.
king_emb, men_emb, women_emb, queen_emb = (
    get_embedding(word, model, vocab)
    for word in ("king", "men", "women", "queen")
)

In [9]:
# Word-analogy probe: if the embeddings encode the relation, king - men + women
# should point in roughly the same direction as "queen".
result_emb = king_emb - men_emb + women_emb
# Cosine similarity between the analogy vector and the "queen" embedding, as a Python float.
F.cosine_similarity(result_emb, queen_emb).item()

0.006044892594218254

In [10]:
# Reverse lookup table: integer index -> word (the inverse of `vocab`).
inverse_vocab = {v: k for k, v in vocab.items()}

def find_closest_word(embedding, model, vocab, exclude_words):
    """Return the vocabulary word whose embedding is most cosine-similar to ``embedding``.

    The index -> word lookup is derived from the ``vocab`` argument itself, so the
    function does not depend on any module-level table and stays correct when it is
    called with a different vocabulary.

    Args:
        embedding: Query vector; it is compared against every row of
            ``model.embeddings.weight`` (the notebook passes the ``(1, dim)``
            result of ``get_embedding`` arithmetic).
        model: Object exposing an ``embeddings`` layer with a ``weight`` tensor.
        vocab: Mapping from word to integer index; must support ``vocab[word]``
            and ``vocab.items()``.
        exclude_words: Words that must not be returned (typically the query words).

    Returns:
        The word with the highest cosine similarity among the non-excluded entries.

    Raises:
        KeyError: If a word in ``exclude_words`` is not in ``vocab``.
    """
    index_to_word = {idx: word for word, idx in vocab.items()}
    exclude_indices = [vocab[word] for word in exclude_words]

    # Pure inference: no autograd graph is needed for the similarity scores, and the
    # in-place masking below should not be tracked.
    with torch.no_grad():
        similarities = F.cosine_similarity(embedding, model.embeddings.weight)

        # Push excluded words to -inf so argmax can never select them.
        if exclude_indices:
            banned = torch.tensor(
                exclude_indices, dtype=torch.long, device=similarities.device
            )
            similarities[banned] = float("-inf")

        closest_idx = similarities.argmax().item()

    return index_to_word[closest_idx]

# Exclude the three query words so the nearest neighbour is not trivially one of the inputs.
closest_word = find_closest_word(result_emb, model, vocab, exclude_words=['king', 'men', 'women'])
print("Closest word to 'king - men + women' is:", closest_word)

Closest word to 'king - men + women' is: pint
