In [1]:
# !pip install datasets
import torch    
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm
from word2vec.data_setup import load_skipgram_data, create_data_loaders
from word2vec.model import SkipGramNeg, NegLoss
from word2vec.utils import save_model

In [2]:
# Hyperparameters: the values a reader is most likely to tune live in this cell.
vocab_size = 500      # passed to load_skipgram_data and to SkipGramNeg
embedding_dim = 50    # passed to SkipGramNeg; also part of the checkpoint filename
context_size = 3      # passed to load_skipgram_data

epochs = 3
batch_size = 32

# Seed PyTorch's global random number generator so that torch-based random
# draws (for example the torch.multinomial call used for negative sampling)
# repeat on a fresh Restart & Run All.
# NOTE(review): whether the project data loader shuffles through torch's RNG
# is not visible from this notebook -- confirm if exact batch order matters.
seed = 42
torch.manual_seed(seed)

# Device-agnostic code: use CUDA when it is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
# Load the skip-gram dataset together with the noise distribution that the
# training loop later hands to generate_negative_samples.
dataset, noise_dist = load_skipgram_data(
    vocab_size,
    context_size,
    amount_of_articles=100,
)

# Wrap the dataset in a loader that yields batches of `batch_size` items.
train_dataloader = create_data_loaders(
    dataset,
    batch_size=batch_size,
)

# Displayed as the cell output: the number of items in the dataset.
len(dataset)

[nltk_data] Downloading package punkt to /Users/aspisov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Found cached dataset wikipedia (/Users/aspisov/.cache/huggingface/datasets/wikipedia/20220301.simple/2.0.0/d41137e149b2ea90eead07e7e3f805119a8c22dd1d5b61651af8e3e3ee736001)


  0%|          | 0/1 [00:00<?, ?it/s]

474666

In [4]:
# Instantiate the skip-gram model, then move its parameters to the chosen device.
model = SkipGramNeg(vocab_size, embedding_dim)
model = model.to(device)

# Adam over all model parameters with a fixed learning rate.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Loss object; the training loop calls it with the center, context and noise embeddings.
loss_function = NegLoss()

# Show the module structure as the cell output.
model

number of parameters: 0.05M


SkipGramNeg(
  (in_embed): Embedding(500, 50)
  (out_embed): Embedding(500, 50)
)

In [5]:
def generate_negative_samples(n_samples, noise_dist, batch_size=batch_size):
    """Draw noise-word indices for negative sampling.

    Samples ``batch_size * n_samples`` indices from ``noise_dist`` with
    replacement and arranges them as one row per batch element.

    Args:
        n_samples: Number of noise indices to draw for each batch element.
        noise_dist: Tensor of probabilities handed to ``torch.multinomial``.
            Assumes a 1-D tensor -- TODO confirm against load_skipgram_data.
        batch_size: Number of rows in the result. The default is the value
            the module-level ``batch_size`` had when this function was defined.

    Returns:
        Tensor of sampled indices viewed as ``(batch_size, n_samples)``.
    """
    total_draws = batch_size * n_samples
    # Sampling with replacement lets total_draws exceed the number of entries
    # in noise_dist.
    flat_indices = torch.multinomial(noise_dist, total_draws, replacement=True)
    return flat_indices.view(batch_size, n_samples)

In [6]:
# training
for epoch in tqdm(range(epochs)):
    model.train()
    # Running sum of per-batch losses, kept as a plain Python float so that no
    # autograd history is carried from one batch to the next.
    train_loss = 0.0
    for target, context in train_dataloader:
        # Size the noise sample from the actual batch (target.shape[0]) rather
        # than the configured batch_size; 5 noise words are drawn per row.
        negative_samples = generate_negative_samples(n_samples=5, noise_dist=noise_dist, batch_size=target.shape[0])
        target, context, negative_samples = target.to(device), context.to(device), negative_samples.to(device)

        # Center words go through forward_center; context and noise words both
        # go through forward_context.
        embedded_center = model.forward_center(target)
        embedded_context = model.forward_context(context)
        embedded_noise = model.forward_context(negative_samples)

        loss = loss_function(embedded_center, embedded_context, embedded_noise)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # .item() extracts the scalar value. Adding the tensor itself would
        # chain every batch's graph onto train_loss for the whole epoch.
        train_loss += loss.item()

    # Mean loss per batch for this epoch.
    print(f"epoch {epoch+1} loss: {train_loss / len(train_dataloader):.2f}")

  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Name the checkpoint after the embedding size, then hand the model to the
# project's save_model helper.
checkpoint_name = f"skipgram{embedding_dim}.pth"
save_model(model, checkpoint_name)

[INFO] Saving model to: models/skipgram50.pth


In [8]:
# Disabled visualisation cell. Everything below is a single triple-quoted
# string literal, so none of it executes; the notebook simply echoes the string
# as the cell output. The text inside is a t-SNE scatter plot of the first
# `viz_words` rows of model.in_embed.weight, annotated via dataset.idx_to_word.
# NOTE(review): running it would need matplotlib and scikit-learn, which this
# notebook does not import; the %-magics are only valid as real cell code.
'''
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# getting embeddings from the embedding layer of our model, by name
embeddings = model.in_embed.weight.to('cpu').data.numpy()

viz_words = 50
tsne = TSNE()
embed_tsne = tsne.fit_transform(embeddings[:viz_words, :])

fig, ax = plt.subplots(figsize=(16, 16))
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(dataset.idx_to_word[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
'''

"\n%matplotlib inline\n%config InlineBackend.figure_format = 'retina'\n\nimport matplotlib.pyplot as plt\nfrom sklearn.manifold import TSNE\n\n# getting embeddings from the embedding layer of our model, by name\nembeddings = model.in_embed.weight.to('cpu').data.numpy()\n\nviz_words = 50\ntsne = TSNE()\nembed_tsne = tsne.fit_transform(embeddings[:viz_words, :])\n\nfig, ax = plt.subplots(figsize=(16, 16))\nfor idx in range(viz_words):\n    plt.scatter(*embed_tsne[idx, :], color='steelblue')\n    plt.annotate(dataset.idx_to_word[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)\n"