In [1]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
a = torch.torch.randn((16, 16), dtype=torch.float32)

In [4]:
import numpy as np

with open("./quora.txt", encoding="utf-8") as file:
    data = list(file)

In [5]:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

# print(tokenizer.tokenize(data[50]))

In [6]:
data_tok = [tokenizer.tokenize(x.lower()) for x in data]

In [7]:
vocab = set()
list_vocab = []
for x in data_tok:
    list_vocab += x

list_vocab += ['<PAD>']
vocab = set(list_vocab)

In [8]:
'<PAD>' in set(list_vocab)

True

In [9]:
len(vocab)

87820

In [10]:
padding = "<PAD>"
list(padding)

['<', 'P', 'A', 'D', '>']

In [11]:
word_to_idx = {word: i for i, word in enumerate(vocab)}

In [12]:
from torch.utils.data import DataLoader, Dataset, random_split
import torchvision
from torchvision.transforms import v2
class DataSetVocab(Dataset):
    def __init__(self, data, window_size, vocab, word_to_idx):
        self.window_size = window_size
        self.vocab = vocab
        self.padding = "<PAD>"
        self.data_input = [[self.padding for q in range(self.window_size)] + x + [self.padding for q in range(self.window_size)] for x in data]
        self.data = []
        self.word_to_idx = word_to_idx
        for text in self.data_input:
            for i in range(self.window_size, len(text) - self.window_size):
                new_context = text[i - self.window_size : i + self.window_size + 1]
                self.data.append(torch.tensor([self.word_to_idx[i] for i in new_context], dtype=torch.long))



    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]

dataset = DataSetVocab(data_tok, 2, vocab, word_to_idx)

In [13]:
len(dataset)

7131345

In [14]:
dataset[235]

tensor([ 6826, 25211, 39085, 81019, 12346])

In [15]:
dataloader = DataLoader(dataset=dataset, batch_size=6, shuffle=False)
for i in dataloader:
    print(i)
    break

tensor([[31212, 31212, 38103, 81019, 64680],
        [31212, 38103, 81019, 64680, 41951],
        [38103, 81019, 64680, 41951, 52624],
        [81019, 64680, 41951, 52624, 45904],
        [64680, 41951, 52624, 45904, 28566],
        [41951, 52624, 45904, 28566, 24934]])


In [16]:
class MyWord2Vec(nn.Module):
    def __init__(self, window_size, dim, vocab, n_neg_samples, word_to_idx):
        super().__init__()
        self.window_size = window_size
        self.dim = dim
        self.vocab = vocab
        self.len_vocab = len(vocab)
        self.unk = "<UNK>"
        self.sos = "<BOS>"
        self.eos = "<EOS>"
        self.n_neg_samples = n_neg_samples
        self.vectors = nn.Embedding(self.len_vocab, self.dim)
        self.context = nn.Embedding(self.len_vocab, self.dim)
        self.word_to_idx = word_to_idx

    def forward(self, words):
        # words : [batch_size, self.window_size * 2 + 1]
        batch_size = words.size(0)
        device = self.vectors.weight.device
        centroid_idx = words[:, self.window_size]
        centroid_vec = self.vectors(centroid_idx) # (batch_size, self.dim)
        context_idx = torch.cat([words[:, :self.window_size], words[:, self.window_size + 1:]], dim=1) # (batch_size, self.window_size * 2)
        context_vec = self.context(context_idx) # (batch_size, self.window_size * 2, self.dim)

        # centroid_tens = self.vectors(self.word_to_idx[centroid])
        # losses = []
        pos_dot = torch.bmm(context_vec, centroid_vec.unsqueeze(2)).squeeze(2) # (batch_size, self.window_size * 2, self.dim) @  (batch_size, self.dim)     # have (batch_size, 1)batch_size
        # pos_dot: (batch_size, self.window_size * 2)
        pos_score = -F.logsigmoid(pos_dot).sum(1)
        neg_samples = torch.randint(0, self.len_vocab, (batch_size, self.n_neg_samples), device=device)
        neg_vectors = self.context(neg_samples) # (batch_size, n_neg_samples, self.dim)
        neg_dot = torch.bmm(neg_vectors, centroid_vec.unsqueeze(2)).squeeze(2) # (batch_size, n_neg_samples, self.dim) @  (batch_size, self.dim) # have (batch_size, 1)
        # neg_dot: (batch_size, n_neg_samples)
        neg_score = -F.logsigmoid(-neg_dot).sum(1)
        total_loss = (neg_score + pos_score).mean()



        # for i in range(self.window * 2 + 1):
        #     if i == self.window:
        #         continue
        #     context_vec = self.context(torch.tensor(self.word_to_idx[words[i]]))
        #     neg_samples = torch.randint(0, self.len_vocab, (self.n_neg_samples,))
        #     neg_vectors = self.context(neg_samples)
        #     pos_score = -torch.log(torch.sigmoid(torch.dot(context_vec, centroid_tens)))
        #     neg_score = -torch.sum(torch.log(torch.sigmoid((-1) * torch.mul(neg_vectors, centroid_tens) + 1e-10)))
        #     losses.append(pos_score + neg_score)
        return total_loss

In [17]:
'engineering' in list(vocab)

True

In [18]:
for i in dataloader:
    print(i)
    break

tensor([[31212, 31212, 38103, 81019, 64680],
        [31212, 38103, 81019, 64680, 41951],
        [38103, 81019, 64680, 41951, 52624],
        [81019, 64680, 41951, 52624, 45904],
        [64680, 41951, 52624, 45904, 28566],
        [41951, 52624, 45904, 28566, 24934]])


In [19]:
from tqdm import tqdm
model = MyWord2Vec(2, 2, list(vocab), 10, word_to_idx)
model = model.to(device)
model.train()
model.word_to_idx['you']

28439

In [20]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
for epoch in range(5):
    data_loop = tqdm(dataloader, leave=False)
    for sent in data_loop:
        sent = sent.to(device)
        optimizer.zero_grad()
        loss = model.forward(sent)
        loss.backward()
        optimizer.step()


                                                            

In [21]:
torch.save(model.state_dict, './modelw2v.pth')

In [26]:
model = model.to('cpu')

In [39]:
a = model.vectors(torch.tensor(model.word_to_idx['you'])).detach().numpy()
b = np.array([ 2.919476, -4.758562])
print(np.dot(a, b)/(np.linalg.norm(a) * np.linalg.norm(b)))

0.6073406994250237


In [30]:
model.vectors(torch.tensor(model.word_to_idx['black']))

tensor([-0.0593, -0.5388], grad_fn=<EmbeddingBackward0>)