In [3]:
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import json
from torch.utils.data import Dataset, DataLoader
import gc
import itertools
import sys
MIN_COUNT=5
WINDOW_SIZE=2
LR=0.0001
EMBEDDING_DIM = 200
BATCH_SIZE = 2048
NUM_EPOCH = 25
MAKE_DATASET=True
SUB_SAMPLING=False
corpus_file="data/corpus.json"

In [4]:
if MAKE_DATASET==True:
    print("Starting to make Dataset..Loading Corpus..")
    with open(corpus_file) as f:
        corpus = json.load(f)
    words = set()
    print("Constructing Vocab..")
    for sentence in tqdm(corpus):
        cur_words = sentence.split(" ")
        for word in cur_words:
            words.add(word)
    words =list(words)
    words =[i for i in words if len(i)!=0]
    count_word ={word:0 for word in words}
    for sentence in tqdm(corpus):
        cur_words = sentence.split(" ")
        for word in cur_words:
            try:
                count_word[word]+=1
            except:
                pass
    removed_words = []
    sorted_words = [pair[0] for pair in sorted(count_word.items(), key=lambda item: item[1],reverse=True)]
    temp=[]
    for word in sorted_words:
        if count_word[word]>=MIN_COUNT:
            temp.append(word)
        else:
            removed_words.append(word)
    words = temp
    del sorted_words
    del temp
    removed_words=set(removed_words)
    total = len(list(count_word.keys()))
    prob_word ={k:(1-(v/total)) for k,v in count_word.items()}
    def is_successful(success_prob):
        result = np.random.uniform()
        return result < success_prob
    cbow_corpus=[]
    print("Creating CBOW Corpus...")
    for sentence in tqdm(corpus):
        cur_words = sentence.split(" ")
        data =[]
        for word in cur_words:
            if word in removed_words:
                continue
            if len(word)==0 or word == " ":
                continue
            if is_successful(prob_word[word])==True and SUB_SAMPLING==True:
                data.append(word)
            else:
                data.append(word)         
        if len(data)==0:
            continue
        cbow_corpus.append(data)
    with open("data/cbow_words.json","w") as f:
        json.dump(words,f,indent=4)
    del corpus
    del count_word
    del removed_words
    print("DONE")
else:
    print("Loading Vocab")
    with open("data/cbow_words.json") as f:
        words = json.load(f)
    print("Done")

Starting to make Dataset..Loading Corpus..
Constructing Vocab..


  0%|          | 0/14432994 [00:00<?, ?it/s]

  0%|          | 0/14432994 [00:00<?, ?it/s]

Creating CBOW Corpus...


  0%|          | 0/14432994 [00:00<?, ?it/s]

DONE


In [5]:
word2Ind = {w:idx for idx, w in enumerate(words)}

In [6]:
class CBOWDataSet(Dataset):
    def __init__(self,vocab_size,window_size=2,make_corpus=False):
        self.context_n_centre=list()
        if make_corpus==True:
            global cbow_corpus
            global word2Ind
            with open("./data/cbow_dataset.txt","w") as f:
                for sentence in tqdm(cbow_corpus):
                    t = len(sentence)
                    for j in range(t):
                        centre = sentence[j]
                        context=[]
                        if(j>=window_size and j+window_size<t):
                            context=sentence[j-window_size:j]+sentence[j+1:j+window_size+1]
                        elif (j<window_size):
                            context=sentence[0:j]+sentence[j+1:j+window_size+1]           
                        elif(j+window_size>=t):
                            context=sentence[j-window_size:j]+sentence[j+1:]
                        if len(context)==0:
                            continue
                        context = [word2Ind[word]for word in context]
                        if len(context)!=window_size*2:
                            total = len(context)
                            while len(context)< window_size*2:
                                context.append(vocab_size)
                        self.context_n_centre.append((context,word2Ind[centre],total))
                        f.write(json.dumps((context,word2Ind[centre],total))+"\n")
        else:
            print("Getting lines..")
            with open("./data/cbow_dataset.txt") as f:
                total = sum(1 for line in f)
            print("Loading Dataset..")
            with open("./data/cbow_dataset.txt") as f:
                for line in tqdm(f,total=total):
                    self.context_n_centre.append(eval(line.replace("\n","")))
            print("Done")
    def __getitem__(self, idx):
        context,centre,total = self.context_n_centre[idx]
        return (torch.tensor(context),centre,total)

    def __len__(self):
        return len(self.context_n_centre)

In [7]:
import torch
import numpy as np
class CBOW(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = torch.nn.Embedding(vocab_size+1, embedding_dim, padding_idx=vocab_size)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size+1)

    def forward(self, inputs,total):
        x = self.embeddings(inputs)
        embeds = torch.sum(x, dim=1)/total.view(total.shape[0],1)
        out = self.linear(embeds)
        log_probs = torch.nn.functional.log_softmax(out,dim=1)
        return log_probs

In [8]:
data = CBOWDataSet(len(words),WINDOW_SIZE,MAKE_DATASET)
model = CBOW(len(words), EMBEDDING_DIM)
optimizer = torch.optim.Adam(model.parameters(), LR)
loss_function = torch.nn.NLLLoss()
losses = []
data_loader = DataLoader(data, batch_size=BATCH_SIZE)

  0%|          | 0/512 [00:00<?, ?it/s]

In [9]:
if MAKE_DATASET==True:
    del cbow_corpus

In [10]:
len(data.context_n_centre)

4021

In [11]:
data.context_n_centre[:5]

[([51, 404, 149308, 149308], 341, 2),
 ([341, 404, 1808, 149308], 51, 3),
 ([341, 51, 1808, 38415], 404, 3),
 ([51, 404, 38415, 1181], 1808, 3),
 ([404, 1808, 1181, 21863], 38415, 3)]

In [12]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('device: ' + str(device))
model = model.to(device)

device: cuda


In [13]:
model.train()
for epoch in range(NUM_EPOCH):
    total_loss = 0
    print(f"Epoch:{epoch}")
    for batch in tqdm(data_loader):
        context,centre,total = batch
        context = context.to(device)
        centre = centre.to(device)
        total = total.to(device)
        log_probs = model(context,total)
        loss = loss_function(log_probs, centre)
        loss.backward()
        optimizer.step()
        model.zero_grad()
        total_loss += loss.detach().item()
    print('total_loss:',total_loss)

Epoch:0


  0%|          | 0/2 [00:00<?, ?it/s]

total_loss: 23.99436664581299
Epoch:1


  0%|          | 0/2 [00:00<?, ?it/s]

total_loss: 23.968806266784668


In [15]:
model.eval()
embeddings = model.embeddings
tokens = torch.tensor([word2Ind[w] for w in words]).to(device)
embeddings = embeddings(tokens)

In [17]:
model ={}
for i, word in enumerate(words):
    model[word]=embeddings[i].detach().cpu().numpy().tolist()

In [19]:
with open("./data/cbow_model.json","w") as f:
    json.dump(model,f,indent=4)