In [15]:
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import json
from torch.utils.data import Dataset, DataLoader
import gc
import itertools
import sys
# Words seen fewer than MIN_COUNT times in the corpus are dropped from the vocabulary.
MIN_COUNT=5
# Number of context words taken on each side of the centre word.
WINDOW_SIZE=2
# Learning rate handed to the Adam optimiser.
LR=0.001
# Length of each word-embedding vector.
EMBEDDING_DIM = 200
# Number of (context, centre) samples per DataLoader batch.
BATCH_SIZE = 10000
# Number of full passes over the dataset in the training loop.
NUM_EPOCH = 2
# True: rebuild the vocabulary and dataset from the raw corpus; False: load the cached files.
MAKE_DATASET=False
# Path of the raw corpus JSON; only read when MAKE_DATASET is True.
corpus_file="data/corpus.json"

In [2]:
if MAKE_DATASET:
    # Rebuild path: derive the vocabulary and the subsampled training corpus
    # from the raw corpus, then cache the vocabulary on disk.
    print("Starting to make Dataset..Loading Corpus..")
    with open(corpus_file) as f:
        corpus = json.load(f)

    # Count every non-empty token in a single pass over the corpus.
    # Splitting on a single space yields "" for consecutive spaces; those are
    # skipped explicitly instead of being hidden behind a bare `except`.
    print("Constructing Vocab..")
    count_word = {}
    for sentence in tqdm(corpus):
        for word in sentence.split(" "):
            if word:
                count_word[word] = count_word.get(word, 0) + 1

    # Most frequent first. sorted() is stable and the dict preserves insertion
    # order, so ties are broken by first appearance in the corpus, which keeps
    # the word -> index mapping reproducible between runs.
    sorted_words = sorted(count_word, key=count_word.get, reverse=True)
    words = [word for word in sorted_words if count_word[word] >= MIN_COUNT]
    # Counts are descending, so everything after the kept prefix is too rare.
    removed_words = set(sorted_words[len(words):])
    del sorted_words

    # Keep-probability used to subsample frequent words.
    # NOTE(review): `total` is the number of distinct words, not the number of
    # tokens, so any word whose count exceeds the vocabulary size gets a
    # probability <= 0 and is always dropped. Behaviour kept as-is; confirm
    # whether a token-frequency based formula was intended.
    total = len(count_word)
    prob_word = {k: 1 - (v / total) for k, v in count_word.items()}

    def is_successful(success_prob):
        """Return True with probability `success_prob` (one uniform draw in [0, 1))."""
        return np.random.uniform() < success_prob

    # Build the training corpus: drop empty tokens and rare words, then
    # randomly subsample the rest. Sentences left empty are discarded.
    # NOTE(review): no random seed is set, so the subsample differs per run.
    cbow_corpus = []
    print("Creating CBOW Corpus...")
    for sentence in tqdm(corpus):
        kept = [
            word
            for word in sentence.split(" ")
            if word and word not in removed_words and is_successful(prob_word[word])
        ]
        if kept:
            cbow_corpus.append(kept)

    with open("data/cbow_words.json", "w") as f:
        json.dump(words, f, indent=4)

    # Release the large intermediates; only `words` and `cbow_corpus` are
    # needed by the following cells.
    del corpus
    del count_word
    del removed_words
    print("DONE")
else:
    # Fast path: reuse the vocabulary cached by a previous rebuild.
    print("Loading Vocab")
    with open("data/cbow_words.json") as f:
        words = json.load(f)
    print("Done")

Loading Vocab
Done


In [3]:
# Map each vocabulary word to its position in `words` (its embedding row index).
word2Ind = dict(zip(words, range(len(words))))

In [4]:
class CBOWDataSet(Dataset):
    """In-memory dataset of (context word indices, centre word index) pairs.

    Args:
        vocab_size: Accepted for interface compatibility; not used here.
        window_size: Number of context words taken on each side of the centre.
        make_corpus: If True, build the pairs from the module-level
            `cbow_corpus` and `word2Ind` and write them to `dataset_path`,
            one JSON document per line. If False, load the pairs from
            `dataset_path`.
        dataset_path: Location of the line-delimited dataset file.

    Raises:
        ValueError: If a non-blank line of the dataset file is not valid JSON
            (json.JSONDecodeError is a ValueError subclass).
    """

    def __init__(self, vocab_size, window_size=2, make_corpus=False,
                 dataset_path="./data/cbow_dataset.txt"):
        self.context_n_centre = []
        if make_corpus:
            # `cbow_corpus` and `word2Ind` are only read, so no `global`
            # declaration is needed to reach the module-level objects.
            with open(dataset_path, "w") as f:
                for sentence in tqdm(cbow_corpus):
                    # Only positions with a full window on both sides are used.
                    for i in range(window_size, len(sentence) - window_size):
                        context = sentence[i - window_size:i] + sentence[i + 1:i + 1 + window_size]
                        if not context:
                            # Happens when window_size is 0.
                            continue
                        context = [word2Ind[word] for word in context]
                        centre = word2Ind[sentence[i]]
                        self.context_n_centre.append((context, centre))
                        f.write(json.dumps((context, centre)) + "\n")
        else:
            # First pass only counts lines so the progress bar has a total.
            print("Getting lines..")
            with open(dataset_path) as f:
                total = sum(1 for _ in f)
            print("Loading Dataset..")
            with open(dataset_path) as f:
                for line in tqdm(f, total=total):
                    line = line.strip()
                    if not line:
                        # Tolerate blank / trailing lines.
                        continue
                    # The file is written with json.dumps, so parse it as JSON
                    # rather than eval()-ing file content as Python code.
                    self.context_n_centre.append(json.loads(line))
            print("Done")

    def __getitem__(self, idx):
        """Return (tensor of context word indices, centre word index) for sample `idx`."""
        context, centre = self.context_n_centre[idx]
        return (torch.tensor(context), centre)

    def __len__(self):
        """Return the number of (context, centre) pairs held in memory."""
        return len(self.context_n_centre)

In [5]:
import torch
import numpy as np
class CBOW(torch.nn.Module):
    """Continuous bag-of-words model: averaged context embeddings -> vocabulary log-probabilities."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the projection back onto the vocabulary."""
        super().__init__()
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.linear = torch.nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return log-softmax scores over the vocabulary for each row of `inputs`.

        `inputs` holds word indices; the embeddings are averaged over dim 1
        and the softmax is taken over dim 1 of the projected result.
        """
        averaged = self.embeddings(inputs).mean(dim=1)
        scores = self.linear(averaged)
        return torch.log_softmax(scores, dim=1)

In [16]:
# Wire up everything the training loop needs: dataset, network, optimiser,
# loss and batch loader.
data = CBOWDataSet(len(words), window_size=WINDOW_SIZE, make_corpus=MAKE_DATASET)
model = CBOW(vocab_size=len(words), embedding_dim=EMBEDDING_DIM)
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
# The model emits log-probabilities, which is what NLLLoss expects.
loss_function = torch.nn.NLLLoss()
losses = list()
# No `shuffle` argument is given, so batches follow the dataset's stored order.
data_loader = DataLoader(dataset=data, batch_size=BATCH_SIZE)

In [17]:
# The tokenised corpus is only needed while the dataset is being built.
# Guarding on its presence keeps this cell safe to re-run, and the explicit
# collection asks Python to reclaim the memory straight away.
if MAKE_DATASET and "cbow_corpus" in globals():
    del cbow_corpus
    gc.collect()

In [18]:
# Number of (context, centre) training pairs.
len(data)

48242285

In [13]:
# Peek at the first five (context indices, centre index) pairs.
list(itertools.islice(data.context_n_centre, 5))

[[[341, 51, 1809, 38847], 404],
 [[51, 404, 38847, 1181], 1809],
 [[404, 1809, 1181, 21794], 38847],
 [[131, 522, 308, 1649], 596],
 [[522, 596, 1649, 330], 308]]

In [19]:
# Train for NUM_EPOCH passes over the data, recording each epoch's summed
# batch loss in `losses`.
for epoch in range(NUM_EPOCH):
    total_loss = 0
    print(f"Epoch:{epoch}")
    for context, centre in tqdm(data_loader):
        # Clear gradients before the backward pass. Zeroing only after
        # step() leaves stale gradients behind if the cell is interrupted
        # mid-iteration, and they would be added to the next run's first step.
        optimizer.zero_grad()
        log_probs = model(context)
        loss = loss_function(log_probs, centre)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)
    print('total_loss:',total_loss)

Epoch:0


  0%|          | 0/4825 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Copy the learned embedding table out of the network as a NumPy array,
# detached from autograd and moved to CPU memory.
embed_matrix = (
    model.embeddings.weight
    .detach()
    .cpu()
    .numpy()
)

In [None]:
# Build the exportable word -> embedding-vector mapping.
# Note: this rebinds `model` from the trained network to a plain dict.
model = {word: embed_matrix[word2Ind[word]].tolist() for word in words}

In [None]:
# Persist the word -> vector mapping as pretty-printed JSON.
with open("./data/cbow_model.json","w") as out_file:
    json.dump(model, out_file, indent=4)