In [1]:
import torch
import gensim
import pickle

import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from os import listdir
from nltk import word_tokenize,pos_tag
from tqdm import tqdm as tqdm
from torch.autograd import Variable

# --- Configuration: everything a reader might want to tune ---
dataset_path = 'dataset'  # directory whose files are read and tokenized by load()
embeddings_path = 'GoogleNews-vectors-negative300.bin'  # binary word2vec file loaded through gensim in load()
embedding_dimension = 300  # number of columns of the embedding matrix built in load()
vocab_size = 0  # placeholder; rebound to the real vocabulary size when load() is called
window_size = 2  # neighbours taken on each side of a token when forming training pairs
debug_iters = 100  # the training loop prints the accumulated loss every `debug_iters` iterations
epochs = 2  # number of full passes over the training pairs
batch_size = 256  # number of training pairs per optimizer step

In [2]:
def load():
    """Build the vocabulary, initial embeddings and training pairs for the new domain.

    Steps:
      1. Load the pre-trained word2vec vectors from ``embeddings_path``.
      2. Tokenize every regular file found in ``dataset_path``.
      3. Replace tokens tagged as proper nouns (``NNP``/``NNPS``) with ``'-pro-'``.
      4. Assign an integer id to every distinct token, in order of first appearance.
      5. Create a ``(vocab_size, embedding_dimension)`` matrix initialised with
         ``torch.randn`` and overwrite the rows of tokens that exist in the
         pre-trained model with their pre-trained vectors.
      6. Emit one (context id, center id) pair for every token and each of its
         neighbours within ``window_size`` positions.

    Reads the module-level ``embeddings_path``, ``dataset_path``,
    ``embedding_dimension`` and ``window_size``.

    Returns:
        tuple: ``(word2idx, idx2word, vocab_size, embedding_dimension,
        initial_embeds, train_examples, target_words)`` where
        ``train_examples[n]`` is the id of a context word and
        ``target_words[n]`` is the id of the center word it was paired with.
    """
    # Local import: the notebook only imports `listdir` from os at the top.
    from os.path import isfile, join

    print('Loading Google Model')

    model = gensim.models.KeyedVectors.load_word2vec_format(embeddings_path, binary=True)

    print('Generating Vocab')
    word2idx = {}
    idx2word = {}

    print('Reading new domain files')

    data_tokenized = []
    # listdir() returns entries in arbitrary order; sorting makes the token ids
    # (which depend on first appearance) reproducible from run to run.
    for file_name in sorted(listdir(dataset_path)):
        file_path = join(dataset_path, file_name)
        # Skip subdirectories (e.g. .ipynb_checkpoints), which open() cannot read.
        if not isfile(file_path):
            continue
        with open(file_path) as file:
            data_tokenized.append(word_tokenize(file.read()))

    print("Replacing proper nouns")
    for tokens in tqdm(data_tokenized):
        # pos_tag returns a new list, so rewriting `tokens` in place while
        # walking the tagged copy is safe.
        for position, (_, tag) in enumerate(pos_tag(tokens)):
            if tag in ('NNP', 'NNPS'):
                tokens[position] = '-pro-'

    print('Adding new domain tokens')
    for tokens in data_tokenized:
        for token in tokens:
            if token not in word2idx:
                idx = len(word2idx)
                word2idx[token] = idx
                idx2word[idx] = token

    vocab_size = len(word2idx)

    print('Copying old embeddings')

    # Tokens unknown to the pre-trained model keep their random initialisation.
    initial_embeds = torch.randn(vocab_size, embedding_dimension)
    for i in range(vocab_size):
        word = idx2word[i]
        # `in model` is supported by KeyedVectors in both gensim 3.x and 4.x,
        # whereas the `.vocab` attribute was removed in gensim 4.
        if word in model:
            initial_embeds[i, :] = torch.as_tensor(model[word])

    print("Creating Training Examples")

    train_examples = []
    target_words = []
    for tokens in tqdm(data_tokenized):
        # Resolve ids once per document instead of once per (center, context) pair.
        token_ids = [word2idx[token] for token in tokens]
        length = len(token_ids)
        for center in range(length):
            first = max(0, center - window_size)
            last = min(length, center + window_size + 1)
            for context in range(first, last):
                if context == center:
                    continue
                train_examples.append(token_ids[context])
                target_words.append(token_ids[center])

    return word2idx,idx2word,vocab_size,embedding_dimension,initial_embeds,train_examples,target_words

# Run the data pipeline once. This rebinds the module-level `vocab_size`
# (initially 0) to the real vocabulary size; `embedding_dimension` is returned
# by load() as-is.
word2idx,idx2word,vocab_size,embedding_dimension,initial_embeds,train_examples,target_words = load()

Loading Google Model
Generating Vocab
Reading new domain files


  0%|          | 0/13 [00:00<?, ?it/s]

Replacing proper nouns


100%|██████████| 13/13 [00:28<00:00,  2.26s/it]


Adding new domain tokens
Copying old embeddings
Creating Training Examples

100%|██████████| 13/13 [00:02<00:00,  4.85it/s]







In [9]:
print('Creating Model')

# Net comes from the project-local `model` module, which is not shown in this notebook.
from model import Net

# Build the network for the vocabulary size / embedding width returned by load()
# and hand it the embedding matrix assembled there.
# NOTE(review): set_weights presumably installs `initial_embeds` as the embedding
# layer's weights — confirm against model.py.
net = Net(vocab_size,embedding_dimension)
net.set_weights(initial_embeds)

# SGD with momentum over all parameters exposed by net.parameters();
# cross-entropy is applied to the network's raw outputs in the training loop.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
criterion = nn.CrossEntropyLoss()

Creating Model


In [11]:
print('Training')

# Mini-batch training over the (context id, center id) pairs produced by load():
# the network receives a batch of context-word ids and is trained with
# cross-entropy to predict the id of the corresponding center word.
for epoch in range(epochs):
    total_loss = 0.0
    iter_num = 0
    for i in range(0,len(train_examples),batch_size):
        # Overwrite the same console line with the current iteration number.
        print(iter_num,end='\r')

        # List slicing clamps at the end of the list, so the final (shorter)
        # batch needs no special case.
        context_words = train_examples[i:i+batch_size]
        center_word = target_words[i:i+batch_size]

        input_ = torch.tensor(context_words)
        # CrossEntropyLoss expects int64 class indices as targets.
        output_ = torch.tensor(center_word, dtype=torch.long)

        optimizer.zero_grad()

        # Forward
        outputs = net(input_)

        # Backward
        loss = criterion(outputs,output_)
        loss.backward()

        # Optimize
        optimizer.step()

        total_loss += loss.item()
        if iter_num % debug_iters == debug_iters-1:
            # Sum of the batch losses since the last report (not an average).
            print(total_loss)
            # Sanity probe: which word does the network currently rank highest
            # for the input 'the'? Run without autograd so no graph is built.
            with torch.no_grad():
                out = net(torch.tensor(word2idx['the']))
                # A single scalar index yields a 1-D score vector here (the
                # argmax below is a scalar), so normalise over dim 0 explicitly
                # rather than relying on the deprecated implicit dim.
                log_softmax = F.log_softmax(out, dim=0)
                # Named variable instead of `_`, which IPython uses for the
                # last cell output.
                best_log_prob, indices = log_softmax.max(0)
            print(idx2word[int(indices)],best_log_prob,indices)
            total_loss = 0.0
        iter_num += 1

Training
2652.6342344284058
of tensor(-3.9290, grad_fn=<MaxBackward0>) tensor(16)
101



2704.070827484131
of tensor(-4.0626, grad_fn=<MaxBackward0>) tensor(16)
2843.9307537078857
of tensor(-4.1882, grad_fn=<MaxBackward0>) tensor(16)
388

KeyboardInterrupt: 

In [25]:
# Persist the vocabulary metadata needed to interpret the saved model's
# embedding rows (token <-> id maps and the matrix dimensions).
db = {
    'word2idx': word2idx,
    'idx2word': idx2word,
    'embedding_dimension': embedding_dimension,
    'vocab_size': vocab_size,
}

# Open with 'wb', not 'ab': appending would stack a new pickle after the old one
# every time this cell is re-run, and pickle.load() would keep returning the
# first (stale) object. The `with` block closes the file even if dump() fails.
with open('pickLLe', 'wb') as dbfile:
    pickle.dump(db, dbfile)

In [43]:
# Save only the network's state_dict (not the Net object itself) to the file 'torchModel'.
torch.save(net.state_dict(), 'torchModel')