In [142]:
import re
import numpy as np
from tqdm import tqdm
import mmap
import time

In [3]:
question_path='data/question.txt'
sentence_path='data/sentence.txt'
embs_path='data/embs.npy'

In [72]:
def get_num_lines(file_path):
    fp = open(file_path, "r+")
    buf = mmap.mmap(fp.fileno(), 0)
    lines = 0
    while buf.readline():
        lines += 1
    fp.close()
    return lines

def parse_words(filen_name):
    with open(filen_name,"r") as file:
        text = re.sub(r'\W+', ' ', file.read())
        text = re.sub(r' +',' ',text)
        return set(text.split())

def create_words_dict():
    q_words=parse_words(question_path)
    s_words=parse_words(sentence_path)
    words=q_words.union(s_words)
    id_2_word=dict(enumerate(words))
    return id_2_word,dict(zip(id_2_word.values(), id_2_word.keys()))

def str_2_id(x):
    return word_2_id[x]

def prepare_data(text):
    X=[]
    with open(text,'r') as f:
        for line in tqdm(f,total=get_num_lines(text)):
            line = re.sub(r'\W+', ' ', line)
            line = re.sub(r' +',' ',line)
            X.append(np.array(list(map(str_2_id,line.split()))+[eos_token]))
        
    
    return np.array(X)

In [99]:
id_2_word,word_2_id=create_words_dict()
embeddings=np.load(embs_path)
embeddings=np.vstack([embeddings,np.zeros([1,300])])
embeddings=np.vstack([embeddings,np.zeros([1,300])])
print('> Vocab Len : ',len(id_2_word)+2)
eos_token=len(id_2_word)
sos_token=len(id_2_word)+1
vocab_size=len(id_2_word)+2
print('> Embs Shape : ',embeddings.shape)

> Vocab Len :  64940
> Embs Shape :  (64940, 300)


In [100]:
print('> Loading Questions ')
Qs=prepare_data(question_path)
print('> Loading Sentences ')
As=prepare_data(sentence_path)

  3%|▎         | 2218/70484 [00:00<00:03, 22174.21it/s]

> Loading Questions 


100%|██████████| 70484/70484 [00:01<00:00, 38105.41it/s]
  6%|▌         | 4282/70484 [00:00<00:03, 21402.90it/s]

> Loading Sentences 


100%|██████████| 70484/70484 [00:03<00:00, 21699.33it/s]


In [8]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

In [101]:
class Encoder(nn.Module):
    def __init__(self,input_size,hidden_size,embs):
        super(Encoder,self).__init__()
        self.hidden_size=hidden_size
        self.embedding=nn.Embedding(input_size,hidden_size)
        self.embedding.weight.data.copy_(torch.from_numpy(embs))
        self.gru=nn.GRU(hidden_size,hidden_size)
    
    def forward(self,inputt,hidden):
        embedded=self.embedding(inputt).view(1,1,-1)
        output=embedded
        output,hidden=self.gru(output,hidden)
        return output,hidden

    def init_hidden(self):
        result=Variable(torch.zeros(1,1,self.hidden_size))
        return result
    
    
    

In [140]:
# test Encoder
input_size=vocab_size
hidden_size=300

enc=Encoder(input_size,hidden_size,embeddings)
enc_hidden=enc.init_hidden()


In [141]:
sentence=Variable(torch.from_numpy(As[0])).view(-1,1)
print(sentence[0].size())
# print(sentence[:,0])
# # print(sentence[0].view(1,1,-1))
for i in range(As[0].shape[0]):
    _,enc_hidden=enc.forward(sentence[i],enc_hidden)

print(enc_output.shape)



torch.Size([1])
torch.Size([1, 1, 300])


In [130]:
class Decoder(nn.Module):
    def __init__(self,hidden_size,output_size,embs):
        super(Decoder,self).__init__()
        self.hidden_size=hidden_size
        self.embedding=nn.Embedding(input_size,hidden_size)
        self.embedding.weight.data.copy_(torch.from_numpy(embs))
        self.gru=nn.GRU(hidden_size,hidden_size)
        self.out=nn.Linear(hidden_size,output_size)
        self.softmax=nn.LogSoftmax(dim=1)
    
    def forward(self,inputt,hidden):
        output=self.embedding(inputt).view(1,1,-1)
        output=F.relu(output)
        output,hidden=self.gru(output,hidden)
        output=self.out(output[0])
        output=self.softmax(output)
        return output,hidden    

In [131]:
dec=Decoder(hidden_size,input_size,embeddings)

In [134]:
sos=Variable(torch.LongTensor([input_size-1])).view(-1,1)
print(sos[0].size())
dec.embedding(sos[0]).view(1,-1,1)
o,h=dec.forward(sos[0],enc_hidden)
print(o.shape)

torch.Size([1])
torch.Size([1, 64940])


In [30]:
x = np.array([1, 2, 25])
x=x.astype('float32')
x

array([ 1.,  2., 25.], dtype=float32)

In [136]:
def train(sentence,question,encoder,decoder,enc_opt,dec_opt,criterion,max_length=100):
    t=time.time()
    encoder_hidden=encoder.init_hidden()
    
    enc_opt.zero_grad()
    dec_opt.zero_grad()
    
    sentence=Variable(torch.from_numpy(sentence)).view(-1,1)
    question=Variable(torch.from_numpy(question)).view(-1,1)
    
    s_len=sentence.size()[0]
    q_len=question.size()[0]
    
    loss=0
    for ei in range(s_len):
        encoder_output,encoder_hidden=encoder.forward(sentence[ei],encoder_hidden)
    
    te=time.time()
    print('encoder forward',te-t)
    sos=Variable(torch.LongTensor([input_size-1])).view(-1,1)
    decoder_hidden=encoder_hidden
    
    for di in range(q_len):
        decoder_output,decoder_hidden=decoder(sos,decoder_hidden)
        topv, topi = decoder_output.data.topk(1)
        ni = topi[0][0]
        loss+=criterion(decoder_output,question[di])
    
        if ni==eos_token:
            break
    
    loss.backward()
    
    enc_opt.step()
    dec_opt.step()
    
    return loss.data[0]/q_len

In [138]:
input_size=vocab_size
hidden_size=300

enc=Encoder(input_size,hidden_size,embeddings)
dec=Decoder(hidden_size,input_size,embeddings)

enc_opt=optim.SGD(enc.parameters(),lr=0.01)
dec_opt=optim.SGD(dec.parameters(),lr=0.01)

criterion=nn.NLLLoss()

print_every=100
loss=0
for i in tqdm(range(Qs.shape[0])):
    loss += train(As[i],Qs[i],enc,dec,enc_opt,dec_opt,criterion)
    break
    if(i%print_every==0):
        print('Loss : ',loss/print_every)
        loss=0


  0%|          | 0/70484 [00:00<?, ?it/s][A
  0%|          | 1/70484 [00:00<16:29:06,  1.19it/s][A

Loss :  0.11062356948852539



  0%|          | 2/70484 [00:01<15:49:59,  1.24it/s][A
  0%|          | 3/70484 [00:02<14:35:50,  1.34it/s][A
Exception in thread Thread-14:
Traceback (most recent call last):
  File "/usr/lib/python3.5/threading.py", line 914, in _bootstrap_inner
    self.run()
  File "/home/arijitx/.local/lib/python3.5/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/usr/lib/python3.5/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration

  0%|          | 101/70484 [01:21<15:45:20,  1.24it/s]

Loss :  8.38737415243744


  0%|          | 201/70484 [02:42<15:49:00,  1.23it/s]

Loss :  5.320705669099673


  0%|          | 301/70484 [03:56<15:19:38,  1.27it/s]

Loss :  4.831473617442605


  1%|          | 401/70484 [05:08<14:58:12,  1.30it/s]

Loss :  4.66480626704426


  1%|          | 408/70484 [05:13<14:56:10,  1.30it/s]


KeyboardInterrupt: 

151.96451146258937