# STATIC GRAPH

In [55]:
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable


## DATA

In [56]:
import data_utils
metadata, idx_q, idx_a = data_utils.load_data('../data/')

In [57]:
# add special symbol
i2w = metadata['idx2w'] + ['GO']
w2i = metadata['w2idx']
w2i['GO'] = len(i2w)-1

## Parameters

In [74]:
batch_size = 1
L = len(idx_q[0])
vocab_size = len(i2w)
hidden_size = 256

In [84]:
class Config:
    pass

config = Config()
config.printsize = True

In [169]:
len(idx_q)

267518


## Graph

In [278]:
def initial_state():
    state = torch.zeros([batch_size, hidden_size])
    return Variable(state.cuda())

def psize(name, variable):
    if config.printsize:
        print(name, variable.size(), type(variable.data))
        
class Encoder(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(Encoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
                
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.encode = nn.LSTMCell(hidden_size, hidden_size)
            
    def forward(self, enc_inputs, hidden):
        psize('enc_inputs', enc_inputs)
        enc_embeddings = self.embed(enc_inputs).unsqueeze(1)                   #LxBxH
                
        psize('enc_embeddings', enc_embeddings)        
        hidden, cell_state = hidden
        for i in range(enc_embeddings.size()[1]):
            hidden, cell_state = self.encode(enc_embeddings[i], (hidden, cell_state))
            
        return hidden, cell_state
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(Decoder, self).__init__()
        
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        
        self.decode = nn.LSTMCell(hidden_size, hidden_size)
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.project = nn.Linear(hidden_size, vocab_size)
        
    def forward(self, outputs, hidden):
        psize('hidden', hidden[0]), psize('hidden', hidden[1])
        predicted_outputs = []
    
        dec_embeddings = self.embed(outputs).unsqueeze(1)           #LxBxH
    
        GO = torch.LongTensor([w2i['GO']]).cuda()            
        GO = Variable(GO)
        psize('GO', GO)
        GO_emb = self.embed(GO)
        psize('GO_emd', GO_emb)
        
        hidden, cell_state = self.decode(GO_emb, hidden)
        predicted_outputs.append(hidden)
        for i in range(outputs.size()[0] - 1):
            import random
            if random.random() > 0:
                dec_input = dec_embeddings[i+1]
            else:
                dec_input = hidden
            hidden, cell_state = self.decode(dec_input, (hidden, cell_state))
            predicted_outputs.append(hidden)
            
        predicted_outputs = torch.stack(predicted_outputs).squeeze(1)
        psize('predicted_outputs', predicted_outputs)

        predicted_outputs = self.project(predicted_outputs)
        psize('predicted_outputs', predicted_outputs)
        return predicted_outputs
    
    def predict(self, outputs, hidden):
        psize('hidden', hidden[0]), psize('hidden', hidden[1])
        predicted_outputs = []
    
        dec_embeddings = self.embed(outputs).unsqueeze(1)           #LxBxH
    
        GO = torch.LongTensor([w2i['GO']]).cuda()            
        GO = Variable(GO)
        psize('GO', GO)
        GO_emb = self.embed(GO)
        psize('GO_emd', GO_emb)
        
        hidden, cell_state = self.decode(GO_emb, hidden)
        predicted_outputs.append(hidden)
        for i in range(outputs.size()[0] - 1):
            dec_input = hidden
            hidden, cell_state = self.decode(dec_input, (hidden, cell_state))
            predicted_outputs.append(hidden)
            
        predicted_outputs = torch.stack(predicted_outputs).squeeze(1)
        psize('predicted_outputs', predicted_outputs)

        predicted_outputs = self.project(predicted_outputs)
        psize('predicted_outputs', predicted_outputs)
        return predicted_outputs
    

# TRAINING

In [206]:
from pprint import pprint
from tqdm import tqdm
def train_epochs(epochs, encoder, decoder, eoptim, doptim, criterion, print_every=1):
    model.train()
    losses = []
    config.printsize = True

    for epoch in tqdm(range(epochs+1)):
        loss = train(encoder, decoder, eoptim, doptim, criterion, idx_q[:30000], idx_a[:30000])    
        if epoch % print_every == 0:
            losses.append(loss)
            print('{} - loss: {}'.format(epoch, loss))

        
def train(encoder, decoder, eoptim, doptim, criterion, question_ids, answer_ids):
    for question_id, answer_id in zip(question_ids, answer_ids):
        data = Variable(torch.from_numpy(question_id).long().cuda())
        target = Variable(torch.from_numpy(answer_id).long().cuda())

        eoptim.zero_grad(), doptim.zero_grad()    
        initial_hidden = initial_state().cuda(), initial_state().cuda()
        
        encoder_output = encoder(data, initial_hidden)
        decoder_output = decoder(target, encoder_output)
        logits = F.log_softmax(decoder_output)
        loss = criterion(logits, target)    
        loss.backward()
        eoptim.step(), doptim.step()
        config.printsize = False
    return loss.data[0]

In [194]:
encoder = Encoder(vocab_size, hidden_size)
decoder = Decoder(vocab_size, hidden_size)

encoder.cuda()
decoder.cuda()

criterion = nn.NLLLoss()

eoptim = optim.SGD(encoder.parameters(), lr=0.1, momentum=0.1)
doptim = optim.SGD(decoder.parameters(), lr=0.1, momentum=0.1)

In [276]:
train_epochs(10, encoder, decoder, eoptim, doptim, criterion,)

  0%|          | 0/11 [00:00<?, ?it/s]

enc_inputs torch.Size([21]) <class 'torch.autograd.variable.Variable'>
enc_embeddings torch.Size([21, 1, 256]) <class 'torch.autograd.variable.Variable'>
hidden torch.Size([1, 256]) <class 'torch.autograd.variable.Variable'>
hidden torch.Size([1, 256]) <class 'torch.autograd.variable.Variable'>
GO torch.Size([1]) <class 'torch.autograd.variable.Variable'>
GO_emd torch.Size([1, 256]) <class 'torch.autograd.variable.Variable'>
predicted_outputs torch.Size([21, 256]) <class 'torch.autograd.variable.Variable'>
predicted_outputs torch.Size([21, 6005]) <class 'torch.autograd.variable.Variable'>


  9%|▉         | 1/11 [08:53<1:28:50, 533.04s/it]

0 - loss: 0.23039956390857697


 18%|█▊        | 2/11 [18:34<1:22:08, 547.63s/it]

1 - loss: 0.22800886631011963


 27%|██▋       | 3/11 [28:06<1:13:57, 554.74s/it]

2 - loss: 0.22560757398605347


 36%|███▋      | 4/11 [37:59<1:06:03, 566.26s/it]

3 - loss: 0.22365570068359375


 45%|████▌     | 5/11 [47:14<56:17, 562.96s/it]  

4 - loss: 0.2219550907611847


 55%|█████▍    | 6/11 [56:54<47:20, 568.01s/it]

5 - loss: 0.22024543583393097


 64%|██████▎   | 7/11 [1:06:02<37:28, 562.15s/it]

6 - loss: 0.21847470104694366


 73%|███████▎  | 8/11 [1:15:11<27:54, 558.02s/it]

7 - loss: 0.21648193895816803


 82%|████████▏ | 9/11 [1:24:03<18:20, 550.33s/it]

8 - loss: 0.21421021223068237


 91%|█████████ | 10/11 [1:32:59<09:05, 545.95s/it]

9 - loss: 0.21166130900382996


100%|██████████| 11/11 [1:41:51<00:00, 541.96s/it]

10 - loss: 0.20907345414161682





In [213]:
torch.save(encoder.state_dict(), 'graph.pytorch.encoder.pth')
torch.save(decoder.state_dict(), 'graph.pytorch.decoder.pth')

## Test

In [279]:
encoder_test = Encoder(vocab_size, hidden_size)
decoder_test = Decoder(vocab_size, hidden_size)
encoder_test.cuda()
decoder_test.cuda()
encoder_test.load_state_dict(torch.load('graph.pytorch.encoder.pth'))
decoder_test.load_state_dict(torch.load('graph.pytorch.decoder.pth'))

In [280]:
batch = 0
l, r = batch * B, (batch + 1) * B
test_q, test_a = idx_q[0], idx_a[0]

encoder_test.eval()
decoder_test.eval()

test_q = Variable(torch.from_numpy(test_q).long().cuda())
test_a = Variable(torch.from_numpy(test_a).long().cuda())

config.printsize = True

hidden = initial_state().cuda(), initial_state().cuda()
predictions = decoder_test.predict(test_a, encoder_test(test_q, hidden))
predictions = F.log_softmax(predictions).max(1)[1].squeeze(1)


enc_inputs torch.Size([21]) <class 'torch.cuda.LongTensor'>
enc_embeddings torch.Size([21, 1, 256]) <class 'torch.cuda.FloatTensor'>
hidden torch.Size([1, 256]) <class 'torch.cuda.FloatTensor'>
hidden torch.Size([1, 256]) <class 'torch.cuda.FloatTensor'>
GO torch.Size([1]) <class 'torch.cuda.LongTensor'>
GO_emd torch.Size([1, 256]) <class 'torch.cuda.FloatTensor'>
predicted_outputs torch.Size([21, 256]) <class 'torch.cuda.FloatTensor'>
predicted_outputs torch.Size([21, 6005]) <class 'torch.cuda.FloatTensor'>


In [216]:
def arr2sent(arr):
    return ' '.join([i2w[item] for item in arr])

In [281]:
print(predictions)
print(arr2sent(predictions.cpu().data.numpy()))
print(arr2sent(test_a.cpu().data.numpy()))

Variable containing:
    1
  624
 3921
 4403
 4239
  475
  475
 1240
 1240
 4106
 4102
 3722
  414
 3920
 1795
  855
 3428
  475
  475
 1240
 3207
[torch.cuda.LongTensor of size 21 (GPU 0)]

unk forget bath irresponsible coincidence dead dead brooklyn brooklyn yahoo aaron metal hit kit narrative weve penn dead dead brooklyn cutest
yeah dude i would definitely consider a daniel unk super reliable and they are just bad ass EOS _ _ _
