# STATIC GRAPH

In [2]:
import torch
from torch import nn
import torch.nn.functional as F
from torch import optim
from torch.autograd import Variable


## DATA

In [3]:
import data_utils
metadata, idx_q, idx_a = data_utils.load_data('../data/')

In [4]:
# add special symbol
i2w = metadata['idx2w'] + ['GO']
w2i = metadata['w2idx']
w2i['GO'] = len(i2w)-1

## Parameters

In [5]:
B = 256
L = len(idx_q[0])
vocab_size = len(i2w)
enc_hdim = 250
dec_hdim = enc_hdim

In [82]:
len(idx_q)

130422

## Graph

In [88]:
class Model(nn.Module):
    def __init__(self, batch_size, vocab_size, hidden_size, input_size):
        super(Model, self).__init__()
        
        self.batch_size = batch_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.input_size  = input_size
        
        self.encode = nn.LSTMCell(hidden_size, hidden_size)
        self.decode = nn.LSTMCell(hidden_size, hidden_size)
        
        self.embed = nn.Embedding(vocab_size, hidden_size)
        self.project = nn.Linear(hidden_size, vocab_size)
        
        self.printsize = True
        
    def initial_state(self):
        state = torch.zeros([self.batch_size, self.hidden_size])
        return Variable(state.cuda())
    
    def psize(self, name, tensor):
        if self.printsize:
            print(name, tensor.size(), type(tensor))
    
    def forward(self, enc_inputs, dec_outputs):
        self.psize('enc_inputs', enc_inputs)
        self.psize('dec_outputs', dec_outputs)

        enc_embeddings = self.embed(enc_inputs)                                 #BxLXH <- BxL
        dec_embeddings = self.embed(dec_outputs)                                #BXLXH <- BxL
        
        self.psize('enc_embeddings', enc_embeddings)
        self.psize('dec_embeddings', dec_embeddings)
        
        hidden = cell_state = self.initial_state()                              #BxH
        for i in range(enc_embeddings.size()[1]):
            hidden, cell_state = self.encode(enc_embeddings[:,i], (hidden, cell_state))
        
        self.psize('hidden', hidden)
        self.psize('cell_state', cell_state)
        
        predicted_outputs = []
        first_input = torch.LongTensor([w2i['GO']] * self.batch_size).cuda()
            
        first_input = Variable(first_input)
        self.psize('first_input', first_input)
        first_input = self.embed(first_input)
        self.psize('first_input', first_input)
        hidden, cell_state = self.decode(first_input, (hidden, cell_state))
        predicted_outputs.append(hidden)
        for i in range(dec_embeddings.size()[1] - 1):
            import random
            if random.random():
                hidden, cell_state = self.decode(dec_embeddings[:,i+1], (hidden, cell_state))
            else:
                hidden, cell_state = self.decode(dec_embeddings[:,i+1], (hidden, cell_state))
                
            predicted_outputs.append(hidden)
            
        predicted_outputs = torch.stack(predicted_outputs)
        predicted_outputs = predicted_outputs.view(
                            self.batch_size * dec_outputs.size()[1], 
                            self.hidden_size)
        self.psize('predicted_outputs', predicted_outputs)
        outputs = self.project(predicted_outputs)
        self.psize('outputs', outputs)
        outputs = outputs.view(self.batch_size, dec_outputs.size()[1], self.vocab_size)
        self.psize('outputs', outputs)
        
        outputs = F.log_softmax(outputs)
        #outputs = F.log_softmax(outputs).max(2)[1].squeeze(2)
        self.psize('outputs', outputs)

        return outputs
    
    def predict(self, enc_inputs):
        self.psize('enc_inputs', enc_inputs)

        enc_embeddings = self.embed(enc_inputs)                                 #BxLXH <- BxL
        hidden = cell_state = self.initial_state()                              #BxH
        for i in range(enc_embeddings.size()[1]):
            hidden, cell_state = self.encode(enc_embeddings[:,i], (hidden, cell_state))

        predicted_outputs = []
        first_input = torch.LongTensor([w2i['GO']] * self.batch_size).cuda()
        first_input = Variable(first_input)
        first_input = self.embed(first_input)

        hidden, cell_state = self.decode(first_input, (hidden, cell_state))
        predicted_outputs.append(hidden)
        for i in range(enc_embeddings.size()[1] - 1):
            hidden, cell_state = self.decode(hidden, (hidden, cell_state))
            predicted_outputs.append(hidden)
            
        predicted_outputs = torch.stack(predicted_outputs)
        predicted_outputs = predicted_outputs.view(
                                self.batch_size * enc_inputs.size()[1], 
                                self.hidden_size)
        
        outputs = F.tanh(self.project(predicted_outputs))
        
        outputs = outputs.view(self.batch_size, enc_inputs.size()[1], self.vocab_size)
        outputs = F.log_softmax(outputs).max(2)[1].squeeze(2)
        self.psize('outputs', outputs)

        return outputs

# TRAINING

In [86]:
from pprint import pprint
def train(epochs, model, question_ids, answer_ids):
    model.train()
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.1)
    batches = len(question_ids)//B
    print('number of batches:', batches)
    for epoch in range(epochs+1):
        avg_loss = 0
        for batch in range(batches):
            l, r = batch * B, (batch + 1) * B
            data = Variable(torch.from_numpy(question_ids[l:r]).long().cuda())
            target = Variable(torch.from_numpy(answer_ids[l:r]).long().cuda())
            
            optimizer.zero_grad()
            logits = model(data, target)
            b, l = target.size()
            target = target.view(B * l)
            logits = logits.view(B*l, -1)
            
            loss = F.nll_loss(logits, target)
            loss.backward()
            optimizer.step()
            
            avg_loss += loss.data[0]
            model.printsize = False
            if not batch % 20 and batch:
                print('{}.{} - {} {}'.format(epoch, batch, avg_loss/20, loss.data[0]))
                avg_loss = 0

In [89]:
model = Model(B, vocab_size, enc_hdim, 1).cuda()

In [90]:
train(10, model, idx_q, idx_a)

number of batches: 509
enc_inputs torch.Size([256, 20]) <class 'torch.autograd.variable.Variable'>
dec_outputs torch.Size([256, 20]) <class 'torch.autograd.variable.Variable'>
enc_embeddings torch.Size([256, 20, 250]) <class 'torch.autograd.variable.Variable'>
dec_embeddings torch.Size([256, 20, 250]) <class 'torch.autograd.variable.Variable'>
hidden torch.Size([256, 250]) <class 'torch.autograd.variable.Variable'>
cell_state torch.Size([256, 250]) <class 'torch.autograd.variable.Variable'>
first_input torch.Size([256]) <class 'torch.autograd.variable.Variable'>
first_input torch.Size([256, 250]) <class 'torch.autograd.variable.Variable'>
predicted_outputs torch.Size([5120, 250]) <class 'torch.autograd.variable.Variable'>
outputs torch.Size([5120, 8003]) <class 'torch.autograd.variable.Variable'>
outputs torch.Size([256, 20, 8003]) <class 'torch.autograd.variable.Variable'>
outputs torch.Size([256, 20, 8003]) <class 'torch.autograd.variable.Variable'>
0.20 - 5.825427603721619 5.5465478

KeyboardInterrupt: 

## Test

In [55]:
batch = 0
l, r = batch * B, (batch + 1) * B
test_q, test_a = idx_q[l:r], idx_a[l:r]
model.eval()
test_q = Variable(torch.from_numpy(test_q).long().cuda())
model.printsize = True
model.psize('test_q', test_q)
predictions = model.predict(test_q)

test_q torch.Size([256, 20]) <class 'torch.autograd.variable.Variable'>
enc_inputs torch.Size([256, 20]) <class 'torch.autograd.variable.Variable'>
outputs torch.Size([256, 20]) <class 'torch.autograd.variable.Variable'>


In [64]:
def arr2sent(arr):
    return ' '.join([i2w[item] for item in arr])

In [65]:
arr2sent(predictions[0].cpu().data.numpy()), arr2sent(test_a[0])

('famous state famous famous reveals famous x famous famous famous famous famous code famous url seattle famous code famous reveals',
 'how do you do this _ _ _ _ _ _ _ _ _ _ _ _ _ _ _')