In [1]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from torch.autograd import Variable
import os
import torch.optim as optim
import pickle
import sys
import pdb

# Pandas 5 years data 

In [2]:
# Read the raw training corpus into memory, one entry per line (newlines kept).
# The context manager guarantees the handle is closed even if reading fails.
with open('/home/anushap/Code-Generation/nmt_model/data/nl2code/code_train.txt', 'r') as fp:
    raw_data = fp.readlines()

In [3]:
import py_utils as utils

# Create dictionaries

In [None]:
# Collect the set of distinct tokens that appear in the corpus.
# Lines the tokenizer cannot handle are skipped (best effort), but only
# ordinary exceptions are swallowed: a bare `except:` would also hide
# KeyboardInterrupt and make the loop impossible to interrupt.
tokenize_code = utils.tokenize_code  # resolved once so a missing module fails loudly
vocabulary = set()
for line in raw_data:
    try:
        words = set(tokenize_code(line, mode='canonicalize'))
    except Exception:
        continue
    # In-place update instead of rebuilding the whole set via union() each line.
    vocabulary |= words

In [None]:
# Assign each vocabulary entry a consecutive integer id (in iteration order)
# and build the reverse lookup from id back to entry.
char2id = {char: i for i, char in enumerate(vocabulary)}
id2char = {i: char for char, i in char2id.items()}

# Use the existing vocabulary for consistency

In [4]:
import sys
# Extend the module search path with the nmt_model project directory so that
# later imports can resolve modules located there (presumably `vocab` — confirm).
sys.path.append('/home/anushap/Code-Generation/nmt_model')

In [5]:
from docopt import docopt
from vocab import Vocab, VocabEntry

In [6]:
# Load the pre-built vocabulary object; it replaces any `vocabulary` built earlier.
# NOTE(security): pickle.load can execute arbitrary code — only load trusted files.
# The context manager closes the file handle, which the one-liner form leaked.
with open('/home/anushap/Code-Generation/nmt_model/data/nl2code/vocab.bin', 'rb') as vocab_file:
    vocabulary = pickle.load(vocab_file)

# Data Loading

In [7]:
def sample_gumbel(shape, eps=1e-10, out=None):
    """
    Draw a tensor of Gumbel(0, 1) noise with the given shape.

    Uniform samples U are transformed as -log(eps - log(U + eps)); `eps`
    keeps both logarithms away from log(0).

    If `out` is supplied it is resized in place to `shape` and filled with
    the uniform samples (so the noise inherits its type); otherwise a fresh
    tensor from torch.rand is used.

    Based on
    https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb ,
    (MIT license)
    """
    if out is None:
        uniform = torch.rand(shape)
    else:
        uniform = out.resize_(shape).uniform_()
    inner_log = torch.log(uniform + eps)
    return -torch.log(eps - inner_log)

In [9]:
def preprocess_inputs(data):
    """
    Encode an iterable of text lines as one flat array of token ids.

    Each line is tokenized with utils.tokenize_code and every token is looked
    up in vocabulary.tgt.word2id; tokens missing from the mapping get the
    fallback id 3. Ids of all lines are concatenated in order, so line
    boundaries are not preserved.

    Lines whose tokenization or lookup raises an ordinary exception are
    skipped (best effort).

    Returns:
        np.ndarray: 1-D array of ids (empty if no line could be encoded).
    """
    # Fallback id for out-of-vocabulary tokens.
    # NOTE(review): presumably the <unk> index of the vocabulary — confirm against vocab.py.
    unk_id = 3

    # Resolve the globals once, outside the try block: previously a missing
    # `vocabulary`/`utils` was swallowed by the bare except on every line and
    # the function silently returned an empty array.
    word2id = vocabulary.tgt.word2id
    tokenize_code = utils.tokenize_code

    converted_data = []
    for line in data:
        try:
            x = [word2id.get(word, unk_id) for word in tokenize_code(line)]
        except Exception:
            # Skip unparseable lines, but let KeyboardInterrupt/SystemExit through.
            continue
        converted_data.extend(x)
    return np.array(converted_data)

In [10]:
# Encode the whole corpus into one flat array of token ids.
# NOTE(review): `total_data` is not referenced again in the visible cells — confirm it is still needed.
total_data = preprocess_inputs(raw_data)

In [11]:
class TextDataLoader(DataLoader):
    """
    Iterates over a text corpus as variable-length language-model batches.

    On every iteration the lines in `data` are encoded with preprocess_inputs
    into one flat id stream, which is cut into `batch_size` contiguous rows.
    The stream is then served in chunks of random length; each chunk is a pair
    (inputs, targets) of arrays shaped (chunk_len, batch_size), where targets
    are the inputs shifted by one position in the stream.

    Args:
        data: iterable of raw text lines.
        batch_size: number of parallel streams per batch.
        min_len, max_len: inclusive bounds of the random chunk length
            (defaults reproduce the previous hard-coded 40..60).

    NOTE(review): DataLoader.__init__ is never invoked, so inherited
    DataLoader behaviour other than the overridden __iter__ should not be
    relied upon — confirm whether subclassing DataLoader is needed at all.
    """

    def __init__(self, data, batch_size=1, min_len=40, max_len=60):
        self.data = data
        self.batch_size = batch_size
        self.min_len = min_len
        self.max_len = max_len

    def __iter__(self):
        data = preprocess_inputs(self.data)
        n = len(data) - 1            # number of (input, target) pairs available
        m = n // self.batch_size     # time steps per stream
        if m <= 0:
            # Not enough tokens to fill even one step of every stream.
            return

        # Trim so the stream divides evenly; the extra +1 token is the last target.
        data = data[:self.batch_size * m + 1]
        inputs = data[:-1].reshape((self.batch_size, m)).T
        targets = data[1:].reshape((self.batch_size, m)).T

        pos = 0
        # Bound the loop by m (steps per stream), the same quantity the break
        # below tests; the old condition compared against the total token count.
        while pos < m:
            # randint's upper bound is exclusive, hence +1 to keep max_len
            # reachable as with the deprecated np.random.random_integers.
            l = np.random.randint(self.min_len, self.max_len + 1)
            if pos + l >= m:
                # The trailing chunk that would reach the end is dropped.
                break

            yield inputs[pos:pos + l], targets[pos:pos + l]
            pos += l


In [12]:
# Shuffle the corpus lines, then split them 70% / 30% into train and dev sets.
# Note that `raw_data` is rebound to the shuffled NumPy array.
N = len(raw_data)
permute = np.random.permutation(N)
raw_data = np.array(raw_data)[permute]

idx = int(0.7 * len(raw_data))  # index of the first dev example
train_data, dev_data = raw_data[:idx], raw_data[idx:]

In [13]:
# Batch iterator over the training split: 32 parallel token streams per batch.
train_loader = TextDataLoader(train_data, batch_size=32)

In [14]:
# Number of entries in the target-side word -> id mapping; used below both as
# the embedding table size and as the output layer width.
vocabsize = len(vocabulary.tgt.word2id)

# Model 

In [15]:
class Net(nn.Module):
    """
    Three-layer LSTM language model with Gumbel noise added to its output scores.

    Args:
        embedding_dim: size of the token embeddings.
        hidden_dim: hidden size shared by the three LSTM layers.
        vocab_size: number of rows in the embedding table.
        tagset_size: width of the output layer (number of scored classes).
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(Net, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim)
        self.lstm3 = nn.LSTM(hidden_dim, hidden_dim)
        self.hidden2word = nn.Linear(hidden_dim, tagset_size)

    def _noisy_scores(self, lstm_out):
        """Project LSTM outputs to class scores and perturb them with Gumbel(0, 1) noise."""
        scores = self.hidden2word(lstm_out)
        gumbel = Variable(sample_gumbel(shape=scores.size(), out=scores.data.new()))
        return scores + gumbel

    def forward(self, sentence, forward):
        """
        Score a batch of sequences and optionally generate a continuation.

        Args:
            sentence: LongTensor of token ids, shaped (seq_len, batch) — the
                LSTMs are built without batch_first, so time is dimension 0.
            forward: number of extra tokens to generate after the input.

        Returns:
            forward == 0: noisy scores shaped (seq_len, batch, tagset_size).
            forward > 0:  argmax token ids shaped (seq_len + forward, batch),
                covering the input positions followed by the generated ones.
        """
        embeds = self.word_embeddings(sentence)
        lstm_out, state1 = self.lstm1(embeds)
        lstm_out, state2 = self.lstm2(lstm_out)
        lstm_out, state3 = self.lstm3(lstm_out)
        logits = self._noisy_scores(lstm_out)

        if forward > 0:
            outputs = []
            # Most likely token at the last input position, shaped (1, batch).
            token = torch.max(logits[-1:, :, :], dim=2)[1]

            for _ in range(forward):
                step = self.word_embeddings(token)
                # Run the full three-layer stack and carry each layer's
                # (h, c) state forward. Previously only lstm1 was applied,
                # from a zero state, and its output was fed straight into
                # hidden2word — a different function from the trained model.
                step, state1 = self.lstm1(step, state1)
                step, state2 = self.lstm2(step, state2)
                step, state3 = self.lstm3(step, state3)
                step_scores = self._noisy_scores(step)
                outputs.append(step_scores)
                # Greedy choice feeds the next step.
                token = torch.max(step_scores, dim=2)[1]

            logits = torch.cat([logits] + outputs, dim=0)
            logits = torch.max(logits, dim=2)[1]
        return logits


In [19]:
# Build the language model (64-d embeddings, 256-d LSTM state; input and
# output vocabularies are the same size), its loss and its optimizer.
model = Net(
    embedding_dim=64,
    hidden_dim=256,
    vocab_size=vocabsize,
    tagset_size=vocabsize,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [20]:
# Move the model's parameters and buffers to the GPU (requires a CUDA device).
# As the last expression of the cell, the returned module is displayed.
model.cuda()

Net(
  (word_embeddings): Embedding(10002, 64)
  (lstm1): LSTM(64, 256)
  (lstm2): LSTM(256, 256)
  (lstm3): LSTM(256, 256)
  (hidden2word): Linear(in_features=256, out_features=10002, bias=True)
)

In [None]:
# Train for 10 passes over the training loader with next-token cross-entropy.
for epoch in range(10):
    print ('Epoch num ', epoch)
    for i, data in enumerate(train_loader):
        inputs, targets = data
        inputs = Variable(torch.LongTensor(inputs).cuda())
        targets = Variable(torch.LongTensor(targets).cuda())

        model.zero_grad()

        # forward=0: per-position scores, no generation.
        word_scores = model(inputs, 0)

        # Flatten the two leading dimensions so every position is one
        # classification example for CrossEntropyLoss.
        steps, batch, classes = word_scores.shape
        word_scores = word_scores.view(steps * batch, classes)
        targets = targets.contiguous().view(-1)

        loss = criterion(word_scores, targets)
        if i % 20 == 0:
            print('Loss is ', loss.data.cpu().numpy())

        loss.backward()
        optimizer.step()

# Evaluation 

In [22]:
# Batch iterator over the held-out dev split, same batch size as training.
dev_loader = TextDataLoader(dev_data, batch_size=32)

In [24]:
# Switch the model to evaluation mode (eval() is shorthand for train(False)).
model.eval()

Net(
  (word_embeddings): Embedding(10002, 64)
  (lstm1): LSTM(64, 256)
  (lstm2): LSTM(256, 256)
  (lstm3): LSTM(256, 256)
  (hidden2word): Linear(in_features=256, out_features=10002, bias=True)
)

In [25]:
# Generate a fixed-length continuation for every dev batch.
# final_ip collects the prompts, final_op the generated token ids; both are
# lists of NumPy arrays with the batch dimension first.
N_GENERATE = 20  # tokens generated past the end of each prompt

final_ip = []
final_op = []
# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    for i, data in enumerate(dev_loader):
        inputs, targets = data  # targets are not needed for generation
        final_ip.append(inputs.transpose())
        inputs = Variable(torch.LongTensor(inputs).cuda())

        # With forward > 0 the model returns token ids for the prompt
        # positions followed by the generated positions, time-major.
        word_scores = model(inputs, N_GENERATE)
        outputs = word_scores.transpose(0, 1)

        # Keep only the generated tail.
        final_op.append(outputs[:, -N_GENERATE:].cpu().data.numpy())
        



In [26]:
# Sanity check: shapes of the first prompt batch and of its generated continuation.
final_ip[0].shape, final_op[0].shape

((32, 54), (32, 20))

In [None]:
# Write one line per dev example: the prompt tokens followed by the generated
# continuation.
#
# Ids were produced with vocabulary.tgt.word2id, so decoding must use the
# inverse of that same mapping; the earlier `id2char` table was built from an
# unrelated enumeration of a token set and does not correspond to these ids.
id2word = {idx: word for word, idx in vocabulary.tgt.word2id.items()}

# The context manager flushes and closes the file, which was previously never closed.
with open('dummy_pandas_train_word_latest.txt', 'w') as fp:
    for i, item in enumerate(final_ip):
        for j, line in enumerate(item):
            prompt_tokens = [id2word[int(x)] for x in line]
            # The generated tokens were decoded before but never written:
            # the prompt was written twice instead.
            generated_tokens = [id2word[int(x)] for x in final_op[i][j]]
            # Tokens are whole words, so separate them with spaces to keep
            # token boundaries recoverable.
            fp.write(' '.join(prompt_tokens + generated_tokens))
            fp.write('\n')

In [None]:
# Number of dev batches that were processed.
len(final_ip)