In [1]:
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch
from torch.autograd import Variable
import os
import torch.optim as optim
import pickle
import sys
import pdb

# Pandas dataset by us

In [3]:
with open('/home/anushap/Code-Generation/data_processing/pickles/pandas_5yr_filtered_commented.pkl', 'rb') as fp:
    pandas_data = pickle.load(fp)

In [5]:
pandas_data[0]

(['r= requests.get("https://www.quandl.com/api/v3/datasets/FSE/AFX_X/data.json")\n',
  'json_data = r.json()\n',
  'json_data["dataset_data"]["data"][0]'],
 ['# Now, call the Quandl API and pull out a small sample of the data (only one day) to get a glimpse',
  '# into the JSON structure that will be returned'])

In [12]:
raw_data = []
for item in pandas_data:
    raw_data.extend(item[0])

# Conala data 

In [7]:
#fp = open('../../conala-baseline/conala-corpus/conala-train.snippet', 'r')
#raw_data = fp.readlines()
#fp.close()

In [9]:
import py_utils as utils

# create dictionaries 

In [None]:
vocabulary = set()
for line in raw_data:
    try:
        words = utils.tokenize_code(line, mode='canonicalize')
        pdb.set_trace()
        vocabulary = vocabulary.union(set(words))
    except:
        continue

In [None]:
len(vocabulary)

In [15]:
vocabulary

{'to_drop',
 'clear_dest_dir',
 'null_valls',
 'NBClassifier',
 'df_night',
 'df_restaurants',
 'rt_set',
 'df_all_wells_wKNN_DEPTHtoDEPT_KNN1PredTopMcM__NearTop_CurveF_20180726',
 'images',
 '165000',
 'r_test',
 'checkDataMatchBetweenVars',
 'access_info',
 'convereted_rate_old',
 'complib',
 'ticket_status',
 '#saves German stop words in a list',
 'ddate',
 'tmp_idx',
 'SI6',
 'usr_idx',
 'twitter_archive_clean',
 'files',
 'df_freq_users',
 'df_ll',
 'check_rhum',
 'tweets2',
 '683',
 'rain_stats',
 'gap',
 'mismatch1',
 'trump_tweets',
 'Missoula',
 'file_size',
 'RDDTestScorees',
 'ndvi_of_interest02',
 'dfAnnualMGD',
 'exc_data',
 'sale_prod_sort',
 'df_merge',
 'column',
 'simplefilter',
 'until',
 'cisuabg7',
 'make_corrections',
 'you',
 'dataSeries',
 'nv',
 'Print_hello_class',
 'jupyter',
 'json_data_format_records',
 'regex',
 'rhum_fine',
 'options_frame',
 'top_features',
 'getLogger',
 'df_mk',
 'y_tmp',
 'token_pattern',
 'obo_parser',
 'best_model_lr',
 'words_only_s

In [16]:
char2id = dict()
id2char = dict()
for i, char in enumerate(vocabulary):
    char2id[char] = i
    id2char[i] = char

# Data Loading

In [17]:
def sample_gumbel(shape, eps=1e-10, out=None):
    """
    Sample from Gumbel(0, 1)
    based on
    https://github.com/ericjang/gumbel-softmax/blob/3c8584924603869e90ca74ac20a6a03d99a91ef9/Categorical%20VAE.ipynb ,
    (MIT license)
    """
    U = out.resize_(shape).uniform_() if out is not None else torch.rand(shape)
    return - torch.log(eps - torch.log(U + eps))

In [18]:
def preprocess_inputs(data):
    converted_data = []
    for line in data:
        try:
            x = [char2id[char] for char in utils.tokenize_code(line)]
            converted_data.extend(x)
        except:
            continue
    return np.array(converted_data)

In [19]:
total_data = preprocess_inputs(raw_data)

In [20]:
class TextDataLoader(DataLoader):

    def __init__(self, data, batch_size=1):
        self.data = data
        self.batch_size = batch_size

    def __iter__(self):
        data = preprocess_inputs(self.data)
        n = len(data) - 1
        m = n // self.batch_size
        data = data[:self.batch_size * m + 1]
        inputs = data[:-1].reshape((self.batch_size, m)).T
        targets = data[1:].reshape((self.batch_size, m)).T

        pos = 0

        while n - pos > 0:

            l = np.random.random_integers(40, 60)
            if pos + l >= m:
                break

            yield inputs[pos:pos+l], targets[pos: pos+l]
            pos += l


In [21]:
N = len(raw_data)
permute = np.random.permutation(N)
raw_data = np.array(raw_data)[permute]
idx = int(0.7* len(raw_data))
train_data = raw_data[0:idx]
dev_data = raw_data[idx:]

In [22]:
train_loader = TextDataLoader(train_data, batch_size=32)

In [23]:
vocabsize = len(vocabulary)

# Model 

In [24]:
class Net(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(Net, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim)
        self.lstm3 = nn.LSTM(hidden_dim, hidden_dim)
        self.hidden2word = nn.Linear(hidden_dim, tagset_size)

    def forward(self, sentence, forward):

        embeds = self.word_embeddings(sentence)
        lstm_out, h1 = self.lstm1(embeds)
        lstm_out, h2 = self.lstm2(lstm_out)
        lstm_out, h3 = self.lstm3(lstm_out)
        h = self.hidden2word(lstm_out)

        gumbel = Variable(sample_gumbel(shape=h.size(), out=h.data.new()))
        h += gumbel
        logits = h

        if forward > 0:
            outputs = []
            logits = torch.transpose(logits, 0, 1)
            h = torch.max(logits[:, -1:, :], dim = 2)[1].t()

            for i in range(forward):
                h = self.word_embeddings(h)
                h, _ = self.lstm1(h)
                h = self.hidden2word(h)
                gumbel = Variable(sample_gumbel(shape=h.size(), out=h.data.new()))
                h += gumbel
                outputs.append(h)
                h = torch.max(h, dim=2)[1]

            logits = torch.transpose(logits, 0, 1)
            logits = torch.cat([logits] + outputs, dim=0)
            logits = torch.max(logits, dim=2)[1]
        return logits


In [25]:
model = Net(64, 256, vocabsize, vocabsize)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [26]:
model.cuda()

Net(
  (word_embeddings): Embedding(14745, 64)
  (lstm1): LSTM(64, 256)
  (lstm2): LSTM(256, 256)
  (lstm3): LSTM(256, 256)
  (hidden2word): Linear(in_features=256, out_features=14745, bias=True)
)

In [25]:
optimizer = optim.Adam(model.parameters(), lr=0.00001)
for epoch in range(10):
    for i, data in enumerate(train_loader):
        inputs, targets = data
        inputs, targets = Variable(torch.LongTensor(inputs).cuda()), Variable(torch.LongTensor(targets).cuda())

        model.zero_grad()

        word_scores = model(inputs, 0)

        r, c, h = word_scores.shape
        word_scores = word_scores.view(r * c, h)

        targets = targets.contiguous().view(-1)

        loss = criterion(word_scores, targets)
        print('Loss is ', loss.data.cpu()[0])
        loss.backward()
        optimizer.step()



Loss is  tensor(5.7288)
Loss is  tensor(5.7062)
Loss is  tensor(5.6744)
Loss is  tensor(5.6867)
Loss is  tensor(5.8169)
Loss is  tensor(5.6507)
Loss is  tensor(5.6875)
Loss is  tensor(5.5780)
Loss is  tensor(5.9032)
Loss is  tensor(5.7375)
Loss is  tensor(5.7537)
Loss is  tensor(5.5929)
Loss is  tensor(5.7931)
Loss is  tensor(5.7658)
Loss is  tensor(5.8453)
Loss is  tensor(5.9539)
Loss is  tensor(5.8329)
Loss is  tensor(5.7910)
Loss is  tensor(5.7702)
Loss is  tensor(5.7113)
Loss is  tensor(5.8330)
Loss is  tensor(5.6876)
Loss is  tensor(5.7604)
Loss is  tensor(5.7584)
Loss is  tensor(5.8285)
Loss is  tensor(5.8105)
Loss is  tensor(5.8361)
Loss is  tensor(5.6966)
Loss is  tensor(5.7879)
Loss is  tensor(5.8125)
Loss is  tensor(6.0284)
Loss is  tensor(5.8373)
Loss is  tensor(5.7012)
Loss is  tensor(5.6681)
Loss is  tensor(5.6382)
Loss is  tensor(5.7341)
Loss is  tensor(5.6517)
Loss is  tensor(5.6393)
Loss is  tensor(5.5907)
Loss is  tensor(5.6306)
Loss is  tensor(5.8274)
Loss is  tensor(

# Evaluation 

In [26]:
dev_loader = TextDataLoader(dev_data, batch_size=32)

In [27]:
model.train(False)

Net(
  (word_embeddings): Embedding(10336, 64)
  (lstm1): LSTM(64, 256)
  (lstm2): LSTM(256, 256)
  (lstm3): LSTM(256, 256)
  (hidden2word): Linear(in_features=256, out_features=10336, bias=True)
)

In [28]:
final_ip = []
final_op = []
for epoch in range(1):
    for i, data in enumerate(dev_loader):
        inputs, targets = data
        final_ip.append(inputs.transpose())
        inputs, targets = Variable(torch.LongTensor(inputs).cuda()), Variable(torch.LongTensor(targets).cuda())

        word_scores = model(inputs, 20)
        outputs = word_scores.transpose(0, 1)
        
        final_op.append(outputs[:, -20:].cpu().data.numpy())
        



In [29]:
final_ip[0].shape, final_op[0].shape

((32, 57), (32, 20))

In [30]:
fp = open('dummy_pandas_train_word_latest.txt', 'w')
for i, item in enumerate(final_ip):
    for j, line in enumerate(item):
        chars = [id2char[x] for x in line]
        fp.write(''.join(chars))
        op_chars = [id2char[x] for x in final_op[i][j]]
        fp.write(''.join(chars))
        fp.write('\n')