In [1]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torchtext
from torchtext.data import Field, BucketIterator

import tensorflow as tf
import tensorflow_datasets as tfds
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP
from io import BytesIO

import linecache
import sys
import os
import re
import random

## Loading the dataset

In [3]:
try:
    os.mkdir("./datasets")
except FileExistsError:
    print("Directories already exists")

# getting descriptions
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno -O ./datasets/all.desc

# getting code
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code -O ./datasets/all.code

Directories already exists
--2019-10-21 09:39:50--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1382085 (1.3M) [text/plain]
Saving to: './datasets/all.desc'


2019-10-21 09:39:51 (5.00 MB/s) - './datasets/all.desc' saved [1382085/1382085]

--2019-10-21 09:39:51--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 906732 (885K) [text/plain]
Saving to: './datasets/all.code'


2019-10-21 09:39:52 (4.02 MB/s) - './datasets/all.code' saved [906732

## Creating a token text encoder
An encoder will take a file and a splitting function and return an object able to encode and decode a string. It will also be able to save a vocab file and retrieve from file.

In [4]:
text = " append rel_to to string 'ForeignKey, (substitute the result for field_type.)"

# looks like code split need parenthesis to be matched in the same string, if not it gives an error...
def code_split(s):
    return [x.string for x in tokenize(BytesIO(s.encode('utf-8')).readline) if x.string != '' and x.string != "\n" and not x.string.isspace()][1:]

print(code_split(text))

['append', 'rel_to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'substitute', 'the', 'result', 'for', 'field_type', '.', ')']


In [5]:
text = " append rel_to to string 'ForeignKey, (subs__titute the result' for field_type."

def string_split(s):
    return list(filter(lambda x: x != '' and x != "\n" and not x.isspace(), re.split('(_|\W)', s))) # this will chunk all code properly by plits strings with quotes
#     return list(filter(lambda x: x != '' and x != "\n" and not x.isspace(), re.split('(\\\'.*?\\\'|\\\".*?\\\"|_|\W)', s))) # this keeps the strings intact

print(string_split(text))

['append', 'rel', '_', 'to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'subs', '_', '_', 'titute', 'the', 'result', "'", 'for', 'field', '_', 'type', '.']


In [6]:
class Tokenizer():
    def __init__(self, vocab_size=sys.maxsize, tokenizer=str.split, vocab=[]):
        self.vocab_size = vocab_size
        self.tokenizer = tokenizer
        self.t2id = {}
        vocab.append("<UNK>")
        for tok in vocab:
            self.t2id[tok] = len(self.t2id)
        self.id2t = {v:k for k,v in self.t2id.items()}
    
    def encode(self,s):
        ids = []
        for tok in self.tokenizer(s):
            if tok in self.t2id:
                ids.append(self.t2id[tok])
            else:
                ids.append(self.t2id["<UNK>"])
        return ids
        
    def decode(self,arr):
        return [self.id2t[id] for id in arr]
    
    def vocab_size(self):
        return len(self.id2t)
        
    def build_vocab_from_corpus(self,fp):
        from collections import Counter
        all_toks = []
        with open(fp, "r") as corpus_file:
            for line in corpus_file.readlines():
                for tok in self.tokenizer(line):
                    all_toks.append(tok)
        counter = Counter(all_toks)
        unique_toks = [x for _,x in sorted(zip(counter.values(),counter.keys()))][::-1][:self.vocab_size]
        for tok in unique_toks:
            self.t2id[tok] = len(self.t2id)
        self.id2t = {v:k for k,v in self.t2id.items()}
        
                
        
    def save_vocab(self,fp):
        with open(fp, "w") as vocab_file:
            for i in range(len(self.id2t)):
                vocab_file.write(self.id2t[i]+"\n")
            
        
    def load_vocab(self,fp):
        self.t2id = {}
        with open(fp, "r") as vocab_file:
            for line in vocab_file.readlines():
                tok = line[:-1]
                self.t2id[tok] = len(self.t2id)
        self.id2t = {v:k for k,v in self.t2id.items()}

In [7]:
tokenizer_en = Tokenizer(vocab=["<PAD>", "<START>", "<END>"], tokenizer=string_split)
tokenizer_code = Tokenizer(vocab=["<PAD>", "<START>", "<END>"], tokenizer=string_split) # turns out this works well for code too

tokenizer_en.build_vocab_from_corpus("./datasets/all.desc")
tokenizer_code.build_vocab_from_corpus("./datasets/all.code")

tokenizer_en.save_vocab("./desc_vocab.txt")
tokenizer_code.save_vocab("./code_vocab.txt")

In [8]:
tokenizer_en.encode("make a list with 10 random numbers")

[315, 34, 45, 9, 509, 894, 2183]

In [9]:
tokenizer_en.decode([315, 34, 45, 9, 509, 894, 2183])

['make', 'a', 'list', 'with', '10', 'random', 'numbers']

## Making the input pipeline

In [10]:
def corpus_to_array(src_fp, tgt_fp):
    lines = []
    with open(src_fp, "r") as src_file, open(tgt_fp, "r") as tgt_file:
        for src, tgt in zip(src_file, tgt_file):
            lines.append((src, tgt))
    return lines

In [11]:
def file_to_array(fp):
    lines = []
    with open(fp, "r") as file:
        for line in file:
            lines.append(line)
    return lines

In [12]:
data = corpus_to_array("datasets/all.desc", "datasets/all.code")
random.shuffle(data)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [14]:
def batch(data, batch_size):
    return [pad_to_dense(list(t)) for t in zip(*[iter(data)]*batch_size)]

In [15]:
SRC = Field(tokenize = string_split,
            tokenizer_language="desc",
            init_token = '<sos>',
            eos_token = '<eos>')

TGT = Field(tokenize = string_split,
            tokenizer_language="code",
            init_token = '<sos>',
            eos_token = '<eos>')
fields = [
    ("desc",SRC),
    ("code",TGT)
]

src_txt = torchtext.data.Example.fromlist(file_to_array("datasets/all.desc"))
tgt_txt = torchtext.data.Example.fromlist(file_to_array("datasets/all.code"))

dataset = torchtext.data.Dataset(data, fields)
train_data, valid_data = dataset.split()

SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

TypeError: fromlist() missing 1 required positional argument: 'fields'

In [16]:
train_src = [tokenizer_en.encode(x[0]) for x in data[:16000]]
train_tgt = [tokenizer_code.encode(x[1]) for x in data[:16000]]

test_src = [tokenizer_en.encode(x[0]) for x in data[16000:17000]]
test_tgt = [tokenizer_code.encode(x[1]) for x in data[16000:17000]]
             
val_src = [tokenizer_en.encode(x[0]) for x in data[17000:]]
val_tgt = [tokenizer_code.encode(x[1]) for x in data[17000:]]

# batch_size = 2

# train_src = batch(train_src,batch_size)
# train_tgt = batch(train_tgt,batch_size)

# test_src = batch(test_src,batch_size)
# test_tgt = batch(test_tgt,batch_size)
          
# val_src =  batch(val_src,batch_size)
# val_tgt =  batch(val_tgt,batch_size)

In [120]:
def samples_to_dataset(samples, src_field, tgt_field):
    """
    Args:
        samples: [(src_string),(tgt_string)]
        src/tgt_tokenizer: a func that takes a string and returns an array of strings
    """
    examples = []
    
    for sample in samples:
        src_string, tgt_string = sample
        examples.append(torchtext.data.Example.fromdict({"src":src_string, "tgt":tgt_string}, 
                                        fields={"src":("src",src_field), "tgt":("tgt",tgt_field)}))
        
    dataset = torchtext.data.Dataset(examples,fields={"src":src_field, "tgt":tgt_field})
    return dataset

In [136]:
SRC_TEXT = Field(sequential=True, tokenize=string_split, init_token='<sos>',eos_token='<eos>', lower=False)
TGT_TEXT = Field(sequential=True, init_token='<sos>',eos_token='<eos>')

dataset = samples_to_dataset(data, SRC_TEXT, TGT_TEXT)

train_dataset, val_dataset = dataset.split([0.7,0.3])

In [137]:
SRC_TEXT.build_vocab(train_dataset)
TGT_TEXT.build_vocab(train_dataset)


sample = dataset[0].src
for tok, id in zip(sample, SRC_TEXT.numericalize([sample], device)):
    print("{} -> {}".format(tok, id.numpy()[0]))

_ -> 5
response -> 177
_ -> 5
middleware -> 325
is -> 11
an -> 12
empty -> 54
list -> 45
. -> 4


In [140]:
train_iterator, valid_iterator = BucketIterator.splits(
    (train_dataset, val_dataset),
    batch_size = BATCH_SIZE,
    device = device)

for i, batch in enumerate(train_iterator):
    print(batch)


[torchtext.data.batch.Batch of size 128]
	[.src]:[torch.LongTensor of size 112x128]
	[.tgt]:[torch.LongTensor of size 28x128]

[torchtext.data.batch.Batch of size 128]
	[.src]:[torch.LongTensor of size 67x128]
	[.tgt]:[torch.LongTensor of size 35x128]

[torchtext.data.batch.Batch of size 128]
	[.src]:[torch.LongTensor of size 229x128]
	[.tgt]:[torch.LongTensor of size 89x128]

[torchtext.data.batch.Batch of size 128]
	[.src]:[torch.LongTensor of size 51x128]
	[.tgt]:[torch.LongTensor of size 53x128]

[torchtext.data.batch.Batch of size 128]
	[.src]:[torch.LongTensor of size 62x128]
	[.tgt]:[torch.LongTensor of size 73x128]

[torchtext.data.batch.Batch of size 128]
	[.src]:[torch.LongTensor of size 175x128]
	[.tgt]:[torch.LongTensor of size 71x128]

[torchtext.data.batch.Batch of size 128]
	[.src]:[torch.LongTensor of size 60x128]
	[.tgt]:[torch.LongTensor of size 70x128]

[torchtext.data.batch.Batch of size 128]
	[.src]:[torch.LongTensor of size 90x128]
	[.tgt]:[torch.LongTensor of si

In [337]:
def pad_to_dense(M):
    """Appends the minimal required amount of zeroes at the end of each 
     array in the jagged array `M`, such that `M` looses its jagedness."""

    maxlen = max(len(r) for r in M)

    Z = np.full((len(M), maxlen), 0)
    for enu, row in enumerate(M):
        Z[enu, :len(row)] += row 
    return Z

In [338]:
a = [[1,2,3],[1]]
pad_to_dense(a)

array([[1, 2, 3],
       [1, 0, 0]])

In [325]:
def chunk(it, size, padval=_no_padding):
    if padval == _no_padding:
        it = iter(it)
        sentinel = ()
    else:
        it = chain(iter(it), repeat(padval))
        sentinel = (padval,) * size
    return iter(lambda: tuple(islice(it, size)), sentinel)

NameError: name '_no_padding' is not defined

In [None]:
def batchify(data, batch_size):
    data = TEXT.numericalize([data.examples[0].text])
    # Divide the dataset into bsz parts.
    nbatch = data.size(0) // bsz
    # Trim off any extra elements that wouldn't cleanly fit (remainders).
    data = data.narrow(0, 0, nbatch * bsz)
    # Evenly divide the data across the bsz batches.
    data = data.view(bsz, -1).t().contiguous()
    return data.to(device)

batch_size = 20
eval_batch_size = 10
train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)

In [None]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)