In [195]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torchtext

import tensorflow as tf
import tensorflow_datasets as tfds
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP
from io import BytesIO

import linecache
import sys
import os
import re

## Loading the dataset

In [224]:
# Create the local dataset directory. os.mkdir raises FileExistsError when the
# path already exists; that case is reported with a message so the cell can be
# re-run without failing.
try:
    os.mkdir("./datasets")
except FileExistsError:
    print("Directories already exists")

# getting descriptions
# (IPython shell escape: the upstream all.anno file is written to ./datasets/all.desc)
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno -O ./datasets/all.desc

# getting code
# (IPython shell escape: the upstream all.code file is written to ./datasets/all.code)
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code -O ./datasets/all.code

Directories already exists
--2019-10-18 13:17:39--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1382085 (1.3M) [text/plain]
Saving to: './datasets/all.desc'


2019-10-18 13:17:40 (5.40 MB/s) - './datasets/all.desc' saved [1382085/1382085]

--2019-10-18 13:17:40--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 906732 (885K) [text/plain]
Saving to: './datasets/all.code'


2019-10-18 13:17:41 (4.35 MB/s) - './datasets/all.code' saved [906732/906732]

## Creating a token text encoder
An encoder takes a corpus file and a splitting function and returns an object that can encode a string into token ids and decode ids back into tokens. It can also save its vocabulary to a file and load it back from that file.

In [142]:
text = " append rel_to to string 'ForeignKey, (substitute the result for field_type.)"

# Note: code_split needs parentheses to be matched within the same string; otherwise the tokenizer raises an error.
def code_split(s):
    """Split ``s`` into lexical tokens using Python's own ``tokenize`` module.

    The string is encoded as UTF-8 and fed to ``tokenize``. Tokens whose text
    is empty, a bare newline, or whitespace only are discarded, and the first
    remaining entry (the encoding marker that ``tokenize`` emits first) is
    dropped.

    Parameters:
        s: the source text to split.

    Returns:
        A list with the text of every remaining token, in order.

    ``tokenize`` may raise on input that is not lexically complete
    (for example unmatched brackets).
    """
    readline = BytesIO(s.encode('utf-8')).readline
    pieces = []
    for token in tokenize(readline):
        value = token.string
        # Skip structural tokens that carry no visible text.
        if value == '' or value == "\n" or value.isspace():
            continue
        pieces.append(value)
    # pieces[0] is the encoding marker, not part of the input text.
    return pieces[1:]

print(code_split(text))

['append', 'rel_to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'substitute', 'the', 'result', 'for', 'field_type', '.', ')']


In [277]:
text = " append rel_to to string 'ForeignKey, (subs__titute the result' for field_type."

def string_split(s):
    """Split ``s`` into word, underscore and punctuation tokens.

    The text is cut at every underscore and every non-word character. The
    delimiters themselves are kept as tokens (the pattern uses a capture
    group), while empty strings and whitespace-only pieces are dropped.

    Parameters:
        s: the text to split.

    Returns:
        A list of non-empty, non-whitespace token strings, in order.
    """
    # Raw string: '\W' inside a normal literal is an invalid escape sequence
    # and triggers a warning on current Python versions. The pattern itself is
    # unchanged.
    # this will chunk all code properly but splits strings with quotes
    pieces = re.split(r'(_|\W)', s)
    # Alternative pattern that keeps quoted strings intact:
    #     re.split(r'(\'.*?\'|\".*?\"|_|\W)', s)
    # "\n" is whitespace, so the isspace() test also covers bare newlines.
    return [piece for piece in pieces if piece and not piece.isspace()]

print(string_split(text))

['append', 'rel', '_', 'to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'subs', '_', '_', 'titute', 'the', 'result', "'", 'for', 'field', '_', 'type', '.']


In [274]:
class Tokenizer():
    """Maps text to integer token ids and back using a fixed vocabulary.

    Attributes:
        vocab_size: maximum number of corpus tokens taken by
            ``build_vocab_from_corpus`` (the most frequent ones).
            NOTE: this instance attribute shadows the ``vocab_size`` method
            below, so the method cannot be called on an instance; use
            ``len(tokenizer)`` to get the current vocabulary size.
        tokenizer: callable that splits a string into a list of tokens.
        t2id: dict mapping token -> id.
        id2t: dict mapping id -> token (inverse of ``t2id``).

    Ids are always contiguous (0 .. len-1): a token that is already known
    keeps its id and is never re-registered.
    """

    def __init__(self, vocab_size=sys.maxsize, tokenizer=str.split, vocab=None):
        """Create a tokenizer seeded with ``vocab`` plus the ``"<UNK>"`` token.

        Parameters:
            vocab_size: cap on the number of corpus tokens added by
                ``build_vocab_from_corpus``.
            tokenizer: function splitting a string into tokens.
            vocab: optional iterable of initial tokens; ids are assigned in
                order. ``"<UNK>"`` is appended if it is not already present.
                The caller's object is not modified.
        """
        self.vocab_size = vocab_size
        self.tokenizer = tokenizer
        self.t2id = {}
        # Work on a copy: a shared default list (or the caller's list) must not
        # accumulate an extra "<UNK>" every time a Tokenizer is created.
        initial = list(vocab) if vocab is not None else []
        if "<UNK>" not in initial:
            initial.append("<UNK>")
        for tok in initial:
            self._add_token(tok)
        self._rebuild_reverse_index()

    def _add_token(self, tok):
        """Register ``tok`` with the next free id unless it is already known."""
        if tok not in self.t2id:
            self.t2id[tok] = len(self.t2id)

    def _rebuild_reverse_index(self):
        """Recompute ``id2t`` as the inverse of ``t2id``."""
        self.id2t = {v: k for k, v in self.t2id.items()}

    def __len__(self):
        """Return the number of tokens currently in the vocabulary."""
        return len(self.id2t)

    def encode(self, s):
        """Tokenize ``s`` and return the list of token ids.

        Tokens missing from the vocabulary map to the id of ``"<UNK>"``
        (KeyError if the vocabulary has no ``"<UNK>"`` entry).
        """
        ids = []
        for tok in self.tokenizer(s):
            if tok in self.t2id:
                ids.append(self.t2id[tok])
            else:
                ids.append(self.t2id["<UNK>"])
        return ids

    def decode(self, arr):
        """Return the tokens for the ids in ``arr`` (KeyError on unknown id)."""
        return [self.id2t[token_id] for token_id in arr]

    def vocab_size(self):
        """Return the number of tokens in the vocabulary.

        Shadowed on instances by the ``vocab_size`` attribute set in
        ``__init__``; reachable as ``Tokenizer.vocab_size(obj)`` or, more
        simply, ``len(obj)``.
        """
        return len(self.id2t)

    def build_vocab_from_corpus(self, fp):
        """Extend the vocabulary with the most frequent tokens of a text file.

        Every line of ``fp`` is tokenized; tokens are ranked by descending
        (count, token) and the first ``self.vocab_size`` of them are added.
        Tokens already in the vocabulary keep their existing id.
        """
        from collections import Counter
        counter = Counter()
        with open(fp, "r") as corpus_file:
            for line in corpus_file:
                counter.update(self.tokenizer(line))
        # Same order as sorting (count, token) pairs ascending and reversing.
        ranked = sorted(counter.items(), key=lambda item: (item[1], item[0]), reverse=True)
        for tok, _ in ranked[:self.vocab_size]:
            self._add_token(tok)
        self._rebuild_reverse_index()

    def save_vocab(self, fp):
        """Write the vocabulary to ``fp``, one token per line, in id order."""
        with open(fp, "w") as vocab_file:
            for i in range(len(self.id2t)):
                vocab_file.write(self.id2t[i] + "\n")

    def load_vocab(self, fp):
        """Replace the vocabulary with the tokens listed in ``fp``.

        The line number (0-based) becomes the token id, matching the layout
        written by ``save_vocab``.
        """
        self.t2id = {}
        with open(fp, "r") as vocab_file:
            for line in vocab_file:
                # Strip only the line terminator; slicing off the last character
                # would truncate the final token of a file without a trailing
                # newline.
                self._add_token(line.rstrip("\n"))
        self._rebuild_reverse_index()

In [275]:
# One tokenizer per side of the dataset (descriptions and code). Each is seeded
# with the special tokens <START>, <END>, <PAD>; the Tokenizer constructor adds
# <UNK> itself. Each call gets its own list literal on purpose.
tokenizer_en = Tokenizer(vocab=["<START>", "<END>", "<PAD>"], tokenizer=string_split)
tokenizer_code = Tokenizer(vocab=["<START>", "<END>", "<PAD>"], tokenizer=string_split) # string_split turns out to work well for code too

# Extend each vocabulary with the tokens found in the downloaded corpus files.
tokenizer_en.build_vocab_from_corpus("./datasets/all.desc")
tokenizer_code.build_vocab_from_corpus("./datasets/all.code")

# Persist both vocabularies, one token per line in id order.
tokenizer_en.save_vocab("./desc_vocab.txt")
tokenizer_code.save_vocab("./code_vocab.txt")

In [285]:
tokenizer_en.encode("make an array with 10 random numbers")

[315, 12, 3, 9, 509, 894, 2183]

In [286]:
tokenizer_en.decode([214, 11, 34, 1864, 5, 3])

['this', 'is', 'a', 'te', '_', '<UNK>']