In [195]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torchtext

import tensorflow as tf
import tensorflow_datasets as tfds
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP
from io import BytesIO

import linecache
import sys
import os
import re

## Loading the dataset

In [12]:
# Create the local dataset directory; if it is already there, report it and carry on.
try:
    os.mkdir("./datasets")
except FileExistsError:
    print("Directories already exists")

# Download the natural-language descriptions (all.anno upstream) and save them as ./datasets/all.desc.
# wget -O overwrites the target file on every run of this cell.
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno -O ./datasets/all.desc

# Download the code file (all.code) from the same repository into ./datasets/all.code.
!wget https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code -O ./datasets/all.code

Directories already exists
--2019-10-18 10:03:27--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.anno
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1382085 (1.3M) [text/plain]
Saving to: './datasets/all.desc'


2019-10-18 10:03:28 (3.89 MB/s) - './datasets/all.desc' saved [1382085/1382085]

--2019-10-18 10:03:28--  https://raw.githubusercontent.com/odashi/ase15-django-dataset/master/django/all.code
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.16.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.16.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 906732 (885K) [text/plain]
Saving to: './datasets/all.code'


2019-10-18 10:03:29 (3.61 MB/s) - './datasets/all.code' saved [906732

## Creating a token text encoder
An encoder takes a corpus file and a splitting function and returns an object that can encode a string into token ids and decode ids back into tokens. It can also save its vocabulary to a file and load it back from that file.

In [142]:
text = " append rel_to to string 'ForeignKey, (substitute the result for field_type.)"

def code_split(s):
    """Split a string into Python lexical tokens using the stdlib tokenizer.

    Tokens that are empty or consist only of whitespace (newlines, indents,
    dedents, the end marker) are dropped, as is the leading encoding token
    that ``tokenize`` always emits.

    The tokenizer raises ``TokenError`` when it reaches the end of the input
    with an unbalanced bracket or an unterminated string. Tokenization here is
    best-effort: the tokens produced before the error are returned instead of
    propagating the exception, so one malformed line cannot abort a pass over
    a whole corpus.

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings in source order.
    """
    # Imported locally so the function does not depend on the import cell
    # exposing these two names.
    from tokenize import ENCODING, TokenError

    pieces = []
    try:
        for tok in tokenize(BytesIO(s.encode('utf-8')).readline):
            if tok.type == ENCODING:
                continue  # the 'utf-8' marker token, not part of the text
            if tok.string and not tok.string.isspace():
                pieces.append(tok.string)
    except TokenError:
        # Unbalanced bracket / unterminated string: keep what was read so far.
        pass
    return pieces

# Show the tokens code_split produces for the sample sentence in `text`.
print(code_split(text))

['append', 'rel_to', 'to', 'string', "'", 'ForeignKey', ',', '(', 'substitute', 'the', 'result', 'for', 'field_type', '.', ')']


In [222]:
text = " append rel_to to string 'ForeignKey, (subs_titute the result' for field_type."

def string_split(s):
    """Split text into word, underscore and punctuation tokens, keeping quoted strings whole.

    The input is split on every underscore and every non-word character, and
    those separators are kept as tokens of their own. A run enclosed in
    matching single or double quotes is matched first and returned as one
    token, quotes included. Empty and whitespace-only pieces are discarded.

    A simpler pattern, ``(_|\\W)``, would also chunk the text but would break
    quoted strings apart at their quotes and inner punctuation.

    Args:
        s: The text to split.

    Returns:
        A list of token strings in source order.
    """
    # Raw string: the original non-raw literal relied on the invalid escape
    # sequence "\W", which newer Python versions warn about.
    pieces = re.split(r"""(\'.*?\'|\".*?\"|_|\W)""", s)
    return [piece for piece in pieces if piece and not piece.isspace()]

# Show the tokens string_split produces for the sample sentence in `text`.
print(string_split(text))

['append', 'rel', '_', 'to', 'to', 'string', "'ForeignKey, (subs_titute the result'", 'for', 'field', '_', 'type', '.']


In [220]:
from collections import Counter

# Scratch example: rank distinct items from most to least frequent.
words = ['a', 'b', 'c', 'c', 'c', 'a']
counter = Counter(words)

print(counter.keys())    # the distinct items, same members as set(words)
print(counter.values())  # how often each distinct item occurs

# Pair each count with its item, sort the pairs in descending order and keep only the items.
[item for _, item in sorted(zip(counter.values(), counter.keys()), reverse=True)]

dict_keys(['a', 'b', 'c'])
dict_values([2, 1, 3])


['c', 'a', 'b']

In [105]:
# Preview: run code_split over the first ten lines of the downloaded code file.
with open("./datasets/all.code", "r") as corpus_file:
    preview_lines = corpus_file.readlines()[:10]
    for line in preview_lines:
        print(code_split(line))

 from threading import local

['from', 'threading', 'import', 'local']

  from django . conf import settings

['from', 'django', '.', 'conf', 'import', 'settings']
 from django . core import signals

['from', 'django', '.', 'core', 'import', 'signals']

 from django . core . exceptions import ImproperlyConfigured

['from', 'django', '.', 'core', '.', 'exceptions', 'import', 'ImproperlyConfigured']

 from django . utils . module_loading import import_string

['from', 'django', '.', 'utils', '.', 'module_loading', 'import', 'import_string']

  DEFAULT_CACHE_ALIAS = 'default'

['DEFAULT_CACHE_ALIAS', '=', "'default'"]


In [183]:
class Tokenizer():
    """Bidirectional mapping between tokens and integer ids.

    The vocabulary starts from an optional seed list (special tokens), always
    followed by "<UNK>", and can then be extended from a corpus file or
    replaced by a vocabulary file written with ``save_vocab``.

    Attributes are plain dicts: ``t2id`` maps token -> id and ``id2t`` maps
    id -> token. Ids are assigned consecutively from 0 in insertion order.
    """

    def __init__(self, vocab_size=sys.maxsize, tokenizer=str.split, vocab=None):
        """Create a tokenizer.

        Args:
            vocab_size: Maximum number of corpus tokens considered by
                ``build_vocab_from_corpus``. Seed tokens and "<UNK>" are not
                counted against this limit.
            tokenizer: Callable turning a string into an iterable of tokens.
            vocab: Optional iterable of seed tokens that receive the first
                ids. It is copied, never modified. "<UNK>" is added after the
                seed tokens unless it is already among them.
        """
        self.vocab_size = vocab_size
        self.tokenizer = tokenizer
        self.t2id = {}
        # Copy the seed: a shared default list or the caller's own list must
        # not grow an extra "<UNK>" every time a Tokenizer is constructed.
        seed_tokens = list(vocab) if vocab is not None else []
        seed_tokens.append("<UNK>")
        for tok in seed_tokens:
            self._add_token(tok)
        self._refresh_reverse_map()

    def _add_token(self, tok):
        """Give ``tok`` the next free id unless it already has one."""
        # Re-assigning an existing token would leave a hole in the id range
        # and let the following token reuse the same id.
        if tok not in self.t2id:
            self.t2id[tok] = len(self.t2id)

    def _refresh_reverse_map(self):
        """Rebuild ``id2t`` from the current ``t2id``."""
        self.id2t = {v:k for k,v in self.t2id.items()}

    def encode(self,s):
        """Tokenize ``s`` and return the list of ids, using the "<UNK>" id for unknown tokens.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                "<UNK>" entry (possible after ``load_vocab`` of a file
                without it).
        """
        return [
            self.t2id[tok] if tok in self.t2id else self.t2id["<UNK>"]
            for tok in self.tokenizer(s)
        ]

    def decode(self,arr):
        """Return the list of tokens for the ids in ``arr``.

        Raises:
            KeyError: If an id is not in the vocabulary.
        """
        return [self.id2t[id] for id in arr]

    def build_vocab_from_corpus(self,fp):
        """Extend the vocabulary with the most frequent tokens of a text file.

        Every line of ``fp`` is split with ``self.tokenizer``. Tokens are
        ranked by descending frequency, ties broken by descending token
        order, and the first ``self.vocab_size`` of them are appended to the
        vocabulary. Tokens that already have an id keep it.

        Args:
            fp: Path of the corpus file.
        """
        from collections import Counter
        counter = Counter()
        with open(fp, "r") as corpus_file:
            for line in corpus_file:
                counter.update(self.tokenizer(line))
        # Tokens are unique, so sorting on (count, token) in reverse gives the
        # same order as sorting the (count, token) pairs and reversing.
        ranked = sorted(counter.items(), key=lambda item: (item[1], item[0]), reverse=True)
        for tok, _ in ranked[:self.vocab_size]:
            self._add_token(tok)
        self._refresh_reverse_map()

    def save_vocab(self,fp):
        """Write the vocabulary to ``fp``, one token per line, in id order.

        Args:
            fp: Path of the vocabulary file to create or overwrite.
        """
        with open(fp, "w") as vocab_file:
            for i in range(len(self.id2t)):
                vocab_file.write(self.id2t[i]+"\n")

    def load_vocab(self,fp):
        """Replace the vocabulary with the tokens listed in ``fp``.

        The file holds one token per line; the line order defines the ids.
        The current vocabulary, including "<UNK>", is discarded, so the file
        must contain "<UNK>" for ``encode`` to handle unknown tokens.

        Args:
            fp: Path of a vocabulary file, e.g. one written by ``save_vocab``.
        """
        self.t2id = {}
        with open(fp, "r") as vocab_file:
            for line in vocab_file:
                # Strip only the line terminator: a final line without a
                # trailing newline must not lose its last character.
                self._add_token(line.rstrip("\n"))
        self._refresh_reverse_map()

In [191]:
# NOTE(review): shlex is not used in any cell visible here — confirm whether it is still needed.
import shlex
# Tokenizer for the descriptions, seeded with the special tokens; the constructor adds "<UNK>" after them.
# No tokenizer argument is passed, so the default str.split (whitespace split) is used.
tokenizer_en = Tokenizer(vocab=["<START>", "<END>", "<PAD>"],vocab_size=30000)

In [192]:
# Extend the vocabulary with the most frequent whitespace-separated tokens of the descriptions file.
tokenizer_en.build_vocab_from_corpus("./datasets/all.desc")

In [193]:
# Display the id -> token mapping.
# NOTE(review): this renders the whole dictionary; consider showing only the first entries.
tokenizer_en.id2t

{0: '<START>',
 1: '<END>',
 2: '<PAD>',
 3: '<UNK>',
 4: 'the',
 5: 'with',
 6: 'an',
 7: 'is',
 8: 'for',
 9: 'and',
 10: 'if',
 11: 'of',
 12: 'method',
 13: 'call',
 14: 'to',
 15: 'substitute',
 16: 'argument',
 17: 'string',
 18: 'result',
 19: 'function',
 20: 'return',
 21: 'define',
 22: 'from',
 23: 'arguments:',
 24: 'value',
 25: 'a',
 26: 'set',
 27: 'in',
 28: 'as',
 29: '2',
 30: 'key',
 31: 'it',
 32: 'into',
 33: 'import',
 34: 'class',
 35: 'name',
 36: 'under',
 37: 'default',
 38: 'dictionary',
 39: 'boolean',
 40: 'self',
 41: 'not',
 42: 'space.',
 43: 'result.',
 44: 'list',
 45: 'integer',
 46: 'true,',
 47: 'every',
 48: 'empty',
 49: 'arguments',
 50: 'base',
 51: 'not,',
 52: 'exception',
 53: '3',
 54: 'instance',
 55: 'None,',
 56: 'element',
 57: 'unpacked',
 58: 'self.',
 59: 'append',
 60: 'self,',
 61: 'raise',
 62: 'None.',
 63: 'or',
 64: 'class.',
 65: 'derive',
 66: 'dictionary.',
 67: 'try,',
 68: 'get',
 69: "'%s'",
 70: 'by',
 71: 'caught,',
 72:

In [188]:
# NOTE(review): `t` is not defined in any cell visible here (the tokenizer built in this notebook is
# `tokenizer_en`) — confirm which object this is meant to save before relying on a fresh-kernel run.
t.save_vocab("./t_vocab.txt")