In [109]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

import re
import javalang

from itertools import chain

In [116]:
df = pd.read_hdf('../data/interim/preprocessed/sequences.h5', key='data')

In [136]:
np.stack(df['inputs'].head(2))

array([[   8,   15,  486,   10,    7,    5,  137, 4043,    4,  140,    3,
        1674,  259,   48,   25,    4,   58,    2,    5,  137, 4043,    4,
         140,  132,    3,    7,   17,  486,    2,    5, 1144,  609, 2062,
           3,  486,    2,    5,    9,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

In [99]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [35]:
modifiers = ['public', 'private', 'protected', 'static']

RE_WORDS = re.compile(r'''
    # Find words in a string. Order matters!
    [A-Z]+(?=[A-Z][a-z]) |  # All upper case before a capitalized word
    [A-Z]?[a-z]+ |  # Capitalized words / all lower case
    [A-Z]+ |  # All upper case
    \d+ | # Numbers
    .+
''', re.VERBOSE)

def split_subtokens(str):
    return [subtok for subtok in RE_WORDS.findall(str) if not subtok == '_']

In [36]:
def tokenize_method(method_body):
    method_content = method_body
    try:
        tokens = list(javalang.tokenizer.tokenize(method_content))
    except:
        print('ERROR in tokenizing: ' + method_content)
        #tokens = method_content.split(' ')
    if len(tokens) > 0:
        return ' '.join([' '.join(split_subtokens(i.value)) for i in tokens if not i.value in modifiers])
    else:
        return ''

In [38]:
tokenize_method('void (String fooBar){System.out.println("hello world");}')

'void ( String foo Bar ) { System . out . println ( "hello world" ) ; }'

In [106]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=1000,
    filters='',
    lower=False,
    oov_token='<OOV>',
)

In [107]:
sequences = [
    ['{', 'printf', '(', '"', 'hello', 'world', '"', ')', ';', '}'],
    ['{', 'fprintf', '(', '"', 'hello', 'dad', '"', ')', ';', '}'],
    ['{', 'vprintf', '(', '"', 'hello', 'mom', '"', ')', ';', '}'],
]

texts = [
    'void ( String foo Bar ) { System . out . println ( " hello world " ) ; }'
]

In [108]:
tokenizer.fit_on_texts(chain.from_iterable(sequences))

In [90]:
tokenizer.num_words = 5

In [95]:
tokenizer.word_index

{'<OOV>': 1,
 '"': 2,
 '{': 3,
 '(': 4,
 'hello': 5,
 ')': 6,
 ';': 7,
 '}': 8,
 'printf': 9,
 'world': 10,
 'fprintf': 11,
 'dad': 12,
 'vprintf': 13,
 'mom': 14}

In [104]:
list(map(lambda l: flatten(l), sequences))
sequences[0]

['{', 'printf', '(', '"', 'hello', 'world', '"', ')', ';', '}']

In [105]:
tokenizer.texts_to_sequences(map(lambda l: ' '.join(l), sequences))

[[3, 9, 4, 2, 5, 10, 2, 6, 7, 8],
 [3, 11, 4, 2, 5, 12, 2, 6, 7, 8],
 [3, 13, 4, 2, 5, 14, 2, 6, 7, 8]]

In [117]:
df = pd.read_hdf('../data/interim/preprocessed/sequences.h5', key='data')

In [128]:
dataset = tf.data.Dataset.from_tensor_slices(np.stack(df['inputs'].values))
dataset

<TensorSliceDataset shapes: (200,), types: tf.int64>

In [149]:
shifted = next(iter(dataset.map(lambda seq: seq[1:])))
tf.concat(axis=0, values=[shifted, [53100]])

<tf.Tensor: shape=(200,), dtype=int64, numpy=
array([61674, 16275, 29012, 40249, 25223,  6673, 15050, 59831, 11591,
       14256, 15050, 39792, 68638, 25223,  6673, 15050, 32537, 11591,
       40249, 33282, 16275, 68638, 25223, 60298, 11591, 16275, 68638,
       25223, 32001, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100, 53100,
       53100, 53100, 53100, 53100, 53100, 53

In [146]:

tf.concat(values=[tf.constant(np.array([512])), [256]], axis=0)

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([512, 256])>

In [129]:
[1, 2, 3, 4, 5][:-1]

[1, 2, 3, 4]

In [78]:
sentences = [
    'We pad the sequece in the post-order where maxlen is max_seq_length.',
    'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.',
]

<TensorSliceDataset shapes: (), types: tf.string>

b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we tr

22

In [114]:
# 1. create Dataset entity
dataset = tf.data.Dataset.from_tensor_slices(sentences * 10)

print(dataset)
print()
for item in dataset:
    print(item.numpy())

# 2. Tokenize
# 3. Build vocabulary

# TODO: swap out the tokenizer with the javalang tokenizer
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor in dataset.take(2):
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

# 4. Encode the text/tokens into numbers

# TODO: this is very bad design by TF, because it includes both tokenization AND encoding
# we already did the tokenization in the last step ...
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

example_text = next(iter(dataset)).numpy()
print(example_text)
encoded_example = encoder.encode(example_text)
print(encoded_example)

for text_tensor in dataset.take(10):
    print(encoder.encode(text_tensor.numpy()))

def encode(text_tensor):
    print('--->', text_tensor.numpy())
    encoded_text = encoder.encode(text_tensor.numpy())
    print('<---', encoded_text)
    
    # TODO: why the fuck do we need to return a fucking list/tuple here?
    return [encoded_text]

def encode_map_fn(text):
    # py_func doesn't set the shape of the returned tensors.
    # TODO: but why?!?
    encoded_text = tf.py_function(encode, inp=[text], Tout=tf.int64, name='encode')
    
    print(f'<- encoded_text after py_function: {encoded_text}')
    
    # `tf.data.Datasets` work best if all components have a shape set
    #  so set the shapes manually: 
    # TODO: but why?!?
#     encoded_text.set_shape([None])

    return encoded_text

dataset = dataset.map(encode_map_fn)

print(dataset)
print()
for item in dataset.take(5):
    print(item.numpy())

# # 5. pad sequences
# def pad_seq(seq):
#     """
#         We pad the sequece in the post-order where maxlen is max_seq_length.
#         If any vector is larger than max_seq_length, we truncate the post-sequence
#     """
#     return tf.keras.preprocessing.sequence.pad_sequences(
#         [seq], # TODO: [seq.numpy()]?
#         maxlen=max_seq_length,
#         truncating='post',
#         padding='post',
#         value='',
# #         dtype=np.float
#     ).squeeze() # TODO: why sequeeze?

# # def pad_map_fn(seq):
# #     return tf.py_function(pad_seq, inp=[seq], Tout=(tf.float32))
# def pad_seq_map_fn(seq):
#     seq.numpy()

# dataset = dataset.map(pad_seq_map_fn)
    
# 6. one-hot encode
dataset = dataset.map(lambda seq: tf.one_hot(seq, vocab_size))


print(dataset)
print()
for item in dataset.take(5):
    print(item.numpy())

<TensorSliceDataset shapes: (), types: tf.string>

b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we truncate the post-sequence to shorten it.'
b'We pad the sequece in the post-order where maxlen is max_seq_length.'
b'If any vector is larger than max_seq_length, we tr

In [12]:
label1 = ['transform', 'Search', 'Response']
label2 = ['get', 'Abstract', 'Factory', 'Creator', 'Service']
labels = [label1, label2]

In [24]:
max_seq_length = 4

In [28]:
def pad_seq(seq):
    """
        We pad the sequece in the post-order where maxlen is max_seq_length.
        If any vector is larger than max_seq_length, we truncate the post-sequence
    """
    return tf.keras.preprocessing.sequence.pad_sequences(
        [seq], # TODO: [seq.numpy()]?
        maxlen=max_seq_length,
        truncating='post',
        padding='post',
        value='',
#         dtype=np.float
    ).squeeze() # TODO: why sequeeze?

# def pad_map_fn(seq):
#     return tf.py_function(pad_seq, inp=[seq], Tout=(tf.float32))

In [20]:
labels_tensor = tf.ragged.constant(labels)
labels_tensor

<tf.RaggedTensor [[b'transform', b'Search', b'Response'], [b'get', b'Abstract', b'Factory', b'Creator', b'Service']]>

In [22]:
label_vocabulary = {token for label in labels for token in label}
label_vocabulary

label_to_index = { index: token for token, index in enumerate(label_vocabulary) }
label_to_index

{'Service': 0,
 'get': 1,
 'Response': 2,
 'Creator': 3,
 'transform': 4,
 'Abstract': 5,
 'Factory': 6,
 'Search': 7}

In [32]:
labels_tensor.to_tensor()

<tf.Tensor: shape=(2, 5), dtype=string, numpy=
array([[b'transform', b'Search', b'Response', b'', b''],
       [b'get', b'Abstract', b'Factory', b'Creator', b'Service']],
      dtype=object)>

In [33]:
# 2. From RaggedTensors to normal Tensors
pad_seq(labels_tensor.to_tensor().numpy())

ValueError: `dtype` int32 is not compatible with `value`'s type: <class 'str'>
You should set `dtype=object` for variable length strings.

In [37]:

# first you have to convert the RaggedTensor to a normal tensor
# ValueError: TypeError: object of type 'RaggedTensor' has no len()
# tf.map_fn(lambda token: label_to_index[token], labels_tensor.to_tensor())
tf.py_function(lambda token: label_to_index[token], [labels_tensor.to_tensor()], tf.int64)

InvalidArgumentError: TypeError: Tensor is unhashable if Tensor equality is enabled. Instead, use tensor.experimental_ref() as the key.
Traceback (most recent call last):

  File "/home/tony/source/identifier-suggestion/.venv/lib/python3.7/site-packages/tensorflow_core/python/ops/script_ops.py", line 234, in __call__
    return func(device, token, args)

  File "/home/tony/source/identifier-suggestion/.venv/lib/python3.7/site-packages/tensorflow_core/python/ops/script_ops.py", line 123, in __call__
    ret = self._func(*args)

  File "<ipython-input-37-7853e3c76557>", line 4, in <lambda>
    tf.py_function(lambda token: label_to_index[token], [labels_tensor.to_tensor()], tf.int64)

  File "/home/tony/source/identifier-suggestion/.venv/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py", line 705, in __hash__
    raise TypeError("Tensor is unhashable if Tensor equality is enabled. "

TypeError: Tensor is unhashable if Tensor equality is enabled. Instead, use tensor.experimental_ref() as the key.

 [Op:EagerPyFunc]

In [18]:
tf.one_hot(labels_tensor, len(label_vocabulary))

NotFoundError: Could not find valid device for node.
Node:{{node OneHot}}
All kernels registered for op OneHot :
  device='XLA_CPU'; TI in [DT_INT32, DT_UINT8, DT_INT64]; T in [DT_FLOAT, DT_DOUBLE, DT_INT32, DT_UINT8, DT_INT16, ..., DT_UINT16, DT_COMPLEX128, DT_HALF, DT_UINT32, DT_UINT64]
  device='XLA_CPU_JIT'; TI in [DT_INT32, DT_UINT8, DT_INT64]; T in [DT_FLOAT, DT_DOUBLE, DT_INT32, DT_UINT8, DT_INT16, ..., DT_UINT16, DT_COMPLEX128, DT_HALF, DT_UINT32, DT_UINT64]
  device='XLA_GPU_JIT'; TI in [DT_INT32, DT_UINT8, DT_INT64]; T in [DT_FLOAT, DT_DOUBLE, DT_INT32, DT_UINT8, DT_INT16, ..., DT_UINT16, DT_COMPLEX128, DT_HALF, DT_UINT32, DT_UINT64]
  device='CPU'; TI in [DT_INT64]; T in [DT_VARIANT]
  device='CPU'; TI in [DT_INT32]; T in [DT_VARIANT]
  device='CPU'; TI in [DT_UINT8]; T in [DT_VARIANT]
  device='CPU'; TI in [DT_INT64]; T in [DT_RESOURCE]
  device='CPU'; TI in [DT_INT32]; T in [DT_RESOURCE]
  device='CPU'; TI in [DT_UINT8]; T in [DT_RESOURCE]
  device='CPU'; TI in [DT_INT64]; T in [DT_STRING]
  device='CPU'; TI in [DT_INT32]; T in [DT_STRING]
  device='CPU'; TI in [DT_UINT8]; T in [DT_STRING]
  device='CPU'; TI in [DT_INT64]; T in [DT_BOOL]
  device='CPU'; TI in [DT_INT32]; T in [DT_BOOL]
  device='CPU'; TI in [DT_UINT8]; T in [DT_BOOL]
  device='CPU'; TI in [DT_INT64]; T in [DT_COMPLEX128]
  device='CPU'; TI in [DT_INT32]; T in [DT_COMPLEX128]
  device='CPU'; TI in [DT_UINT8]; T in [DT_COMPLEX128]
  device='CPU'; TI in [DT_INT64]; T in [DT_COMPLEX64]
  device='CPU'; TI in [DT_INT32]; T in [DT_COMPLEX64]
  device='CPU'; TI in [DT_UINT8]; T in [DT_COMPLEX64]
  device='CPU'; TI in [DT_INT64]; T in [DT_DOUBLE]
  device='CPU'; TI in [DT_INT32]; T in [DT_DOUBLE]
  device='CPU'; TI in [DT_UINT8]; T in [DT_DOUBLE]
  device='CPU'; TI in [DT_INT64]; T in [DT_FLOAT]
  device='CPU'; TI in [DT_INT32]; T in [DT_FLOAT]
  device='CPU'; TI in [DT_UINT8]; T in [DT_FLOAT]
  device='CPU'; TI in [DT_INT64]; T in [DT_BFLOAT16]
  device='CPU'; TI in [DT_INT32]; T in [DT_BFLOAT16]
  device='CPU'; TI in [DT_UINT8]; T in [DT_BFLOAT16]
  device='CPU'; TI in [DT_INT64]; T in [DT_HALF]
  device='CPU'; TI in [DT_INT32]; T in [DT_HALF]
  device='CPU'; TI in [DT_UINT8]; T in [DT_HALF]
  device='CPU'; TI in [DT_INT64]; T in [DT_INT8]
  device='CPU'; TI in [DT_INT32]; T in [DT_INT8]
  device='CPU'; TI in [DT_UINT8]; T in [DT_INT8]
  device='CPU'; TI in [DT_INT64]; T in [DT_UINT8]
  device='CPU'; TI in [DT_INT32]; T in [DT_UINT8]
  device='CPU'; TI in [DT_UINT8]; T in [DT_UINT8]
  device='CPU'; TI in [DT_INT64]; T in [DT_INT16]
  device='CPU'; TI in [DT_INT32]; T in [DT_INT16]
  device='CPU'; TI in [DT_UINT8]; T in [DT_INT16]
  device='CPU'; TI in [DT_INT64]; T in [DT_UINT16]
  device='CPU'; TI in [DT_INT32]; T in [DT_UINT16]
  device='CPU'; TI in [DT_UINT8]; T in [DT_UINT16]
  device='CPU'; TI in [DT_INT64]; T in [DT_INT32]
  device='CPU'; TI in [DT_INT32]; T in [DT_INT32]
  device='CPU'; TI in [DT_UINT8]; T in [DT_INT32]
  device='CPU'; TI in [DT_INT64]; T in [DT_INT64]
  device='CPU'; TI in [DT_INT32]; T in [DT_INT64]
  device='CPU'; TI in [DT_UINT8]; T in [DT_INT64]
  device='GPU'; TI in [DT_INT64]; T in [DT_INT64]
  device='GPU'; TI in [DT_INT32]; T in [DT_INT64]
  device='GPU'; TI in [DT_UINT8]; T in [DT_INT64]
  device='GPU'; TI in [DT_INT64]; T in [DT_INT32]
  device='GPU'; TI in [DT_INT32]; T in [DT_INT32]
  device='GPU'; TI in [DT_UINT8]; T in [DT_INT32]
  device='GPU'; TI in [DT_INT64]; T in [DT_BOOL]
  device='GPU'; TI in [DT_INT32]; T in [DT_BOOL]
  device='GPU'; TI in [DT_UINT8]; T in [DT_BOOL]
  device='GPU'; TI in [DT_INT64]; T in [DT_DOUBLE]
  device='GPU'; TI in [DT_INT32]; T in [DT_DOUBLE]
  device='GPU'; TI in [DT_UINT8]; T in [DT_DOUBLE]
  device='GPU'; TI in [DT_INT64]; T in [DT_FLOAT]
  device='GPU'; TI in [DT_INT32]; T in [DT_FLOAT]
  device='GPU'; TI in [DT_UINT8]; T in [DT_FLOAT]
  device='GPU'; TI in [DT_INT64]; T in [DT_HALF]
  device='GPU'; TI in [DT_INT32]; T in [DT_HALF]
  device='GPU'; TI in [DT_UINT8]; T in [DT_HALF]
  device='XLA_GPU'; TI in [DT_INT32, DT_UINT8, DT_INT64]; T in [DT_FLOAT, DT_DOUBLE, DT_INT32, DT_UINT8, DT_INT16, ..., DT_UINT16, DT_COMPLEX128, DT_HALF, DT_UINT32, DT_UINT64]
 [Op:OneHot] name: RaggedOneHot/one_hot/