### code2vec model
## Выглядит так: 
![code2vec](img/code2vec_network.jpg)


мы существуем в папке experimental, поэтому нужно вернуться в корень репозитория, чтобы иметь доступ к vocabulary.py и path_context_reader.py

In [1]:
%cd ../

/home/ruslan/Documents/course-project-TiMP/code2var


Базовая инициализация необходимых для работы переменных. 

In [None]:
import tensorflow as tf
import tensorboard
import os
import datetime
import config
import numpy as np
from vocabulary import Code2VecVocabs
from path_context_reader import PathContextReader, ReaderInputTensors
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
config.config.CREATE_VOCAB = True
config.config.TRAINING_FREQ_DICTS_PATH = "dataset/java-small/java-small.c2v.dict"
c2v_vocabs = Code2VecVocabs()
pcr = PathContextReader(is_train=True, vocabs=c2v_vocabs, csv_path="dataset/java-small/java-small.train_vec.csv")
dataset = pcr.get_dataset()
#init lookups
c2v_vocabs.target_vocab.get_word_to_index_lookup_table()
c2v_vocabs.token_vocab.get_word_to_index_lookup_table()
c2v_vocabs.path_vocab.get_word_to_index_lookup_table()


Num GPUs Available:  1
Creating vocab from dataset/java-small/java-small.c2v.dict
Loading frequency dicts from dataset/java-small/java-small.c2v.dict
Loading token freq dict
Loading path freq dict
Loading target freq dict
Creating token vocab
Creating vocab from frequency dictionary of 1651196 elements
Created token vocab
Creating path vocab
Creating vocab from frequency dictionary of 1582063 elements
Created path vocab
Creating target vocab
Creating vocab from frequency dictionary of 118171 elements
Created target vocab
Created all vocabs


Константы, которые потом будут помещены в config.config

In [None]:
EMBED_DIMENSION = 100
DROPOUT_KEEP_RATE = 0.75
TOKEN_VOCAB_SIZE = c2v_vocabs.token_vocab.lookup_table_word_to_index.size().numpy()
TARGET_VOCAB_SIZE=c2v_vocabs.target_vocab.lookup_table_word_to_index.size().numpy()
PATH_VOCAB_SIZE = c2v_vocabs.path_vocab.lookup_table_word_to_index.size().numpy()

In [None]:
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())


Переформатировать dataset для подачи tuple(x, y) в code2vec. x- tuple из путей-контекстов и имён функций

In [None]:
dataset = dataset.map(lambda x : ((x.path_source_token_indices, x.path_indices, x.path_target_token_indices, x.target_index), x.target_index))

Создаём модели для эмбедов token-ов

In [None]:
input_source_token_embed = tf.keras.Input(shape = (config.config.MAX_CONTEXTS,), name="input_source_token")
input_target_token_embed = tf.keras.Input(shape = (config.config.MAX_CONTEXTS,), name="input_target_token")
token_embed = tf.keras.layers.Embedding(input_dim = TOKEN_VOCAB_SIZE, 
                                    output_dim=EMBED_DIMENSION,
                                    embeddings_initializer='uniform',
                                    name="token_embed") 
token_source_embed_model = tf.keras.Sequential([input_source_token_embed, token_embed])
token_target_embed_model = tf.keras.Sequential([input_target_token_embed, token_embed])
print(token_source_embed_model.summary())
print(token_target_embed_model.summary())

Создаём модель для эмбедов путей

In [None]:
input_paths_embed = tf.keras.Input(shape=(config.config.MAX_CONTEXTS,), name="input_paths")
paths_embed = tf.keras.layers.Embedding(input_dim=PATH_VOCAB_SIZE, output_dim=EMBED_DIMENSION, embeddings_initializer='uniform', name="paths_embed")
path_embed_model = tf.keras.Sequential([input_paths_embed, paths_embed])
print(path_embed_model.summary())

Основная модель code2vec, то, что работает уже с эмбедами

In [None]:
concatenated_embeds = tf.keras.layers.Concatenate(name="concatenated_embeds")([token_source_embed_model.output, path_embed_model.output, token_target_embed_model.output])

droped_embeds = tf.keras.layers.Dropout(1-DROPOUT_KEEP_RATE)(concatenated_embeds)
flatten_embeds = tf.keras.layers.Reshape((-1, 3*EMBED_DIMENSION), name="flatten_embeds")(droped_embeds)
combined_context_vector = tf.keras.layers.Dense(3*EMBED_DIMENSION, activation='tanh', name="combined_context_vector")(flatten_embeds)
сontext_weights = tf.keras.layers.Dense(1, activation='softmax', name="context_weights")(combined_context_vector)
attention_weights = tf.keras.layers.Reshape((-1, config.config.MAX_CONTEXTS, 1), name="attention_weights")(сontext_weights)

batched_embed = tf.keras.layers.Reshape((-1, config.config.MAX_CONTEXTS, 3*EMBED_DIMENSION), name="batched_embed") (combined_context_vector)
code_vectors = tf.keras.layers.Multiply()([batched_embed, attention_weights])
code_vectors = tf.keras.backend.squeeze(code_vectors, axis=1)
code_vectors = tf.keras.backend.sum(code_vectors, axis=1)

Нововведение, определяем вероятность каждого из target softmax-ом

In [None]:
possible_targets = tf.keras.layers.Dense(TARGET_VOCAB_SIZE, activation="softmax", name="possible_targets")(code_vectors)

Эмбед для target

In [None]:
input_target_embed = tf.keras.Input(shape=(1,), dtype=tf.int64, name="target")
target_embed = tf.keras.layers.Embedding(input_dim=TARGET_VOCAB_SIZE, 
                                        output_dim=3*EMBED_DIMENSION,
                                        embeddings_initializer='uniform',
                                        name="target_embed") (input_target_embed)
target_embed = tf.keras.backend.squeeze(target_embed, axis=1)
target_embed_model = tf.keras.Model(inputs=input_target_embed, outputs=target_embed)
print(target_embed_model.summary())

Финальный этап - подсчёт logits

In [None]:
logits = tf.keras.layers.Dot([1, 1],name="logits")([code_vectors, target_embed_model.output])
batch_size = tf.cast(tf.shape(input_target_embed)[0], tf.float32)

inputs = [token_source_embed_model.input, path_embed_model.input, token_target_embed_model.input, target_embed_model.input]
code2vec = tf.keras.Model(inputs=inputs, outputs=possible_targets)
print(code2vec.summary())
tf.keras.utils.plot_model(code2vec, show_shapes=True, dpi=300)

In [None]:
def tf_loss(labels, logits):
    """custom loss function"""
    return tf.keras.backend.sum(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(labels, [-1]), logits=logits)) / config.config.BATCH_SIZE

In [None]:
code2vec.compile(optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'],  loss="sparse_categorical_crossentropy")
code2vec.describe()

Просто чтобы посмотреть как оно там живёт

In [14]:
checkpoint_path = "training_2/cp-{epoch:04d}.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 save_weights_only=True,
                                                 verbose=1)



In [17]:
tf.debugging.set_log_device_placement(
    True
)

In [None]:
code2vec.fit(dataset, callbacks=[cp_callback])

    280/Unknown - 779s 3s/step - loss: 7.7905 - accuracy: 0.1859

In [None]:
code2vec.evaluate(dataset)

In [None]:
eval_c2v = tf.keras.Model(inputs=inputs, outputs=code_vectors)

In [None]:
it = iter(dataset).get_next()
a, a_1 = eval_c2v(it[0]), it[1]
it = iter(dataset).get_next()
b, b_1 = eval_c2v(it[0]), it[1]
it = iter(dataset).get_next()
c, c_1 = eval_c2v(it[0]), it[1]

In [None]:
a_1

In [None]:
a_1
r_1 = a[0]
r_2 = a[1]
r_3 = a[2]
def cos(x, y):
    return np.dot(x, y) / (np.sqrt(np.dot(x, x)) * np.sqrt(np.dot(y, y)))

print(cos(r_1, r_2))
print(cos(r_2, r_3))
print(cos(r_1, r_3))

In [None]:
c2v_vocabs.target_vocab.word_to_index