In [72]:
from datetime import datetime
import argparse
import os
import sys

sys.path.append(".")
sys.path.append('./hf_transformers')

from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import T5Config
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer
from transformers import set_seed
import torch

from data_reader import GetDataAsPython
from prepare_data import create_data
from prepare_data import create_dataset
from prepare_data import extract_warning_types
from utils import boolean_string
from utils import get_current_time


In [73]:
import numpy as np

In [74]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [4]:
from transformers import AutoTokenizer, AutoModel

# Single source of truth for the checkpoint, so the encoder and its tokenizer
# are always loaded from the same place.
CODEBERT_CHECKPOINT = "microsoft/codebert-base"

model = AutoModel.from_pretrained(CODEBERT_CHECKPOINT)
model.to(device)
# The encoder is only used to extract embeddings in this notebook; state the
# evaluation mode explicitly instead of relying on the loader's default.
model.eval()

tokenizer = AutoTokenizer.from_pretrained(CODEBERT_CHECKPOINT)

In [75]:
# Switch between a local checkout (True) and the cluster scratch space (False).
local = True

model_name = 't5-small'  # args.model_name

# Root folder under which the data and models are stored.
storage_directory = './storage/' if local else '/scratch/arminz/'

# Locally the bare model name is used; otherwise the model is addressed by a
# path below the storage directory.
pretrained_model = (
    model_name if local else f'{storage_directory}/pretrained/{model_name}'
)

In [76]:
# Load the two JSON dumps that live below the storage directory: the
# repo-specific samples and the ESLint samples.
data = GetDataAsPython(
    f"{storage_directory}/data_and_models/data/data_autofix_tracking_repo_specific_final.json"
)
data_eslint = GetDataAsPython(
    f"{storage_directory}/data_and_models/data/data_autofix_tracking_eslint_final.json"
)

In [77]:
# NOTE: `+=` updates `data` itself, so re-running this cell without first
# re-running the loading cell appends `data_eslint` a second time.
data += data_eslint
all_warning_types = extract_warning_types(data)
# Optional restriction to a single warning type (disabled here):
# if args.error_type != "":
#     all_warning_types = [args.error_type]
print(all_warning_types)

# One (inputs, labels) pair per split, followed by the per-split info objects.
(
    train_inputs, train_labels,
    val_inputs, val_labels,
    test_inputs, test_labels,
    train_info, val_info, test_info,
) = create_data(
    data,
    all_warning_types,
    include_warning=True,
    design='repo-based',
)


['no-invalid-this', 'no-throw-literal', 'no-new-wrappers', 'guard-for-in', 'no-new-object', 'comma-style', 'prefer-spread', 'no-caller', 'no-extra-bind', 'no-array-constructor', 'prefer-rest-params', 'generator-star-spacing', 'no-this-before-super', 'no-extend-native', 'no-undef', 'no-useless-escape', 'no-dupe-keys', 'no-console', 'no-constant-condition', 'no-duplicate-case', 'no-empty', 'no-extra-semi', 'no-redeclare', 'no-cond-assign', 'no-extra-boolean-cast', 'no-fallthrough', 'no-unreachable', 'valid-typeof', 'no-unsafe-finally', 'no-unused-vars', 'no-debugger', 'no-unsafe-negation', 'no-case-declarations', 'no-self-assign', 'no-process-exit', 'no-inner-declarations', 'for-direction', 'no-compare-neg-zero', 'no-sparse-arrays', 'no-func-assign', 'no-const-assign', 'no-global-assign', 'use-isnan', 'no-unused-labels', 'require-yield', 'getter-return', 'no-dupe-class-members', 'no-ex-assign', 'constructor-super', 'no-new-symbol', 'no-empty-pattern', 'no-class-assign']
splitting by : re

In [14]:
def code_to_vec(code, max_length=512):
    """Embed a code snippet as a single vector using the notebook-level encoder.

    The snippet is tokenized with the module-level ``tokenizer``, wrapped in
    the tokenizer's CLS/SEP special tokens and passed through the module-level
    ``model`` on ``device``. The vector at the first position (the CLS token)
    of the model's first output is returned.

    Args:
        code: Source code snippet as a string.
        max_length: Maximum number of tokens fed to the model, including the
            two special tokens. Longer snippets are truncated. The default of
            512 assumes a RoBERTa-style position limit -- confirm when using
            a different checkpoint.

    Returns:
        A NumPy array holding the first-position embedding of the snippet.
    """
    # Leave room for the CLS and SEP tokens added below so the sequence never
    # exceeds ``max_length`` positions.
    code_tokens = tokenizer.tokenize(code)[: max_length - 2]
    tokens = [tokenizer.cls_token] + code_tokens + [tokenizer.sep_token]
    tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Add a leading batch dimension of size one.
    input_ids = torch.tensor(tokens_ids).to(device)[None, :]
    # Only the forward activations are needed; skipping autograd bookkeeping
    # avoids building a graph for every one of the many snippets embedded.
    with torch.no_grad():
        context_embeddings = model(input_ids)[0]
    return context_embeddings[0][0].cpu().numpy()

In [15]:
def vec_distance(code1, code2):
    """Return the norm of the difference between two embedding arrays.

    Uses ``np.linalg.norm`` with its default settings, which is the Euclidean
    distance when both arguments are 1-D vectors.
    """
    difference = code1 - code2
    return np.linalg.norm(difference)

In [16]:
# Take the first two training snippets by indexing train_info directly,
# instead of building the full list of snippets twice just to pick one
# element from each.
sample1 = train_info[0].source_code
sample2 = train_info[1].source_code

In [17]:
# Number of entries in train_info.
len(train_info)

95109

In [18]:
# Pair every training snippet with its position in train_info so the position
# can be carried along with the snippet's embedding.
arr = [
    (entry.source_code, position)
    for position, entry in enumerate(train_info)
]

In [29]:
%%time
vecs = [(code_to_vec(code), ind) for (code, ind) in arr]

CPU times: user 16min 9s, sys: 457 ms, total: 16min 9s
Wall time: 16min 9s


In [20]:
# NOTE(review): presumably a rough runtime estimate in hours, assuming about
# 10 seconds per 1000 snippets -- confirm the intended units.
len(train_info) / 1000 * 10 / 3600

0.26419166666666666

In [31]:
# Number of (embedding, index) pairs produced; compare with len(train_info).
len(vecs)

95109

In [67]:
# Search for the closest and the furthest cross-repository pair of snippets,
# using the first 50 embeddings as queries against every embedding.
#
# Infinite sentinels guarantee that the first admissible pair replaces them.
# With fixed bounds such as 5 / -5, the result names stay undefined whenever
# no distance falls inside the bounds, and the final print raises NameError.
min_distance = float("inf")
max_distance = float("-inf")
closest_code = closest_this = None
furthest_code = furthest_this = None

# Not used by the search below; kept because they are assigned at notebook scope.
vec_sample1 = code_to_vec(sample1)
vec_sample2 = code_to_vec(sample2)

for vec1 in vecs[:50]:
    this_vec = vec1[0]
    this_repo = train_info[vec1[1]].repo
    for vec in vecs:
        # Skip identical embeddings and snippets that come from the same repository.
        if (vec[0] == this_vec).all() or (train_info[vec[1]].repo == this_repo):
            continue
        # Compute the distance once and reuse it for both comparisons.
        distance = vec_distance(this_vec, vec[0])
        if distance < min_distance:
            closest_code = vec[1]
            closest_this = vec1[1]
            min_distance = distance
        if distance > max_distance:
            furthest_code = vec[1]
            furthest_this = vec1[1]
            max_distance = distance
print(closest_code, min_distance, furthest_code, max_distance, closest_this, furthest_this)

93848 0.99134284 56959 10.311465 8 7


In [68]:
# Snippet and repository of the candidate side of the closest pair found by the search.
train_info[closest_code].source_code, train_info[closest_code].repo

('\n\tthis.start = function*() {\n\t\treturn true;\n',
 '/data/all/data/Coonti/Coonti')

In [69]:
# Snippet and repository of the query side of the closest pair found by the search.
train_info[closest_this].source_code, train_info[closest_this].repo

('\t\t\n\t\tthis.prototype.hello = function (){\n\t\t\treturn true;\n',
 '/data/all/data/imba/imba')

In [70]:
# Snippet on the candidate side of the furthest pair found by the search.
train_info[furthest_code].source_code

'{}\n'

In [71]:
# Snippet on the query side of the furthest pair found by the search.
train_info[furthest_this].source_code

"        attachDatePickerEvents = function () {\n            var self = this, $this, $parent, expanded, closed, collapseData;\n            picker.widget.on('click', '.datepicker *', $.proxy(click, this)); \n"

In [43]:
# Reference (paper title): "Curriculum Learning for Domain Adaptation
# in Neural Machine Translation"

In [45]:
# Inspect how the tokenizer splits an indented snippet; every space of the
# indentation shows up as its own token in the output.
tokenizer.tokenize('                    } else {\n                        uiFunctions.completeAction($(this), action);\n                    }\n')

['Ġ',
 'Ġ',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ}',
 'Ġelse',
 'Ġ{',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġu',
 'i',
 'Fun',
 'ctions',
 '.',
 'complete',
 'Action',
 '($',
 '(',
 'this',
 '),',
 'Ġaction',
 ');',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ}',
 'Ċ']