In [87]:
from collections import defaultdict
from datetime import datetime
import argparse
import json
import os
import sys
from typing import DefaultDict, List

sys.path.append("..")

from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer
from transformers import set_seed
import numpy as np
import torch

from data_reader import DataPoint, GetDataAsPython
from prepare_data import create_data
from prepare_data import create_dataset
from prepare_data import extract_warning_types
from prepare_data import filter_rule
from utils import boolean_string
from utils import compute_dict_average
from utils import get_current_time

# transformers.logging.set_verbosity_info()
# Fix the random seed so repeated evaluation runs are comparable.
set_seed(42)
print("start time: ", get_current_time())


# Create the job's output directory, named after the model and the launch time.
model_name = "t5base"

now = datetime.now()
dt_string = now.strftime("%d-%m-%Y_%H-%M-%S")
# Resulting name looks like "t5base_global_<dd-mm-YYYY_HH-MM-SS>".
model_directory = f"{model_name}_global_{dt_string}"

# makedirs is called without exist_ok, so an already existing directory raises
# instead of silently mixing the results of two runs.
os.makedirs(model_directory)
# Keep a record of the command-line arguments (one per line) next to the results.
with open(os.path.join(model_directory, "commandline_args.txt"), "w") as f:
    f.write("\n".join(sys.argv[1:]))

# Read data
# Both JSON files are loaded with GetDataAsPython and merged into one
# collection: the ESLint data is appended after the repo-specific data.
data = GetDataAsPython("data_and_models/data/data_autofix_tracking_repo_specific_final.json")
data_eslint = GetDataAsPython("data_and_models/data/data_autofix_tracking_eslint_final.json")
data += data_eslint
# Presumably the distinct warning (rule) names present in the merged data;
# the evaluation loop further down iterates over this collection.
all_warning_types = extract_warning_types(data)

print(all_warning_types)
# create_data returns nine values: inputs, labels and info objects for the
# train, validation and test splits. The test_* values are indexed by warning
# type later in this script.
# NOTE(review): how the split is drawn is decided inside create_data and is not
# visible here.
(
    train_inputs,
    train_labels,
    val_inputs,
    val_labels,
    test_inputs,
    test_labels,
    train_info,
    val_info,
    test_info,
) = create_data(data, all_warning_types, include_warning=True, model_name=model_name)

# Load the tokenizer and the model that will be tested.
model_path = "data_and_models/models/t5base"
tokenizer = T5Tokenizer.from_pretrained(model_path)
print("Loaded tokenizer from directory {}".format(model_path))
model = T5ForConditionalGeneration.from_pretrained(model_path)
print("Loaded model from directory {}".format(model_path))
# Use the current CUDA device when one is available; otherwise stay on the CPU
# instead of failing inside torch.cuda.current_device() on a GPU-less machine.
if torch.cuda.is_available():
    device = f"cuda:{torch.cuda.current_device()}"
else:
    device = "cpu"
model.to(device)
# Make the embedding matrix match the tokenizer's vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# Switch the model to evaluation mode; this script only runs inference.
model.eval()

# Create dataset required by pytorch
# The training split is built with pad_truncate=True and an explicit max_length of 128.
train_dataset = create_dataset(
    train_inputs, train_labels, tokenizer, pad_truncate=True, max_length=128
)
# No max_length is passed for the validation split, so create_dataset's own
# default applies.
# NOTE(review): that default is not visible here — confirm it is the intended
# length for validation.
val_dataset = create_dataset(val_inputs, val_labels, tokenizer, pad_truncate=True)

# Trainer arguments.
# Note that the Seq2SeqTrainer class has a predict() method that is used below to generate predictions.
# That is why a trainer instance and its arguments are still created even though this script only tests.
training_args = Seq2SeqTrainingArguments(
    # Outputs and logs both go to the per-run job directory.
    output_dir=model_directory,
    # Zero epochs are configured: the trainer is used for prediction only.
    num_train_epochs=0,
    per_device_eval_batch_size=1,
    logging_dir=model_directory,
    logging_steps=100,
    do_eval=True,
    evaluation_strategy="epoch",
    eval_accumulation_steps=1,  # set this lower, if testing or validation crashes
    predict_with_generate=True,  # never set this to false, it is for testing.
    seed=42,  # default value
)

# The trainer wraps the loaded model and tokenizer together with the training
# and validation datasets created above.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

print("Testing has started")

# Total number of test samples, summed over every warning type.
counter = sum(len(test_inputs[key]) for key in test_inputs)
print("Number of testing samples: ", counter)

# Sanity check: each test input must equal the first element returned by
# GetT5Representation(True) for the info object stored at the same position.
for warning in test_inputs:
    infos = test_info[warning]
    for position, code in enumerate(test_inputs[warning]):
        expected_code = infos[position].GetT5Representation(True)[0]
        assert code == expected_code, "something wrong! stop it!"

# Generate predictions
# Exact-match accuracy per warning type, keyed by the warning name.
scores: DefaultDict[str, float] = defaultdict(float)

# Token budget used both for tokenizing the reference labels and for generation.
target_max_length = 256  # lower this if prediction runs out of memory
# At most this many test samples are evaluated for each warning type.
max_samples_per_rule = 20

for i, warning in enumerate(all_warning_types):
    # Evaluate only the first max_samples_per_rule samples of this warning type.
    test_warning = test_inputs[warning][:max_samples_per_rule]
    test_warning_labels = test_labels[warning][:max_samples_per_rule]
    test_warning_info = test_info[warning][:max_samples_per_rule]

    print(f"rule {i}: {warning}, # {len(test_warning)}")
    if len(test_warning) == 0:
        # Nothing to score: accuracy would be 0/0. Leave the rule out of
        # `scores` rather than recording a meaningless value.
        print(f"rule {i} skipped: no test samples")
        continue

    test_warning_dataset = create_dataset(
        test_warning,
        test_warning_labels,
        tokenizer,
        pad_truncate=True,
        max_length=target_max_length,
    )

    # Reference token ids, padded/truncated to exactly target_max_length columns.
    target_ids = tokenizer(
        test_warning_labels,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=target_max_length,
    ).input_ids
    target_ids = np.array(target_ids)

    output_ids = trainer.predict(
        test_dataset=test_warning_dataset, num_beams=5, max_length=target_max_length
    ).predictions
    # Align predictions with the references: drop the first column (presumably
    # the decoder start token, which the references do not contain) and
    # right-pad with 0 up to target_max_length columns.
    output_ids = output_ids[:, 1:]
    output_ids = np.pad(
        output_ids, ((0, 0), (0, target_max_length - output_ids.shape[1])), mode="constant"
    )

    # A sample counts as correct only if every token id matches the reference.
    correct = np.all(np.equal(target_ids, output_ids), axis=1)
    correct_counter = int(np.sum(correct))
    total_counter = len(output_ids)

    # Attach the decoded prediction and its verdict to the matching info object.
    for k, output_id in enumerate(output_ids):
        pred = tokenizer.decode(output_id, skip_special_tokens=True)
        test_warning_info[k].predictions = [pred]
        test_warning_info[k].correct = bool(correct[k])

    accuracy = correct_counter / total_counter
    scores[warning] = accuracy
    # Keep only the evaluated samples so later reporting sees a prediction on
    # every entry.
    test_info[warning] = test_warning_info
    print(f"rule {i} acc: {accuracy}")


start time:  15:52:31
['no-invalid-this', 'no-throw-literal', 'no-new-wrappers', 'guard-for-in', 'no-new-object', 'comma-style', 'prefer-spread', 'no-caller', 'no-extra-bind', 'no-array-constructor', 'prefer-rest-params', 'generator-star-spacing', 'no-this-before-super', 'no-extend-native', 'no-undef', 'no-useless-escape', 'no-dupe-keys', 'no-console', 'no-constant-condition', 'no-duplicate-case', 'no-empty', 'no-extra-semi', 'no-redeclare', 'no-cond-assign', 'no-extra-boolean-cast', 'no-fallthrough', 'no-unreachable', 'valid-typeof', 'no-unsafe-finally', 'no-unused-vars', 'no-debugger', 'no-unsafe-negation', 'no-case-declarations', 'no-self-assign', 'no-process-exit', 'no-inner-declarations', 'for-direction', 'no-compare-neg-zero', 'no-sparse-arrays', 'no-func-assign', 'no-const-assign', 'no-global-assign', 'use-isnan', 'no-unused-labels', 'require-yield', 'getter-return', 'no-dupe-class-members', 'no-ex-assign', 'constructor-super', 'no-new-symbol', 'no-empty-pattern', 'no-class-assi

loading file spiece.model
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file data_and_models/models/t5base\config.json
Model config T5Config {
  "_name_or_path": "data_and_models/models/t5base",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 3072,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "return_dict": false,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
 

train size: 84846
val size: 9454
test size: 10504
Loaded tokenizer from directory data_and_models/models/t5base


All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at data_and_models/models/t5base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


Loaded model from directory data_and_models/models/t5base


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 20
  Batch size = 1


Testing has started
Number of testing samples:  10504
rule 0: no-invalid-this, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 0 acc: 0.3
rule 1: no-throw-literal, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 1 acc: 0.65
rule 2: no-new-wrappers, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 2 acc: 0.35
rule 3: guard-for-in, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 3 acc: 0.25
rule 4: no-new-object, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 4 acc: 0.7
rule 5: comma-style, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 5 acc: 0.55
rule 6: prefer-spread, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 6 acc: 0.3
rule 7: no-caller, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 7 acc: 0.2
rule 8: no-extra-bind, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 8 acc: 0.7
rule 9: no-array-constructor, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 9 acc: 0.95
rule 10: prefer-rest-params, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 10 acc: 0.45
rule 11: generator-star-spacing, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 11 acc: 0.8
rule 12: no-this-before-super, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 12 acc: 0.5
rule 13: no-extend-native, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 13 acc: 0.2
rule 14: no-undef, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 14 acc: 0.15
rule 15: no-useless-escape, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 15 acc: 0.45
rule 16: no-dupe-keys, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 16 acc: 0.55
rule 17: no-console, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 17 acc: 0.8
rule 18: no-constant-condition, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 18 acc: 0.5
rule 19: no-duplicate-case, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 19 acc: 0.6
rule 20: no-empty, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 20 acc: 0.2
rule 21: no-extra-semi, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 21 acc: 0.85
rule 22: no-redeclare, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 22 acc: 0.5
rule 23: no-cond-assign, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 23 acc: 0.55
rule 24: no-extra-boolean-cast, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 24 acc: 0.55
rule 25: no-fallthrough, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 25 acc: 0.6
rule 26: no-unreachable, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 26 acc: 0.65
rule 27: valid-typeof, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 7
  Batch size = 1


rule 27 acc: 0.5
rule 28: no-unsafe-finally, # 7


  0%|          | 0/7 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 28 acc: 0.42857142857142855
rule 29: no-unused-vars, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 29 acc: 0.55
rule 30: no-debugger, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 30 acc: 1.0
rule 31: no-unsafe-negation, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 31 acc: 0.7
rule 32: no-case-declarations, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 32 acc: 0.65
rule 33: no-self-assign, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 33 acc: 0.15
rule 34: no-process-exit, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 34 acc: 0.4
rule 35: no-inner-declarations, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 5
  Batch size = 1


rule 35 acc: 0.25
rule 36: for-direction, # 5


  0%|          | 0/5 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 2
  Batch size = 1


rule 36 acc: 0.2
rule 37: no-compare-neg-zero, # 2


  0%|          | 0/2 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 37 acc: 0.0
rule 38: no-sparse-arrays, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 15
  Batch size = 1


rule 38 acc: 0.35
rule 39: no-func-assign, # 15


  0%|          | 0/15 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 39 acc: 0.4666666666666667
rule 40: no-const-assign, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 40 acc: 0.25
rule 41: no-global-assign, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 8
  Batch size = 1


rule 41 acc: 0.55
rule 42: use-isnan, # 8


  0%|          | 0/8 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 19
  Batch size = 1


rule 42 acc: 0.5
rule 43: no-unused-labels, # 19


  0%|          | 0/19 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 43 acc: 0.5263157894736842
rule 44: require-yield, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 44 acc: 0.55
rule 45: getter-return, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 12
  Batch size = 1


rule 45 acc: 0.65
rule 46: no-dupe-class-members, # 12


  0%|          | 0/12 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 4
  Batch size = 1


rule 46 acc: 0.08333333333333333
rule 47: no-ex-assign, # 4


  0%|          | 0/4 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 20
  Batch size = 1


rule 47 acc: 0.5
rule 48: constructor-super, # 20


  0%|          | 0/20 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 1
  Batch size = 1


rule 48 acc: 0.6
rule 49: no-new-symbol, # 1


  0%|          | 0/1 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 18
  Batch size = 1


rule 49 acc: 1.0
rule 50: no-empty-pattern, # 18


  0%|          | 0/18 [00:00<?, ?it/s]

***** Running Prediction *****
  Num examples = 12
  Batch size = 1


rule 50 acc: 0.3888888888888889
rule 51: no-class-assign, # 12


  0%|          | 0/12 [00:00<?, ?it/s]

rule 51 acc: 0.4166666666666667


In [88]:
# Drop any aggregate left over from a previous execution of this cell so that
# re-running it does not feed the old "average" entry into the new one.
scores.pop("average", None)
scores["average"] = compute_dict_average(scores)

# create the whole test list
test_list: List[DataPoint] = []
for key in test_info:
    test_list.extend(test_info[key])

# Build the payload before opening the file, so a failure here (e.g. a sample
# without predictions) does not leave a truncated, empty test_data.json behind.
# The key spellings ("predection", "error id") are kept exactly as they were.
test_records = [
    {
        "source_code": test.source_code,
        "target_code": test.target_code,
        "predection": test.predictions[0],
        "error id": test.linter_report.rule_id,
        "correct": str(test.correct),
    }
    for test in test_list
]
with open(os.path.join(model_directory, "test_data.json"), "w") as json_file:
    json.dump(test_records, json_file, indent=3)

# Per-rule accuracies plus the "average" entry, written as pretty-printed JSON.
serialized_scores = json.dumps(scores, indent=4)
with open(os.path.join(model_directory, "first_accs.txt"), "w") as output_file:
    output_file.write(serialized_scores)

print("end time: ", get_current_time())

end time:  16:22:58
