In [1]:
from datetime import datetime
import argparse
import os
import sys

# Extend the module search path with the working directory and ./hf_transformers
# so that the imports below can resolve modules located there.
# NOTE(review): presumably ./hf_transformers is a local checkout of the
# transformers library that should take part in resolution — confirm.
sys.path.append(".")
sys.path.append("./hf_transformers")

from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from transformers import T5Config
from transformers import T5ForConditionalGeneration
from transformers import T5Tokenizer
from transformers import set_seed
import torch

from data_reader import GetDataAsPython
from prepare_data import create_data
from prepare_data import create_dataset
from prepare_data import extract_warning_types
from utils import boolean_string
from utils import get_current_time

In [2]:
# Fix the random seed through transformers' set_seed helper for reproducibility.
set_seed(42)

In [3]:
import socket

# Environment switch: the run is considered "local" unless the host name
# contains 'computecanada' (i.e. the notebook runs on the cluster).
# A direct membership test yields the boolean without the inverted ternary.
local = 'computecanada' not in socket.gethostname()

In [4]:
# Identifier of the T5 checkpoint used throughout the notebook.
model_name = "t5-small"

In [5]:
# Pick the storage root for the current environment: a relative folder when
# running locally, the scratch space otherwise.
storage_directory = './storage/' if local else '/scratch/arminz/'

# Locally the model is referenced by its plain name; otherwise it is read from
# the 'pretrained' folder under the storage root.
pretrained_model = model_name if local else f'{storage_directory}/pretrained/{model_name}'


In [6]:
# Run configuration. These values stand in for the command-line arguments
# (args.*) of the script version of this training job.
model_dir = ""           # explicit job directory; empty string -> generate a timestamped one
design = "repo-based"    # data split design handed to create_data
pre_trained = True       # start from pretrained weights instead of a fresh model
epochs = 30              # number of training epochs
batch_size = 16          # per-device batch size for training and evaluation
save_total_limit = 1     # checkpoints to keep; -1 is mapped to one per epoch
eval_acc_steps = 1       # eval_accumulation_steps for the trainer
learning_rate = 1e-4     # learning rate for the optimizer / training arguments
weight_decay = 0         # weight decay passed to the training arguments

In [7]:
# Resolve the job directory used for the tokenizer, checkpoints and logs.
# An explicitly configured `model_dir` wins; otherwise a fresh, timestamped
# directory name is derived from the storage root, model name and design.
if model_dir != "":
    # Use the notebook-level setting: there is no `args` namespace in this
    # notebook, so reading `args.model_dir` here would raise NameError.
    model_directory = model_dir
else:
    now = datetime.now()
    dt_string = now.strftime("%d-%m-%Y_%H-%M-%S")
    model_directory = f'{storage_directory}/training-tbug/{model_name}_{design}_{dt_string}'

In [8]:
# NOTE: the job directory is not created in this cell; the original
# os.makedirs(model_directory) call and the dump of sys.argv to
# commandline_args.txt are disabled in this notebook.

# Read and prepare data: load the repo-specific and the ESLint autofix
# datasets from the storage root and merge them into one collection.
data_root = f"{storage_directory}/data_and_models/data"
data = GetDataAsPython(f"{data_root}/data_autofix_tracking_repo_specific_final.json")
data_eslint = GetDataAsPython(f"{data_root}/data_autofix_tracking_eslint_final.json")
data += data_eslint

# Collect the warning types present in the merged data. Restricting the run to
# a single error type (the script's args.error_type option) is disabled here.
all_warning_types = extract_warning_types(data)
print(all_warning_types)

['no-invalid-this', 'no-throw-literal', 'no-new-wrappers', 'guard-for-in', 'no-new-object', 'comma-style', 'prefer-spread', 'no-caller', 'no-extra-bind', 'no-array-constructor', 'prefer-rest-params', 'generator-star-spacing', 'no-this-before-super', 'no-extend-native', 'no-undef', 'no-useless-escape', 'no-dupe-keys', 'no-console', 'no-constant-condition', 'no-duplicate-case', 'no-empty', 'no-extra-semi', 'no-redeclare', 'no-cond-assign', 'no-extra-boolean-cast', 'no-fallthrough', 'no-unreachable', 'valid-typeof', 'no-unsafe-finally', 'no-unused-vars', 'no-debugger', 'no-unsafe-negation', 'no-case-declarations', 'no-self-assign', 'no-process-exit', 'no-inner-declarations', 'for-direction', 'no-compare-neg-zero', 'no-sparse-arrays', 'no-func-assign', 'no-const-assign', 'no-global-assign', 'use-isnan', 'no-unused-labels', 'require-yield', 'getter-return', 'no-dupe-class-members', 'no-ex-assign', 'constructor-super', 'no-new-symbol', 'no-empty-pattern', 'no-class-assign']


In [9]:
# Build the train / validation / test splits for the selected design, with the
# warning included and back-translation enabled.
splits = create_data(
    data,
    all_warning_types,
    include_warning=True,
    design=design,
    back_translation=True,
)

# create_data yields nine values: inputs and labels per split, followed by the
# per-split info objects.
(train_inputs, train_labels,
 val_inputs, val_labels,
 test_inputs, test_labels,
 train_info, val_info, test_info) = splits

splitting by : repo-based
train size: 70022
val size: 23366
test size: 3374


In [10]:
# Create the tokenizer and the model.
# The tokenizer is extended with a few extra symbols and stored in the job
# directory before the model is built.
# NOTE(review): presumably these symbols are missing from the stock T5
# vocabulary — confirm.
tokenizer = T5Tokenizer.from_pretrained(pretrained_model)
extra_tokens = ["{", "}", ">", "\\", "^"]
tokenizer.add_tokens(extra_tokens)
tokenizer.save_pretrained(model_directory)

if not pre_trained:
    # Only the configuration is reused; the weights are freshly initialised.
    print("Training from scratch")
    config = T5Config.from_pretrained(pretrained_model)
    model = T5ForConditionalGeneration(config)
else:
    model = T5ForConditionalGeneration.from_pretrained(pretrained_model, return_dict=False)

model.parallelize()
# The embedding matrix must match the tokenizer size after adding tokens.
model.resize_token_embeddings(len(tokenizer))
print("Models parameters: ", model.num_parameters())


Models parameters:  60494336


In [11]:
# Display the model's module structure (cell output is the model repr).
model

T5ForConditionalGeneration(
  (shared): Embedding(32104, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32104, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [12]:
# Sanity check: the number of training inputs and labels, side by side.
print(f"{len(train_inputs)} {len(train_labels)}")

# Tokenised datasets for the trainer. Labels are passed first and inputs
# second, exactly as in the original call; only the training set is given an
# explicit max_length of 128.
train_dataset = create_dataset(
    train_labels,
    train_inputs,
    tokenizer,
    pad_truncate=True,
    max_length=128,
)
val_dataset = create_dataset(
    val_labels,
    val_inputs,
    tokenizer,
    pad_truncate=True,
)

70022 70022




In [13]:
# Inspect a single training label (index 10) to eyeball the text format.
train_labels[10]

"bug no-invalid-this       \n      var titleEl = this.el.querySelector('.title');\n      if(!titleEl) {\n </s>"

In [14]:
# Training arguments
# Checkpoints and logs both go to the job directory. Evaluation runs once per
# epoch and the checkpoint with the lowest eval_loss is reloaded at the end.
training_args = Seq2SeqTrainingArguments(
    output_dir=model_directory,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,
    weight_decay=weight_decay,
    logging_dir=model_directory,
    logging_steps=100,
    do_eval=True,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    # A save_total_limit of -1 is mapped to the number of epochs. The
    # notebook-level `epochs` is used because no `args` namespace exists here;
    # `args.epochs` would raise NameError as soon as the limit is set to -1.
    save_total_limit=epochs if save_total_limit == -1 else save_total_limit,
    eval_accumulation_steps=eval_acc_steps,  # set this lower, if testing or validation crashes
    disable_tqdm=False,
    predict_with_generate=True,  # never set this to false.
    seed=42,  # default value
)

In [15]:
# Plain Adam over all model parameters, using the configured learning rate.
adam = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Wire model, arguments, datasets and tokenizer into the trainer. The scheduler
# slot of `optimizers` is left as None.
# NOTE(review): the Trainer is expected to fall back to its default scheduler
# in that case — confirm against the Trainer documentation.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    optimizers=(adam, None),
)

In [16]:
# Run the training loop with the trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Runtime,Samples Per Second
1,0.3575,0.16194,151.5065,154.224
2,0.3224,0.138747,150.7777,154.97
3,0.2727,0.129006,149.4795,156.316
4,0.2518,0.126303,152.1087,153.614
5,0.2498,0.126444,150.132,155.636
6,0.2398,0.124121,151.0164,154.725
7,0.2243,0.12632,151.12,154.619
8,0.2178,0.126921,151.4332,154.299
9,0.2102,0.128649,150.6911,155.059
10,0.198,0.128899,152.7397,152.979


KeyboardInterrupt: 

In [None]:
# Show which device the model reports being on.
model.device