# Train T5 like transformer with our custom dataset

First, we need to add our custom special tokens to the tokenizer.

**TODO:** Should we build our own vocab?

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
print(len(tokenizer))
tokenizer.add_tokens(["[START_BUGGY]", "[END_BUGGY]"])
print(len(tokenizer))
tokenizer

50005
50007


PreTrainedTokenizer(name_or_path='uclanlp/plbart-base', vocab_size=50005, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['java', 'python', 'en_XX']})

For training the model we need to import and pre-process our dataset.

In [13]:
from unidiff import PatchSet

max_input_length = 732
max_target_length = 732

def source_str(example):
    diff = PatchSet(example)
    source = ""
    for line in diff[0][0].target_lines():
        if line.is_added:
            source += " [START_BUGGY] " + line.value.strip() + " [END_BUGGY]"
        else:
            source += line.value.strip()
    return source
        
def target_str(example):
    diff = PatchSet(example)
    target = ""
    for line in diff[0][0].source_lines():
        target += line.value.strip()
    return target

def preprocess_function(examples):
    inputs = [source_str(ex) for ex in examples["diff"]]
    targets = [target_str(ex) for ex in examples["diff"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [14]:
from datasets import load_dataset, load_metric

code_dataset = load_dataset("json", data_files="perturbations/defects4j_generated_hunk.json", field="bugs")
code_dataset

Using custom data configuration default-cfe4d697bf0d4714
Reusing dataset json (/home/andre/.cache/huggingface/datasets/json/default-cfe4d697bf0d4714/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 96803
    })
})

In [25]:
code_split_datasets = code_dataset["train"].train_test_split(train_size=0.1, seed=15)
code_split_datasets["validation"] = code_split_datasets.pop("test")
code_split_datasets

DatasetDict({
    train: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 9680
    })
    validation: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 87123
    })
})

In [26]:
code_tokenized_datasets = code_split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=code_split_datasets["train"].column_names,
)
code_tokenized_datasets

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/88 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9680
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 87123
    })
})

Then, we get the model

In [27]:
from transformers import AutoModelForSeq2SeqLM, T5Config, DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_config(
    T5Config(
        vocab_size=len(tokenizer),
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        decoder_start_token_id=tokenizer.pad_token_id,
    )
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [28]:
import numpy as np

metric = load_metric("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip().split() for pred in decoded_preds]
    decoded_labels = [[label.strip().split()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [29]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"transformer-buggy-fixed",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    optim="adamw_torch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [30]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=code_tokenized_datasets["train"],
    eval_dataset=code_tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [None]:
trainer.evaluate(max_length=max_target_length)

In [31]:
trainer.train()

***** Running training *****
  Num examples = 9680
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 14520


Step,Training Loss
500,8.9624
1000,7.6979
1500,7.2758
2000,6.895
2500,6.6611
3000,6.4304
3500,6.2002
4000,6.1481
4500,5.9961
5000,5.8441


Saving model checkpoint to transformer-buggy-fixed/checkpoint-4840
Configuration saved in transformer-buggy-fixed/checkpoint-4840/config.json
Model weights saved in transformer-buggy-fixed/checkpoint-4840/pytorch_model.bin
tokenizer config file saved in transformer-buggy-fixed/checkpoint-4840/tokenizer_config.json
Special tokens file saved in transformer-buggy-fixed/checkpoint-4840/special_tokens_map.json
added tokens file saved in transformer-buggy-fixed/checkpoint-4840/added_tokens.json
Deleting older checkpoint [transformer-buggy-fixed/checkpoint-3371] due to args.save_total_limit
Saving model checkpoint to transformer-buggy-fixed/checkpoint-9680
Configuration saved in transformer-buggy-fixed/checkpoint-9680/config.json
Model weights saved in transformer-buggy-fixed/checkpoint-9680/pytorch_model.bin
tokenizer config file saved in transformer-buggy-fixed/checkpoint-9680/tokenizer_config.json
Special tokens file saved in transformer-buggy-fixed/checkpoint-9680/special_tokens_map.json


TrainOutput(global_step=14520, training_loss=5.676233387256128, metrics={'train_runtime': 2073.6859, 'train_samples_per_second': 14.004, 'train_steps_per_second': 7.002, 'total_flos': 736688687284224.0, 'train_loss': 5.676233387256128, 'epoch': 3.0})

In [12]:
trainer.evaluate(max_length=max_target_length)

***** Running Evaluation *****
  Num examples = 2997
  Batch size = 16


{'eval_loss': 2.3545937538146973,
 'eval_bleu': 44.887032605183265,
 'eval_runtime': 201.3151,
 'eval_samples_per_second': 14.887,
 'eval_steps_per_second': 0.934,
 'epoch': 3.0}

In [44]:
input_ids = tokenizer("if [START_BUGGY] (list.length() == 0) [END_BUGGY] { return false; }", return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)
input_ids

tensor([[  105, 50005,     5,   716, 33455,  1737,   292,   258,  5494, 50006,
            66,   111,   614, 33476,    65,     2]], device='cuda:0')

In [45]:
output = model.generate(input_ids)

for out in output:
    print(tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True))

}if (is.length() == 0) {if (is() == 0) {return;


In [46]:
output = model.generate(input_ids, num_beams=100, num_return_sequences=5)

for out in output:
    print(tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True))

if (parent.length() == 0) {return false;}@Overridepublic void
if (parent.length() == 0) {return false;}if (parent.length() ==
}@Overridepublic boolean() {if (type == null) {if (
if (parent.length() == 0) {if (options.length() == 0) {return
if (parent.length() == 0) {return false;}@Overridepublic boolean


Now we can save the model and load it as a checkpoint

In [35]:
tokenizer.save_pretrained("./models/test_run")
model.save_pretrained("./models/test_run")

tokenizer config file saved in ./models/test_run/tokenizer_config.json
Special tokens file saved in ./models/test_run/special_tokens_map.json
added tokens file saved in ./models/test_run/added_tokens.json
Configuration saved in ./models/test_run/config.json
Model weights saved in ./models/test_run/pytorch_model.bin


In [36]:
loaded_tokenizer = AutoTokenizer.from_pretrained("./models/test_run")
loaded_tokenizer

loading file ./models/test_run/sentencepiece.bpe.model
loading file ./models/test_run/tokenizer.json
loading file ./models/test_run/added_tokens.json
loading file ./models/test_run/special_tokens_map.json
loading file ./models/test_run/tokenizer_config.json
Adding [START_BUGGY] to the vocabulary
Adding [END_BUGGY] to the vocabulary


PreTrainedTokenizer(name_or_path='./models/test_run', vocab_size=50005, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['java', 'python', 'en_XX']})

In [37]:
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("./models/test_run")
loaded_model

loading configuration file ./models/test_run/config.json
Model config T5Config {
  "_name_or_path": "./models/test_run",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 1,
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 1,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "use_cache": true,
  "vocab_size": 50007
}

loading weights file ./models/test_run/pytorch_model.bin
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at ./models/test_run.
If

T5ForConditionalGeneration(
  (shared): Embedding(50007, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50007, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [38]:
source = loaded_tokenizer("return false;", return_tensors="pt")
source

{'input_ids': tensor([[  111,   614, 33476,     2]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [39]:
output = loaded_model.generate(
    input_ids=source.input_ids,
    attention_mask=source.attention_mask,
    num_beams=50,
    max_length=128,
    early_stopping=True,
)


for out in output:
    print(loaded_tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True))

return false false false false false false false false false false false false false false false false false false false false;


In [43]:
tokens = tokenizer.tokenize("if [START_BUGGY] (list.length() == 0) [END_BUGGY] { return false; }")
print(tokens)

['▁if', '[START_BUGGY]', '▁(', 'list', '.', 'length', '()', '▁==', '▁0)', '[END_BUGGY]', '▁{', '▁return', '▁false', ';', '▁}']
