# Train T5 like transformer with our custom dataset

First, we need to add our custom special tokens to the tokenizer.

**TODO:** Should we build our own vocab?

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
tokenizer.vocab_size
tokenizer

PreTrainedTokenizer(name_or_path='uclanlp/plbart-base', vocab_size=50005, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['java', 'python', 'en_XX']})

For training the model we need to import and pre-process our dataset.

In [2]:
from unidiff import PatchSet

max_input_length = 128
max_target_length = 128

def source_str(example):
    diff = PatchSet(example)
    source = ""
    for line in diff[0][0].target_lines():
        source += line.value.strip()
    return source
        
def target_str(example):
    diff = PatchSet(example)
    target = ""
    for line in diff[0][0].source_lines():
        target += line.value.strip()
    return target

def preprocess_function(examples):
    inputs = [source_str(ex) for ex in examples["diff"]]
    targets = [target_str(ex) for ex in examples["diff"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [3]:
from datasets import load_dataset, load_metric

code_dataset = load_dataset("json", data_files="perturbations/defects4j_generated_hunk.json", field="bugs")
code_dataset

Using custom data configuration default-b7d157fde327be57
Reusing dataset json (/home/andre/.cache/huggingface/datasets/json/default-b7d157fde327be57/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 29965
    })
})

In [4]:
code_split_datasets = code_dataset["train"].train_test_split(train_size=0.9, seed=15)
code_split_datasets["validation"] = code_split_datasets.pop("test")
code_split_datasets

Loading cached split indices for dataset at /home/andre/.cache/huggingface/datasets/json/default-b7d157fde327be57/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-fdfe9ffff708e871.arrow and /home/andre/.cache/huggingface/datasets/json/default-b7d157fde327be57/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-791aee38e29c6b11.arrow


DatasetDict({
    train: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 26968
    })
    validation: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 2997
    })
})

In [5]:
code_tokenized_datasets = code_split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=code_split_datasets["train"].column_names,
)
code_tokenized_datasets

  0%|          | 0/27 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 26968
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2997
    })
})

Then, we get the model

In [6]:
from transformers import AutoModelForSeq2SeqLM, T5Config, DataCollatorForSeq2Seq

model = AutoModelForSeq2SeqLM.from_config(
    T5Config(
        vocab_size=tokenizer.vocab_size,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        decoder_start_token_id=tokenizer.pad_token_id,
    )
)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [7]:
import numpy as np

metric = load_metric("sacrebleu")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip().split() for pred in decoded_preds]
    decoded_labels = [[label.strip().split()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [8]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"transformer-buggy-fixed",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    optim="adamw_torch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
)

In [9]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=code_tokenized_datasets["train"],
    eval_dataset=code_tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [10]:
trainer.evaluate(max_length=max_target_length)

***** Running Evaluation *****
  Num examples = 2997
  Batch size = 16


{'eval_loss': 11.320443153381348,
 'eval_bleu': 1.8850776799123967e-16,
 'eval_runtime': 207.988,
 'eval_samples_per_second': 14.409,
 'eval_steps_per_second': 0.904}

In [11]:
trainer.train()

***** Running training *****
  Num examples = 26968
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10113


Step,Training Loss
500,7.7637
1000,5.8359
1500,5.1183
2000,4.6606
2500,4.3622
3000,4.0989
3500,3.8328
4000,3.589
4500,3.47
5000,3.3763


Saving model checkpoint to transformer-buggy-fixed/checkpoint-3371
Configuration saved in transformer-buggy-fixed/checkpoint-3371/config.json
Model weights saved in transformer-buggy-fixed/checkpoint-3371/pytorch_model.bin
tokenizer config file saved in transformer-buggy-fixed/checkpoint-3371/tokenizer_config.json
Special tokens file saved in transformer-buggy-fixed/checkpoint-3371/special_tokens_map.json
Saving model checkpoint to transformer-buggy-fixed/checkpoint-6742
Configuration saved in transformer-buggy-fixed/checkpoint-6742/config.json
Model weights saved in transformer-buggy-fixed/checkpoint-6742/pytorch_model.bin
tokenizer config file saved in transformer-buggy-fixed/checkpoint-6742/tokenizer_config.json
Special tokens file saved in transformer-buggy-fixed/checkpoint-6742/special_tokens_map.json
Saving model checkpoint to transformer-buggy-fixed/checkpoint-10113
Configuration saved in transformer-buggy-fixed/checkpoint-10113/config.json
Model weights saved in transformer-bug

TrainOutput(global_step=10113, training_loss=3.763419118960628, metrics={'train_runtime': 2168.6655, 'train_samples_per_second': 37.306, 'train_steps_per_second': 4.663, 'total_flos': 1566420633649152.0, 'train_loss': 3.763419118960628, 'epoch': 3.0})

In [12]:
trainer.evaluate(max_length=max_target_length)

***** Running Evaluation *****
  Num examples = 2997
  Batch size = 16


{'eval_loss': 2.3545937538146973,
 'eval_bleu': 44.887032605183265,
 'eval_runtime': 201.3151,
 'eval_samples_per_second': 14.887,
 'eval_steps_per_second': 0.934,
 'epoch': 3.0}

In [13]:
input_ids = tokenizer("if (list.length() == 0) { return false; }", return_tensors="pt").input_ids
input_ids = input_ids.to(model.device)
input_ids

tensor([[  105,     5,   716, 33455,  1737,   292,   258,  5494,    66,   111,
           614, 33476,    65,     2]], device='cuda:0')

In [14]:
output = model.generate(input_ids)

for out in output:
    print(tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True))

} else if (length == false) {throw new IllegalStateException("Iterator must not be null


In [15]:
output = model.generate(input_ids, num_beams=100, num_return_sequences=5)

for out in output:
    print(tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True))

} else if (parent.length == false) {throw new IllegalStateException ( ) ;}/
} else if (coll.length == false) {throw new IllegalStateException ( ) ;}/
} else if (parent.length == false) {throw new NoSuchElementException ( ) ;}/
} else if (length == false) {throw new NoSuchElementException ( ) ;}/** if
} else if (coll.length == false) {throw new NoSuchElementException ( ) ;}/


Now we can save the model and load it as a checkpoint

In [16]:
tokenizer.save_pretrained("./models/test_run")
model.save_pretrained("./models/test_run")

tokenizer config file saved in ./models/test_run/tokenizer_config.json
Special tokens file saved in ./models/test_run/special_tokens_map.json
Configuration saved in ./models/test_run/config.json
Model weights saved in ./models/test_run/pytorch_model.bin


In [17]:
loaded_tokenizer = AutoTokenizer.from_pretrained("./models/test_run")
loaded_tokenizer

Didn't find file ./models/test_run/added_tokens.json. We won't load it.
loading file ./models/test_run/sentencepiece.bpe.model
loading file ./models/test_run/tokenizer.json
loading file None
loading file ./models/test_run/special_tokens_map.json
loading file ./models/test_run/tokenizer_config.json


PreTrainedTokenizer(name_or_path='./models/test_run', vocab_size=50005, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['java', 'python', 'en_XX']})

In [18]:
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("./models/test_run")
loaded_model

loading configuration file ./models/test_run/config.json
Model config T5Config {
  "_name_or_path": "./models/test_run",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 1,
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 1,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "use_cache": true,
  "vocab_size": 50005
}

loading weights file ./models/test_run/pytorch_model.bin
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at ./models/test_run.
If

T5ForConditionalGeneration(
  (shared): Embedding(50005, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50005, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [19]:
source = loaded_tokenizer("return false;", return_tensors="pt")
source

{'input_ids': tensor([[  111,   614, 33476,     2]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [20]:
output = loaded_model.generate(
    input_ids=source.input_ids,
    attention_mask=source.attention_mask,
    num_beams=50,
    max_length=128,
    early_stopping=True,
)


for out in output:
    print(loaded_tokenizer.decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True))

return false;}}


In [29]:
tokens = tokenizer.tokenize("if (list.length() == 0) { return false; }")
print(tokens)

['▁if', '▁(', 'list', '.', 'length', '()', '▁==', '▁0)', '▁{', '▁return', '▁false', ';', '▁}']
