# Train a T5-like transformer on our custom dataset

First, we need to add our custom special tokens to the tokenizer.

**TODO:** Should we build our own vocab?

In [5]:
from transformers import AutoTokenizer

# Start from the PLBart tokenizer and register the two marker tokens that
# delimit the buggy span in the model input.
tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
print(len(tokenizer))  # tokenizer length before adding the markers
tokenizer.add_tokens(["[START_BUGGY]", "[END_BUGGY]"])
print(len(tokenizer))  # tokenizer length after adding the markers
tokenizer

50005
50007


PreTrainedTokenizer(name_or_path='uclanlp/plbart-base', vocab_size=50005, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['java', 'python', 'en_XX']})

To train the model, we need to load and pre-process our dataset.

In [8]:
from unidiff import PatchSet

# Token budgets passed as `max_length` (with truncation enabled) when
# tokenizing the model inputs and the targets in `preprocess_function`.
max_input_length = 732
max_target_length = 732

def source_str(example):
    """Build the model input from the target side of a unified diff.

    Only ``PatchSet(example)[0][0]`` is inspected. Its target lines are
    flattened into one whitespace-normalised string, with ``[START_BUGGY]``
    placed before the first added line and ``[END_BUGGY]`` after the last
    added line. When the hunk has no added lines, no markers are emitted.

    Args:
        example: Unified diff text.

    Returns:
        The single-line, space-separated input string.
    """
    hunk_lines = list(PatchSet(example)[0][0].target_lines())

    # Positions (within the target-side lines) of every added line.
    added_positions = [idx for idx, ln in enumerate(hunk_lines) if ln.is_added]
    first_added = added_positions[0] if added_positions else -1
    last_added = added_positions[-1] if added_positions else -1

    pieces = []
    for idx, ln in enumerate(hunk_lines):
        if idx == first_added:
            pieces.append("[START_BUGGY]")
        # Splitting on whitespace both trims the line and collapses inner runs.
        pieces.extend(ln.value.split())
        if idx == last_added:
            pieces.append("[END_BUGGY]")

    return " ".join(pieces)
        
def target_str(example):
    """Build the expected output from the source side of a unified diff.

    Only ``PatchSet(example)[0][0]`` is inspected. The result is the
    whitespace-normalised text of every source-side line from the first
    removed line through the last removed line (any lines in between are
    included). When the hunk has no removed lines, the result is ``""``.

    Args:
        example: Unified diff text.

    Returns:
        The single-line, space-separated target string.
    """
    hunk_lines = list(PatchSet(example)[0][0].source_lines())
    removed_positions = [idx for idx, ln in enumerate(hunk_lines) if ln.is_removed]
    if not removed_positions:
        return ""

    span = hunk_lines[removed_positions[0]:removed_positions[-1] + 1]
    return " ".join(word for ln in span for word in ln.value.split())

def preprocess_function(examples):
    """Tokenize a batch of diffs into model inputs and labels.

    Args:
        examples: Batch mapping with a ``"diff"`` entry holding unified
            diff strings.

    Returns:
        The tokenizer output for the inputs produced by ``source_str``, with
        a ``"labels"`` entry holding the token ids of the ``target_str``
        outputs. Both sides are truncated to their configured maximum length.
    """
    diffs = examples["diff"]
    inputs = list(map(source_str, diffs))
    targets = list(map(target_str, diffs))

    model_inputs = tokenizer(inputs, truncation=True, max_length=max_input_length)

    # Targets are encoded inside the tokenizer's target-side context.
    with tokenizer.as_target_tokenizer():
        target_encoding = tokenizer(targets, truncation=True, max_length=max_target_length)

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

In [25]:
from datasets import load_dataset, load_metric
import psutil

import os

# Glob of the BugLab perturbation JSON files. The default is the location the
# notebook was originally run against; set BUGLAB_DATA_GLOB to point at the
# data elsewhere without editing this cell.
data_files_glob = os.environ.get(
    "BUGLAB_DATA_GLOB",
    "/home/andre/Repos/mscthesis/perturbations_buglab/generated_defects4j_buglab_single_line/*.json",
)

# Each JSON file stores its records under the top-level "bugs" field.
code_dataset = load_dataset("json", data_files=data_files_glob, field="bugs")
print(f"RAM used: {psutil.Process().memory_info().rss / (1024 * 1024):.2f} MB")
code_dataset

Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Using custom data configuration default-fcd231bcc5f9c1ff
Reusing dataset json (/home/andre/.cache/huggingface/datasets/json/default-fcd231bcc5f9c1ff/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

RAM used: 2972.88 MB


DatasetDict({
    train: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 2635
    })
})

In [26]:
# Split with a fixed seed so the partition is reproducible.
# NOTE(review): train_size=0.1 keeps only 10% of the rows for training and
# leaves 90% for validation — confirm this is intentional (e.g. a quick test run).
code_split_datasets = code_dataset["train"].train_test_split(train_size=0.1, seed=15)
# Rename the held-out split from "test" to "validation".
code_split_datasets["validation"] = code_split_datasets.pop("test")
code_split_datasets

Loading cached split indices for dataset at /home/andre/.cache/huggingface/datasets/json/default-fcd231bcc5f9c1ff/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-1a83803732445f8a.arrow and /home/andre/.cache/huggingface/datasets/json/default-fcd231bcc5f9c1ff/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-109d4b324bd992b7.arrow


DatasetDict({
    train: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['identifier', 'path', 'diff'],
        num_rows: 2372
    })
})

In [27]:
# Apply preprocess_function to every split in batches. The original columns
# are removed so only the entries returned by preprocess_function remain.
code_tokenized_datasets = code_split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=code_split_datasets["train"].column_names,
)
code_tokenized_datasets

  0%|          | 0/1 [00:00<?, ?ba/s]

lineGuess += (lineGuess / 10);//// Debugging//if ( false ) { [START_BUGGY] System.out.println ( "Line guess is: "  !=  lineGuess ) ; [END_BUGGY] }} // end of validateFile(String)} // end of class LoadedFile class/*
System.out.println("Line guess is: " + lineGuess);


  0%|          | 0/3 [00:00<?, ?ba/s]

return BYTE_TYPE;} else if (c == Character.TYPE) {return CHAR_TYPE;} else if (c == Short.TYPE) {return SHORT_TYPE; [START_BUGGY] } else if  ( c >= Double.TYPE )  { [END_BUGGY] return DOUBLE_TYPE;} else if (c == Float.TYPE) {return FLOAT_TYPE;} else /* if (c == Long.TYPE) */{return LONG_TYPE;
} else if (c == Double.TYPE) {
retValues[nextStrNum][column++]  = millis;retValues[nextStrNum][column++]  = "0"+ LPad(octalVal,22);retValues[nextStrNum][column++]  = "0x" + LPad(hexVal,16);//} // the for [START_BUGGY] if  (  retValues  )  dumpObjs (  debugf, System.err  ) ; [END_BUGGY] return retValues;} // end of genTBValues/** genColNames is required by the base class.
if ( debugf ) dumpObjs( retValues, System.err );
else if ( viewStr.equalsIgnoreCase("getter") ) {jmiGetter.setEnabled( false );return new GetterTableView( currFile );}else { // error by user [START_BUGGY] System.err.println ( "View name: "  &&  viewStr + " invalid." ) ; [END_BUGGY] jmiGetter.setEnabled( false );return new GetterTab

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 263
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2372
    })
})

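Then, we build the model: a T5 architecture instantiated from a fresh configuration (not from pretrained weights), with its vocabulary sized to match our extended tokenizer.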

In [10]:
from transformers import AutoModelForSeq2SeqLM, T5Config, DataCollatorForSeq2Seq

# Build a T5 model from a configuration object (rather than from_pretrained),
# wiring the vocabulary size and special-token ids to the extended tokenizer.
model = AutoModelForSeq2SeqLM.from_config(
    T5Config(
        vocab_size=len(tokenizer),  # includes the two added marker tokens
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        decoder_start_token_id=tokenizer.pad_token_id,  # decoding starts from the pad token
    )
)
# Collator used by the trainer to batch the tokenized examples.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [11]:
import numpy as np

# SacreBLEU metric object consumed by compute_metrics.
metric = load_metric("sacrebleu")

def compute_metrics(eval_preds):
    """Score generated sequences against the labels with SacreBLEU.

    Args:
        eval_preds: Pair ``(preds, labels)`` of token-id arrays. ``preds``
            may also be a tuple whose first element holds the predictions.

    Returns:
        ``{"bleu": score}`` where ``score`` is the ``"score"`` entry returned
        by the SacreBLEU metric.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100s in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # SacreBLEU does its own tokenization: each prediction must be a plain
    # string and each reference entry a list of strings. Pre-splitting into
    # token lists is the input format of the `bleu` metric, not `sacrebleu`.
    predictions = [pred.strip() for pred in decoded_preds]
    references = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=predictions, references=references)
    return {"bleu": result["score"]}

In [12]:
from transformers import Seq2SeqTrainingArguments

# Training configuration. The first argument is the directory that receives
# the checkpoints (plain string: the original f-string had no placeholders).
args = Seq2SeqTrainingArguments(
    "transformer-buggy-fixed",
    evaluation_strategy="no",  # no evaluation during training
    save_strategy="epoch",  # write a checkpoint at the end of every epoch
    learning_rate=2e-5,
    optim="adamw_torch",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,  # keep at most three checkpoints on disk
    num_train_epochs=3,
    predict_with_generate=True,  # evaluate by generating sequences
    fp16=True,  # mixed-precision training
    push_to_hub=False,
)

In [13]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, tokenized splits, collator, tokenizer
# and metric function into a single trainer object.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=code_tokenized_datasets["train"],
    eval_dataset=code_tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [None]:
# Evaluate the trainer's eval dataset, passing max_target_length as the
# maximum length for this evaluation.
trainer.evaluate(max_length=max_target_length)

In [14]:
# Run the training loop configured by `args` on the tokenized train split.
trainer.train()

***** Running training *****
  Num examples = 263
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 396


Step,Training Loss


Saving model checkpoint to transformer-buggy-fixed/checkpoint-132
Configuration saved in transformer-buggy-fixed/checkpoint-132/config.json
Model weights saved in transformer-buggy-fixed/checkpoint-132/pytorch_model.bin
tokenizer config file saved in transformer-buggy-fixed/checkpoint-132/tokenizer_config.json
Special tokens file saved in transformer-buggy-fixed/checkpoint-132/special_tokens_map.json
added tokens file saved in transformer-buggy-fixed/checkpoint-132/added_tokens.json
Saving model checkpoint to transformer-buggy-fixed/checkpoint-264
Configuration saved in transformer-buggy-fixed/checkpoint-264/config.json
Model weights saved in transformer-buggy-fixed/checkpoint-264/pytorch_model.bin
tokenizer config file saved in transformer-buggy-fixed/checkpoint-264/tokenizer_config.json
Special tokens file saved in transformer-buggy-fixed/checkpoint-264/special_tokens_map.json
added tokens file saved in transformer-buggy-fixed/checkpoint-264/added_tokens.json
Saving model checkpoint 

TrainOutput(global_step=396, training_loss=7.936068448153409, metrics={'train_runtime': 58.8376, 'train_samples_per_second': 13.41, 'train_steps_per_second': 6.73, 'total_flos': 20578826649600.0, 'train_loss': 7.936068448153409, 'epoch': 3.0})

In [None]:
# Evaluate the trainer's eval dataset again, with the same maximum length
# (max_target_length) as the earlier evaluation call.
trainer.evaluate(max_length=max_target_length)

In [15]:
# Encode a hand-written snippet whose condition is wrapped in the marker
# tokens, and place the ids on the same device as the model.
input_ids = tokenizer(
    "if [START_BUGGY] (list.length() == 0) [END_BUGGY] { return false; }",
    return_tensors="pt",
)["input_ids"].to(model.device)
input_ids

tensor([[  105, 50005,     5,   716, 33455,  1737,   292,   258,  5494, 50006,
            66,   111,   614, 33476,    65,     2]], device='cuda:0')

In [16]:
# Generate with the default generation settings and print each decoded sequence.
output = model.generate(input_ids)

for text in tokenizer.batch_decode(
    output, skip_special_tokens=True, clean_up_tokenization_spaces=False
):
    print(text)

if ( ( (........


In [17]:
# Beam search with 100 beams, keeping the 5 best candidates.
output = model.generate(input_ids, num_beams=100, num_return_sequences=5)

for text in tokenizer.batch_decode(
    output, skip_special_tokens=True, clean_up_tokenization_spaces=False
):
    print(text)

if if if ( ( == == == == == == == == == == == == == ==
if if if ( ( ( ( == == == == == == == == == == == ==
if if if ( ( ( (. == == == == == == == == == == ==
if if if if ( ( ( ( ( ( { { { { { { { { {
if ( ( == == == == == == == == == == { { { { { {


Now we can save the model and tokenizer, then reload them from the saved checkpoint.

In [18]:
# Persist the tokenizer (including the added marker tokens) and the model
# weights/config side by side in the same directory.
tokenizer.save_pretrained("./models/test_run")
model.save_pretrained("./models/test_run")

tokenizer config file saved in ./models/test_run/tokenizer_config.json
Special tokens file saved in ./models/test_run/special_tokens_map.json
added tokens file saved in ./models/test_run/added_tokens.json
Configuration saved in ./models/test_run/config.json
Model weights saved in ./models/test_run/pytorch_model.bin


In [19]:
# Reload the tokenizer from the directory written above.
loaded_tokenizer = AutoTokenizer.from_pretrained("./models/test_run")
loaded_tokenizer

Didn't find file ./models/test_run/tokenizer.json. We won't load it.
loading file ./models/test_run/sentencepiece.bpe.model
loading file None
loading file ./models/test_run/added_tokens.json
loading file ./models/test_run/special_tokens_map.json
loading file ./models/test_run/tokenizer_config.json
Adding [START_BUGGY] to the vocabulary
Adding [END_BUGGY] to the vocabulary


PreTrainedTokenizer(name_or_path='./models/test_run', vocab_size=50005, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['java', 'python', 'en_XX']})

In [20]:
# Reload the model from the directory written above.
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("./models/test_run")
loaded_model

loading configuration file ./models/test_run/config.json
Model config T5Config {
  "_name_or_path": "./models/test_run",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "bos_token_id": 0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 1,
  "dropout_rate": 0.1,
  "eos_token_id": 2,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "pad_token_id": 1,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "use_cache": true,
  "vocab_size": 50007
}

loading weights file ./models/test_run/pytorch_model.bin
All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at ./models/test_run.
If

T5ForConditionalGeneration(
  (shared): Embedding(50007, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50007, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [21]:
# Encode a short snippet with the reloaded tokenizer; the result carries both
# input_ids and attention_mask, which the next cell passes to generate().
source = loaded_tokenizer("return false;", return_tensors="pt")
source

{'input_ids': tensor([[  111,   614, 33476,     2]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [22]:
# Beam-search generation with the reloaded model, capped at 128 tokens.
output = loaded_model.generate(
    input_ids=source["input_ids"],
    attention_mask=source["attention_mask"],
    num_beams=50,
    max_length=128,
    early_stopping=True,
)

for text in loaded_tokenizer.batch_decode(
    output, skip_special_tokens=True, clean_up_tokenization_spaces=True
):
    print(text)

if ( ( ( { { { {


In [23]:
# Inspect how the snippet is split into tokens, in particular whether the two
# marker strings each come out as a single token.
tokens = tokenizer.tokenize("if [START_BUGGY] (list.length() == 0) [END_BUGGY] { return false; }")
print(tokens)

['▁if', '[START_BUGGY]', '▁(', 'list', '.', 'length', '()', '▁==', '▁0)', '[END_BUGGY]', '▁{', '▁return', '▁false', ';', '▁}']


## Load a trained model

In [2]:
from transformers import AutoTokenizer
#AutoTokenizer.from_pretrained("uclanlp/plbart-base")
# Load the tokenizer stored alongside the previously trained model.
trained_tokenizer = AutoTokenizer.from_pretrained("./models/pre_trained_bears_selfapr_single_line_15epochs")
trained_tokenizer

2022-06-08 14:19:37.767577: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-08 14:19:37.767656: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


PreTrainedTokenizer(name_or_path='./models/pre_trained_bears_selfapr_single_line_15epochs', vocab_size=50005, model_max_len=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True), 'additional_special_tokens': ['java', 'python', 'en_XX']})

In [3]:
from transformers import AutoModelForSeq2SeqLM
# Load the previously trained model from the same directory as its tokenizer.
trained_model = AutoModelForSeq2SeqLM.from_pretrained("./models/pre_trained_bears_selfapr_single_line_15epochs")
trained_model

T5ForConditionalGeneration(
  (shared): Embedding(50007, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(50007, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseReluDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Dro

In [4]:
# Encode a hand-written snippet whose first line is wrapped in the marker
# tokens, and place the ids on the same device as the trained model.
input_ids = trained_tokenizer(
    "[START_BUGGY] if (list.length() == 0) { [END_BUGGY] return false; }",
    return_tensors="pt",
)["input_ids"].to(trained_model.device)
input_ids

tensor([[50005,   105,     5,   716, 33455,  1737,   292,   258,  5494,    66,
         50006,   111,   614, 33476,    65,     2]])

In [5]:
# Beam search with 100 beams, keeping the 5 best candidates.
output = trained_model.generate(input_ids, num_beams=100, num_return_sequences=5)

for text in trained_tokenizer.batch_decode(
    output, skip_special_tokens=True, clean_up_tokenization_spaces=False
):
    print(text)

if (list.length() == 0) {
if (list.length() > 0) {
if (list.length() == 0 || !list.isEmpty()) {
if (list.length() == 0 && !list.isEmpty()) {
if (!list.isEmpty()) {


In [6]:
# Example unified diff used in the cells below. The "---" side is a file from
# the Closure-106 checkout and the "+++" side is a temporary file; the hunk
# replaces the two-line `if (bit) sum += 1.0;` with a one-line variant that
# adds 0.14285714285714285 instead.
diff = "--- /home/andre/Repos/mscthesis/perturbations/defects4j/Closure-106/src/com/google/javascript/rhino/ScriptRuntime.java\t2022-05-24 15:35:23.778860080 +0100\n+++ /tmp/tmp75qfyyz3.java\t2022-05-24 16:50:08.650812110 +0100\n@@ -187,12 +187,12 @@\n                             state = FIRST_EXACT_53_BITS;\n                         }\n                         break;\n                       case FIRST_EXACT_53_BITS:\n                            sum *= 2.0;\n-                        if (bit)\n-                            sum += 1.0;\n+if  ( bit ) sum += 0.14285714285714285;\n+ \n                         --exactBitsLimit;\n                         if (exactBitsLimit == 0) {\n                             bit53 = bit;\n                             state = AFTER_BIT_53;\n                         }\n"
print(diff)

--- /home/andre/Repos/mscthesis/perturbations/defects4j/Closure-106/src/com/google/javascript/rhino/ScriptRuntime.java	2022-05-24 15:35:23.778860080 +0100
+++ /tmp/tmp75qfyyz3.java	2022-05-24 16:50:08.650812110 +0100
@@ -187,12 +187,12 @@
                             state = FIRST_EXACT_53_BITS;
                         }
                         break;
                       case FIRST_EXACT_53_BITS:
                            sum *= 2.0;
-                        if (bit)
-                            sum += 1.0;
+if  ( bit ) sum += 0.14285714285714285;
+ 
                         --exactBitsLimit;
                         if (exactBitsLimit == 0) {
                             bit53 = bit;
                             state = AFTER_BIT_53;
                         }



In [9]:
# Model input: the "+" side of the diff with the added lines wrapped in markers.
source = source_str(diff)
print(source)
input_ids = trained_tokenizer(source, return_tensors="pt").input_ids
input_ids = input_ids.to(trained_model.device)
print(len(input_ids[0]))  # number of input tokens
print(input_ids)

# Expected output: the removed ("-") lines of the diff.
target = target_str(diff)
print(target)

state = FIRST_EXACT_53_BITS; } break; case FIRST_EXACT_53_BITS: sum *= 2.0; [START_BUGGY] if ( bit ) sum += 0.14285714285714285; [END_BUGGY] --exactBitsLimit; if (exactBitsLimit == 0) { bit53 = bit; state = AFTER_BIT_53; }
70
tensor([[  727,    24, 18338, 33456,  2832,  7090, 12298,  4613, 24112, 33476,
            65,  1117, 33476,   722, 18338, 33456,  2832,  7090, 12298,  4613,
         24112, 33475,  1942,  8468,  4121, 33476, 50005,   105,     5,  1814,
             6,  1942,   939,  3786, 33508, 30913,  2313, 30913,  2313,  3071,
         26333, 50006,  1642, 12691,  6943,  5706, 33476,   105,     5, 12691,
          6943,  5706,   258,  5494,    66,  1814, 10344,    24,  1814, 33476,
           727,    24, 21414, 33456, 15793, 33456, 10344, 33476,    65,     2]])
if (bit) sum += 1.0;


In [10]:
# Beam search with 100 beams, keeping the 5 best candidates.
output = trained_model.generate(input_ids, num_beams=100, num_return_sequences=5)

for text in trained_tokenizer.batch_decode(
    output, skip_special_tokens=True, clean_up_tokenization_spaces=False
):
    print(text)

if(bit) sum += 1.0;
if (bit) sum += 1.0;
if(bit) sum += 1.0f;
if (bit) sum += 0.5;
if(bit) sum += 0.5;


In [11]:
def source_str_buggy(example):
    """Build a model input from the source side of a unified diff.

    Only ``PatchSet(example)[0][0]`` is inspected. Its source lines are
    flattened into one whitespace-normalised string, with ``[START_BUGGY]``
    placed before the first removed line and ``[END_BUGGY]`` after the last
    removed line. When the hunk has no removed lines, no markers are emitted.

    Args:
        example: Unified diff text.

    Returns:
        The single-line, space-separated input string.
    """
    entries = list(PatchSet(example)[0][0].source_lines())

    removed_at = [pos for pos, entry in enumerate(entries) if entry.is_removed]
    open_at, close_at = (removed_at[0], removed_at[-1]) if removed_at else (-1, -1)

    words = []
    for pos, entry in enumerate(entries):
        if pos == open_at:
            words.append("[START_BUGGY]")
        # Whitespace split trims the line and collapses inner runs of spaces.
        words += entry.value.split()
        if pos == close_at:
            words.append("[END_BUGGY]")

    return " ".join(words)


def target_str_buggy(example):
    """Build an expected output from the target side of a unified diff.

    Only ``PatchSet(example)[0][0]`` is inspected. The result is the
    whitespace-normalised text of every target-side line from the first
    added line through the last added line (any lines in between are
    included). When the hunk has no added lines, the result is ``""``.

    Args:
        example: Unified diff text.

    Returns:
        The single-line, space-separated target string.
    """
    entries = list(PatchSet(example)[0][0].target_lines())
    added_at = [pos for pos, entry in enumerate(entries) if entry.is_added]
    if not added_at:
        return ""

    selected = entries[added_at[0]:added_at[-1] + 1]
    return " ".join(word for entry in selected for word in entry.value.split())

In [12]:
# Reversed direction: the "-" side of the diff is used as the model input,
# with the removed lines wrapped in markers.
source = source_str_buggy(diff)
print(source)
input_ids = trained_tokenizer(source, return_tensors="pt").input_ids
input_ids = input_ids.to(trained_model.device)
print(len(input_ids[0]))  # number of input tokens
print(input_ids)

# Expected output in this direction: the added ("+") lines of the diff.
target = target_str_buggy(diff)
print(target)

state = FIRST_EXACT_53_BITS; } break; case FIRST_EXACT_53_BITS: sum *= 2.0; [START_BUGGY] if (bit) sum += 1.0; [END_BUGGY] --exactBitsLimit; if (exactBitsLimit == 0) { bit53 = bit; state = AFTER_BIT_53; }
64
tensor([[  727,    24, 18338, 33456,  2832,  7090, 12298,  4613, 24112, 33476,
            65,  1117, 33476,   722, 18338, 33456,  2832,  7090, 12298,  4613,
         24112, 33475,  1942,  8468,  4121, 33476, 50005,   105,     5,  3195,
         33459,  1942,   939,  2195, 33476, 50006,  1642, 12691,  6943,  5706,
         33476,   105,     5, 12691,  6943,  5706,   258,  5494,    66,  1814,
         10344,    24,  1814, 33476,   727,    24, 21414, 33456, 15793, 33456,
         10344, 33476,    65,     2]])
if ( bit ) sum += 0.14285714285714285;


In [13]:
# Beam search with 100 beams, keeping the 5 best candidates.
output = trained_model.generate(input_ids, num_beams=100, num_return_sequences=5)

for text in trained_tokenizer.batch_decode(
    output, skip_special_tokens=True, clean_up_tokenization_spaces=False
):
    print(text)

if (exactBits == null) continue;
if (bit) sum += 1.0;
if (exactBits == null) return null;
if(bit) != null) {
if(exactBits == 0 && !bit.isEmpty()) return null;
