# Dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("ASSERT-KTH/megadiff-sf-synthetic_test_error", split="train")
dataset = dataset.train_test_split(test_size=0.02)

dataset

DatasetDict({
    train: Dataset({
        features: ['diff', 'is_single_chunk', 'is_single_function', 'buggy_function', 'fixed_function', 'short_diff', 'completion', 'generated_test_case', 'generated_error_message', 'prompt', 'answer'],
        num_rows: 63100
    })
    test: Dataset({
        features: ['diff', 'is_single_chunk', 'is_single_function', 'buggy_function', 'fixed_function', 'short_diff', 'completion', 'generated_test_case', 'generated_error_message', 'prompt', 'answer'],
        num_rows: 1288
    })
})

# Tokeniser

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/CodeLlama-7b-Instruct-hf")
tokenizer.add_special_tokens({"pad_token":"<pad>"})
tokenizer.padding_side = 'right'

In [3]:
def format_texts(examples, begin_inst="[INST]", end_inst="[\\INST]"):
    output_texts = []
    for i in range(len(examples['prompt'])):
        text = f"<s>{begin_inst} {examples['prompt'][i]} {end_inst} {examples['answer'][i]}</s>"
        output_texts.append(text)
    return output_texts

In [5]:
def tokenize_function(batch):
    return tokenizer(format_texts(batch), padding=False, truncation=False)

max_seq_length = 1024

### DEBUG
dataset["train"] = dataset["train"].select(i for i in range(1000))
dataset["test"] = dataset["test"].select(i for i in range(100))

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.filter(lambda x: len(x["input_ids"]) < max_seq_length, batched=False)

print(f"Training dataset size: {len(tokenized_dataset['train'])}")
print(f"Validation dataset size: {len(tokenized_dataset['test'])}")



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Training dataset size: 255
Validation dataset size: 26


# Trainer

In [6]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer
from accelerate import PartialState

import torch

# FIXME: to enable more than 1 sample per batch, the extra padding token must be set in the model and the embedding layer resized
# the current workaround simply adds a new pad_token so that the eos_token is not ignored during training, since the models needs to learn when to stop
device_string = PartialState().process_index
model = AutoModelForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-Instruct-hf",
                                             torch_dtype=torch.bfloat16,
                                             device_map={'':device_string})
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
from trl import DataCollatorForCompletionOnlyLM

response_template_with_context = "[\\INST]"
response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)

collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

In [9]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

sft_config = SFTConfig(
    output_dir='tmp_trainer',
    learning_rate=5e-4,
    num_train_epochs=1,
## DEBUG
    max_steps=1,
    max_seq_length=max_seq_length,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    packing=False,
)

trainer = SFTTrainer(
    model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=sft_config,
    peft_config=peft_config,
    data_collator=collator,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [10]:
for batch in trainer.get_train_dataloader():
    print(batch)
    break

{'input_ids': tensor([[    1,     1,   518, 25580, 29962,   887,   526,   385, 18428,  1824,
         26032,  5780, 29889,  3575,  3414,   338,   304,  2329,   278,  4944,
          6494,  1927,   775, 29889,    13,    13,  1576,  1494,   775,  3743,
           263,  6494,  1927,   740, 29901,    13, 28956,  1645,    13,  4706,
           970,  4669,  1246,   580,  8026, 24490, 29892, 19014,  5308,  2451,
           426,    13,  9651,  3497,  7090,  2283,   353,  1870, 29936,    13,
          9651,   565,   313,  8173,  9170,  2804,  1870, 29897,   426,    13,
         18884,   501, 11150,  1820,   353,   679,  2558, 29898,  3177,   416,
            13, 18884,  7090,  2283,   353,   716,  3497, 29898,  8173,  9170,
         29892,  1820, 29889,  7711,   580,   718, 11393,  4130, 29889, 18828,
          1496,    13, 18884,   565,   313,  8173,  2283, 29889,  9933,  3101,
           426,    13,   462,  1678,   736,  1303,  1523, 13120,  2061, 29898,
          8173,  2283, 29892,  2943, 2

In [11]:
trainer.train()
trainer.save_state()
trainer.save_model(output_dir="codellama-instruct-repair")
tokenizer.save_pretrained(save_directory="codellama-instruct-repair")

Step,Training Loss


('codellama-instruct-repair/tokenizer_config.json',
 'codellama-instruct-repair/special_tokens_map.json',
 'codellama-instruct-repair/tokenizer.json')