# Dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("ASSERT-KTH/megadiff-sf-synthetic_test_error", split="train")
dataset = dataset.train_test_split(test_size=0.02)

dataset

DatasetDict({
    train: Dataset({
        features: ['diff', 'is_single_chunk', 'is_single_function', 'buggy_function', 'fixed_function', 'short_diff', 'completion', 'generated_test_case', 'generated_error_message', 'prompt', 'answer'],
        num_rows: 63100
    })
    test: Dataset({
        features: ['diff', 'is_single_chunk', 'is_single_function', 'buggy_function', 'fixed_function', 'short_diff', 'completion', 'generated_test_case', 'generated_error_message', 'prompt', 'answer'],
        num_rows: 1288
    })
})

# Tokeniser

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/CodeLlama-7b-Instruct-hf")
tokenizer.pad_token = tokenizer.eos_token

In [3]:
def format_texts(examples, begin_inst="[INST]", end_inst="[\INST]"):
    output_texts = []
    for i in range(len(examples['prompt'])):
        text = f"<s>{begin_inst} {examples['prompt'][i]} {end_inst} {examples['answer'][i]}"
        output_texts.append(text)
    return output_texts

In [4]:
def tokenize_function(batch):
    return tokenizer(format_texts(batch), padding=False, truncation=False)

max_seq_length = 1024

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset = tokenized_dataset.filter(lambda x: len(x["input_ids"]) < max_seq_length, batched=False)

print(f"Training dataset size: {len(tokenized_dataset['train'])}")
print(f"Validation dataset size: {len(tokenized_dataset['test'])}")

Map:   0%|          | 0/63100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1288 [00:00<?, ? examples/s]

Filter:   0%|          | 0/63100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1288 [00:00<?, ? examples/s]

Training dataset size: 17102
Training dataset size: 344


In [5]:
from trl import DataCollatorForCompletionOnlyLM

response_template_with_context = "[\INST]"
response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)

collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

# Trainer

In [7]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer
from accelerate import PartialState

import torch

device_string = PartialState().process_index
model = AutoModelForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-Instruct-hf",
                                             torch_dtype=torch.bfloat16,
                                             device_map={'':device_string})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)

sft_config = SFTConfig(
    output_dir='tmp_trainer',
    learning_rate=5e-4,
    num_train_epochs=1,
    max_seq_length=max_seq_length,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    packing=False,
)

trainer = SFTTrainer(
    model,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    args=sft_config,
    peft_config=peft_config,
    data_collator=collator,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
trainer.train()
trainer.save_state()
trainer.save_model(output_dir="codellama-instruct-repair")
tokenizer.save_pretrained(save_directory="codellama-instruct-repair")

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step,Training Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 252.00 MiB. GPU 0 has a total capacity of 39.39 GiB of which 66.31 MiB is free. Including non-PyTorch memory, this process has 39.32 GiB memory in use. Of the allocated memory 37.50 GiB is allocated by PyTorch, and 361.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)