In [1]:
%pip install transformers datasets peft accelerate bitsandbytes

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import json
from pathlib import Path
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType

In [3]:
# Configuration
MODEL_ID = "codellama/CodeLlama-7b-Instruct-hf"
TRAIN_FILE = "./datasets/train_data.json"
TEST_FILE = "./datasets/test_data.json"
OUTPUT_DIR = "./results"

In [4]:
def load_datasets(train_path, test_path):
    data_files = {"train": train_path, "test": test_path}
    dataset = load_dataset("json", data_files=data_files)
    return dataset

In [5]:
def tokenize_dataset(dataset, tokenizer):
    def preprocess(example):
        return tokenizer(
            example["input"],
            text_target=example["output"],
            truncation=True,
            padding="max_length",
            max_length=512
        )
    return dataset.map(preprocess, batched=True)

In [6]:
def create_model():
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        device_map="auto"
    )

    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
    model = get_peft_model(model, lora_config)
    return tokenizer, model

In [7]:
def main():
    dataset = load_datasets(TRAIN_FILE, TEST_FILE)
    tokenizer, model = create_model()
    tokenized_dataset = tokenize_dataset(dataset, tokenizer)
    

    
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        num_train_epochs=3,
        logging_dir=f"{OUTPUT_DIR}/logs",
        save_strategy="epoch",
        evaluation_strategy="epoch",
        fp16=True,
        report_to="none"
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train()

    # Save final model
    model.save_pretrained(f"{OUTPUT_DIR}/final_model")
    tokenizer.save_pretrained(f"{OUTPUT_DIR}/final_model")

In [8]:
if __name__ == '__main__':
    main()


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/29863 [00:00<?, ? examples/s]

Map:   0%|          | 0/3615 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Epoch,Training Loss,Validation Loss


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
