In [1]:
from datasets import load_dataset
from peft import LoraConfig
import torch
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
)

In [2]:
MAX_SEQ_LENGTH = 2048
OUTPUT_DIR = "./outputs"

In [3]:
training_config = {
    "output_dir": OUTPUT_DIR,
    "bf16": True,
    "learning_rate": 2e-5,
    "logging_steps": 20,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 2,
    "overwrite_output_dir": True,
    "per_device_eval_batch_size": 4,
    "per_device_train_batch_size": 4,
    "save_steps": 50,
    "save_total_limit": 1,
    "gradient_accumulation_steps": 3,
    "warmup_ratio": 0.2,
    "warmup_steps" : 20,
}

training_config = TrainingArguments(**training_config)

In [4]:
lora_config = {
    "r": 8,
    "lora_alpha": 16,
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": "CAUSAL_LM",
    # "target_modules": ["qkv_proj"],
    "target_modules": ["v_proj", "k_proj", "q_proj", "o_proj"],
}

lora_config = LoraConfig(**lora_config)

In [5]:
# checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/phi-1_5"
checkpoint_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.bfloat16,
    llm_int8_enable_fp32_cpu_offload = True
)

model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    device_map=None,
    attn_implementation='eager',
)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_path,
)

tokenizer.model_max_length = MAX_SEQ_LENGTH
# use unk rather than eos token to prevent endless generation
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
# tokenizer.pad_token = tokenizer.unk_token
# tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
tokenizer.padding_side = 'right'

In [7]:
train_dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split='train_sft[:5000]')
test_dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split='test_sft[:500]')

column_names = list(train_dataset.features)

In [8]:
# def apply_chat_template(messages, tokenizer):
#     prompt = ""
#     for m in messages["messages"]:
#         prompt+= f"{m['role']}: {m['content']}\n"
#     messages["text"] = prompt
#     return messages

In [9]:
def apply_chat_template(
    example,
    tokenizer,
):
    messages = example["messages"]
    example["text"] = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    return example

In [10]:
processed_train_dataset = train_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to train_sft",
)

processed_test_dataset = test_dataset.map(
    apply_chat_template,
    fn_kwargs={"tokenizer": tokenizer},
    num_proc=10,
    remove_columns=column_names,
    desc="Applying chat template to test_sft",
)

In [11]:
trainer = SFTTrainer(
    model=model,
    args=training_config,
    peft_config=lora_config,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
)

Tokenizing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2563 > 2048). Running this sequence through the model will result in indexing errors


Truncating eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [12]:
train_result = trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 256.00 MiB. GPU 0 has a total capacity of 8.00 GiB of which 0 bytes is free. Of the allocated memory 14.03 GiB is allocated by PyTorch, and 256.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

In [None]:
tokenizer.padding_side = 'left'

metrics = trainer.evaluate()
metrics["eval_samples"] = len(processed_test_dataset)

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)

In [None]:
trainer.save_model(train_conf.output_dir)

In [None]:
# !rm -r ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-mini-4k-instruct
# !rm -r ~/.cache/huggingface/datasets/