In [1]:
from datasets import load_dataset
from peft import LoraConfig
import torch
from trl import SFTTrainer
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
)

In [2]:
# Maximum sequence length (in tokens); assigned to `tokenizer.model_max_length` below.
MAX_SEQ_LENGTH = 2048
# Directory handed to the training arguments as `output_dir`.
OUTPUT_DIR = "./outputs"

In [8]:
# Hyperparameters for the fine-tuning run, built directly as a TrainingArguments
# object so `training_config` only ever refers to one kind of value.
training_config = TrainingArguments(
    # --- run layout ---
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=2,
    # --- batching: 4 samples x 2 accumulation steps per optimizer update, per device ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    # --- optimizer and schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_steps=20,
    weight_decay=0.001,
    bf16=True,
    # --- logging and checkpointing ---
    logging_strategy="steps",
    logging_steps=20,
    save_steps=50,
    save_total_limit=1,
)

In [6]:
# LoRA adapter settings, built directly as a LoraConfig object.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    use_rslora=True,
    # Alternative target list kept from an earlier experiment:
    # target_modules=["qkv_proj"],
    target_modules=["v_proj", "k_proj", "q_proj", "o_proj"],
)

In [5]:
# Base checkpoint to fine-tune; the commented lines are alternatives tried earlier.
# checkpoint_path = "microsoft/Phi-3-mini-4k-instruct"
# checkpoint_path = "microsoft/phi-1_5"
checkpoint_path = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# Load the weights in 8-bit through bitsandbytes.
# BitsAndBytesConfig only exposes a compute-dtype option for 4-bit loading
# (`bnb_4bit_compute_dtype`). The `bnb_8bit_compute_dtype` keyword used here
# previously is not a recognised option and was ignored, so it has been removed.
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForCausalLM.from_pretrained(
    checkpoint_path,
    use_cache=False,
    # NOTE(review): trust_remote_code allows the hub repository to run its own
    # Python code at load time - confirm it is actually required for this checkpoint.
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    device_map=None,
    attn_implementation='eager',
)

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [9]:
# Load the tokenizer that matches the model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Cap the tokenizer's advertised maximum length at the notebook-wide limit.
tokenizer.model_max_length = MAX_SEQ_LENGTH
# Padding currently reuses the EOS token.
# NOTE(review): an earlier version padded with the unk token instead (kept,
# commented out, below) with the stated aim of preventing endless generation -
# confirm which of the two is intended.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.eos_token)
# tokenizer.pad_token = tokenizer.unk_token
# tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
# Pad on the right for training; the evaluation cell later switches this to 'left'.
tokenizer.padding_side = 'right'

tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

In [10]:
# Take small slices of the UltraChat SFT splits: the first 5000 training
# conversations and the first 500 test conversations.
ultrachat_repo = "HuggingFaceH4/ultrachat_200k"
train_dataset = load_dataset(ultrachat_repo, split="train_sft[:5000]")
test_dataset = load_dataset(ultrachat_repo, split="test_sft[:500]")

# Original column names, recorded so they can be dropped after preprocessing.
column_names = list(train_dataset.features.keys())

README.md:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

data/train_sft-00000-of-00003-a3ecf92756(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_sft-00001-of-00003-0a1804bcb6(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_sft-00002-of-00003-ee46ed25cf(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/test_sft-00000-of-00001-f7dfac4afe5(…):   0%|          | 0.00/81.2M [00:00<?, ?B/s]

data/train_gen-00000-of-00003-a6c9fb894b(…):   0%|          | 0.00/244M [00:00<?, ?B/s]

data/train_gen-00001-of-00003-d6a0402e41(…):   0%|          | 0.00/243M [00:00<?, ?B/s]

data/train_gen-00002-of-00003-c0db75b92a(…):   0%|          | 0.00/243M [00:00<?, ?B/s]

data/test_gen-00000-of-00001-3d4cd830914(…):   0%|          | 0.00/80.4M [00:00<?, ?B/s]

Generating train_sft split:   0%|          | 0/207865 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/23110 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/256032 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/28304 [00:00<?, ? examples/s]

In [8]:
# def apply_chat_template(messages, tokenizer):
#     prompt = ""
#     for m in messages["messages"]:
#         prompt+= f"{m['role']}: {m['content']}\n"
#     messages["text"] = prompt
#     return messages

In [11]:
def apply_chat_template(
    example,
    tokenizer,
):
    """Render one conversation into a single training string.

    Reads the message list stored under ``example["messages"]``, formats it
    with the tokenizer's chat template (as text, without a trailing
    generation prompt), and stores the result under ``example["text"]``.

    Args:
        example: Mapping with a ``"messages"`` entry; it is modified in place.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        The same ``example`` object, now carrying the ``"text"`` entry.
    """
    rendered = tokenizer.apply_chat_template(
        example["messages"],
        add_generation_prompt=False,
        tokenize=False,
    )
    example["text"] = rendered
    return example

In [12]:
def _render_chat_text(split, progress_label):
    """Map `apply_chat_template` over one dataset split, dropping the original columns."""
    return split.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=10,
        remove_columns=column_names,
        desc=progress_label,
    )


# Both splits go through the same preprocessing; only the progress label differs.
processed_train_dataset = _render_chat_text(
    train_dataset, "Applying chat template to train_sft"
)

processed_test_dataset = _render_chat_text(
    test_dataset, "Applying chat template to test_sft"
)

Applying chat template to train_sft (num_proc=10):   0%|          | 0/5000 [00:00<?, ? examples/s]

Applying chat template to test_sft (num_proc=10):   0%|          | 0/500 [00:00<?, ? examples/s]

In [13]:
# Build the supervised fine-tuning trainer around the quantized base model and
# the LoRA adapter configuration.
trainer = SFTTrainer(
    model=model,
    args=training_config,
    peft_config=lora_config,
    train_dataset=processed_train_dataset,
    eval_dataset=processed_test_dataset,
    # Hand over the tokenizer configured above. When no processing class is
    # given, SFTTrainer loads a fresh tokenizer from the model checkpoint, so
    # the pad-token, padding-side and max-length settings would never apply.
    processing_class=tokenizer,
)
# NOTE(review): the truncation length used by the trainer comes from its own
# config, which is not set here - confirm MAX_SEQ_LENGTH is honoured as intended.

Converting train dataset to ChatML:   0%|          | 0/5000 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3648 > 2048). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/500 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [14]:
# Run fine-tuning; the returned result is kept so its `.metrics` can be logged and saved.
train_result = trainer.train()



Step,Training Loss
20,1.1782
40,1.1769
60,1.1999
80,1.1057
100,1.165
120,1.1735
140,1.138
160,1.1649
180,1.1429




KeyboardInterrupt: 

In [None]:
# Report the training metrics to the log, write them to disk, then save the trainer state.
metrics = train_result.metrics
trainer.log_metrics(split="train", metrics=metrics)
trainer.save_metrics(split="train", metrics=metrics)
trainer.save_state()

In [None]:
# Switch the tokenizer to left padding before evaluating.
tokenizer.padding_side = "left"

# Evaluate on the held-out split and record how many samples it contained.
metrics = trainer.evaluate()
metrics.update({"eval_samples": len(processed_test_dataset)})

trainer.log_metrics(split="eval", metrics=metrics)
trainer.save_metrics(split="eval", metrics=metrics)

In [None]:
# Persist the fine-tuned model to the run's output directory.
# (`train_conf` was never defined in this notebook; the training arguments live in `training_config`.)
trainer.save_model(training_config.output_dir)

In [None]:
# !rm -r ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-3-mini-4k-instruct
# !rm -r ~/.cache/huggingface/datasets/