In [None]:
!pip install -U transformers bitsandbytes einops accelerate peft datasets

In [None]:
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Hub id of the checkpoint to fine-tune; the tokenizer is loaded from the same id.
modelpath = "typeof/phi-2-qlora-ft"

# 4-bit NF4 weight quantization, with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # allows modeling code shipped with the checkpoint to run
    quantization_config=bnb_config,
)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published under the same id as the model.
# use_fast=False is forwarded to from_pretrained to request the non-"fast" tokenizer class.
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

In [4]:
# Register the ChatML start marker and a dedicated padding token, then use "<PAD>" for padding.
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"

# "<|im_end|>" terminates each templated message, so it is registered as the EOS token.
tokenizer.add_special_tokens({"eos_token": "<|im_end|>"})

# Resize the embeddings to the enlarged vocabulary (padded to a multiple of 64)
# and point the model config at the new EOS id.
model.resize_token_embeddings(new_num_tokens=len(tokenizer), pad_to_multiple_of=64)
model.config.eos_token_id = tokenizer.eos_token_id

In [5]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# LoRA adapter settings: rank-32 adapters on the attention projections, while the
# modules listed in modules_to_save are trained and stored in full alongside the adapter.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=["Wqkv", "out_proj"],
    modules_to_save=["lm_head", "embed_tokens"],
)

# Prepare the quantized model for training (gradient checkpointing left off),
# then wrap it with the LoRA adapters.
model = get_peft_model(
    prepare_model_for_kbit_training(model, use_gradient_checkpointing=False),
    lora_config,
)
model.config.use_cache = False

In [6]:
from datasets import load_dataset

# NOTE(review): the dataset id is an empty string in this notebook — presumably it was
# stripped before sharing. Fill in the Hub id (or local path) before running.
dataset = load_dataset("")

# Hold out 10% of the "train" split for evaluation. The fixed seed makes the shuffle,
# and therefore the train/eval membership and reported validation loss, reproducible.
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

Downloading readme:   0%|          | 0.00/845 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1682 [00:00<?, ? examples/s]

In [9]:
import os
from functools import partial

# Label value used to mask positions in the `labels` sequences built below
# (user turns and padding).
IGNORE_INDEX = -100

# ChatML wrappers, indexed by "is this a user turn": 0 -> assistant, 1 -> user.
templates = [
    "<|im_start|>assistant\n{msg}<|im_end|>",
    "<|im_start|>user\n{msg}<|im_end|>",
]

def tokenize(input, max_length):
    """Turn one conversation into a flat, ChatML-formatted training example.

    Messages in ``input["messages"]`` are treated as alternating turns: even
    positions are user turns, odd positions are assistant turns. Each turn is
    wrapped in its template, tokenized on its own, and concatenated.

    Args:
        input: Mapping with a "messages" sequence of message strings.
        max_length: Maximum number of tokens kept; anything beyond is cut off.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels" lists of equal
        length. Labels equal the token ids on assistant turns and IGNORE_INDEX
        on user turns.
    """
    ids, mask, targets = [], [], []

    for turn, text in enumerate(input["messages"]):
        from_user = turn % 2 == 0
        template = templates[1] if from_user else templates[0]
        encoded = tokenizer(
            template.format(msg=text),
            truncation=False,
            add_special_tokens=False,
        )
        turn_ids = encoded["input_ids"]

        ids.extend(turn_ids)
        mask.extend(encoded["attention_mask"])
        if from_user:
            # User turns contribute context only: mask every position.
            targets.extend([IGNORE_INDEX] * len(turn_ids))
        else:
            targets.extend(turn_ids)

    # Truncate the concatenated conversation as a whole, not per turn.
    return {
        "input_ids": ids[:max_length],
        "attention_mask": mask[:max_length],
        "labels": targets[:max_length],
    }

# Tokenize each conversation on its own (batched=False), capped at 1024 tokens,
# and drop the original columns so only the tokenized fields remain.
tokenize_fn = partial(tokenize, max_length=1024)

dataset_tokenized = dataset.map(
    tokenize_fn,
    batched=False,
    remove_columns=dataset["train"].column_names,
    num_proc=os.cpu_count(),
)

Map (num_proc=2):   0%|          | 0/1513 [00:00<?, ? examples/s]

Map (num_proc=2):   0%|          | 0/169 [00:00<?, ? examples/s]

In [11]:
def collate(elements):
    """Pad a list of tokenized samples to a common length and stack them into tensors.

    Every sample is right-padded to the length of the longest "input_ids" in
    the batch: input ids with the tokenizer's pad token id, labels with
    IGNORE_INDEX, and the attention mask with 0.

    The samples passed in are not modified; padded copies are built instead, so
    collating the same sample objects again gives the same result.

    Args:
        elements: Non-empty list of dicts with "input_ids", "labels" and
            "attention_mask" lists of equal length per sample.

    Returns:
        Dict with "input_ids", "labels" and "attention_mask" tensors, each of
        shape (len(elements), longest_sample_length).

    Raises:
        ValueError: If ``elements`` is empty (raised by ``max``).
    """
    tokens_maxlen = max(len(e["input_ids"]) for e in elements)

    input_ids, labels, attention_mask = [], [], []
    for sample in elements:
        pad_len = tokens_maxlen - len(sample["input_ids"])

        # Concatenate into new lists rather than extending the sample's own lists.
        input_ids.append(list(sample["input_ids"]) + pad_len * [tokenizer.pad_token_id])
        labels.append(list(sample["labels"]) + pad_len * [IGNORE_INDEX])
        attention_mask.append(list(sample["attention_mask"]) + pad_len * [0])

    return {
        "input_ids": torch.tensor(input_ids),
        "labels": torch.tensor(labels),
        "attention_mask": torch.tensor(attention_mask),
    }


In [12]:
from transformers import TrainingArguments, Trainer

bs = 1          # per-device training batch size
ga_steps = 16   # gradient accumulation steps (bs * ga_steps samples per optimizer update)
epochs = 1
lr = 0.00002

# Optimizer updates per epoch; the evaluation and checkpoint intervals are derived from it.
# Clamped to at least 1 so a train split smaller than bs * ga_steps does not produce
# a zero step interval.
steps_per_epoch = max(1, len(dataset_tokenized["train"]) // (bs * ga_steps))

args = TrainingArguments(
    output_dir="out",
    # Batching
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=ga_steps,
    group_by_length=False,
    # Optimisation: constant learning rate, paged 32-bit AdamW, bf16 mixed precision off
    num_train_epochs=epochs,
    learning_rate=lr,
    lr_scheduler_type="constant",
    optim="paged_adamw_32bit",
    bf16=False,
    # Logging / evaluation / checkpointing: log every step, evaluate twice per epoch,
    # save once per epoch.
    # NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    logging_steps=1,
    evaluation_strategy="steps",
    eval_steps=steps_per_epoch//2,
    save_steps=steps_per_epoch,
    ddp_find_unused_parameters=False,
)

# Wire the PEFT-wrapped model, the tokenized splits and the custom padding collator
# into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_tokenized["train"],
    eval_dataset=dataset_tokenized["test"],
    data_collator=collate,
    tokenizer=tokenizer,
)


In [13]:
# Run fine-tuning with the Trainer built above; evaluation and checkpoint intervals
# come from the step settings in `args`.
trainer.train()

Step,Training Loss,Validation Loss
47,1.9816,1.849631
94,1.7667,1.659504


TrainOutput(global_step=94, training_loss=1.9086840799514284, metrics={'train_runtime': 1263.5216, 'train_samples_per_second': 1.197, 'train_steps_per_second': 0.074, 'total_flos': 3673582524672000.0, 'train_loss': 1.9086840799514284, 'epoch': 0.99})

In [16]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# Inputs and output of the merge step.
base_path = "typeof/phi-2-qlora-ft"     # checkpoint the adapter was trained on
adapter_path = "out/checkpoint-94"      # adapter checkpoint written during training
save_to = "merged/phi-2-ft"             # destination directory for the merged model

# Reload the base model in bfloat16; no quantization config is passed here.
base_model = AutoModelForCausalLM.from_pretrained(
    base_path,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    return_dict=True,
)

# Rebuild the tokenizer with the same extra tokens that were added before training, so
# the vocabulary size matches the embeddings stored with the adapter.
# NOTE(review): the training cell loaded the tokenizer with use_fast=False and this call
# does not — confirm both assign the same ids to the added tokens.
tokenizer = AutoTokenizer.from_pretrained(base_path)
tokenizer.add_tokens(["<|im_start|>", "<PAD>"])
tokenizer.pad_token = "<PAD>"
tokenizer.add_special_tokens({"eos_token": "<|im_end|>"})

# Resize the base model's embeddings to the enlarged vocabulary (padded to a multiple
# of 64) and record the new EOS id in its config.
base_model.resize_token_embeddings(new_num_tokens=len(tokenizer), pad_to_multiple_of=64)
base_model.config.eos_token_id = tokenizer.eos_token_id

# Attach the trained adapter to the base model and merge its weights into it.
model = PeftModel.from_pretrained(base_model, adapter_path).merge_and_unload()

# Write the merged weights as safetensors (shards capped at 4GB), then the tokenizer
# files. The tokenizer call is last so its list of written paths is the cell output.
model.save_pretrained(save_to, max_shard_size='4GB', safe_serialization=True)
tokenizer.save_pretrained(save_to)


('merged/phi-2-ft/tokenizer_config.json',
 'merged/phi-2-ft/special_tokens_map.json',
 'merged/phi-2-ft/vocab.json',
 'merged/phi-2-ft/merges.txt',
 'merged/phi-2-ft/added_tokens.json',
 'merged/phi-2-ft/tokenizer.json')

In [21]:
#%cd merged
#%ls
!tar -cvf phi-2-ft.tar phi-2-ft/

phi-2-ft/
phi-2-ft/generation_config.json
phi-2-ft/model.safetensors
phi-2-ft/merges.txt
phi-2-ft/special_tokens_map.json
phi-2-ft/vocab.json
phi-2-ft/config.json
phi-2-ft/tokenizer.json
phi-2-ft/tokenizer_config.json
phi-2-ft/added_tokens.json
