In [1]:
import os
import io
import json
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import torch
import torch.nn as nn
from torch.utils.data import Dataset


from peft import get_peft_model, LoraConfig, TaskType
from transformers import BloomForCausalLM, BloomTokenizerFast, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [2]:
# Token length of every training example: passed as `max_length` to SFTDataset,
# whose tokenizer call truncates and pads each example to exactly this length.
max_input_length = 512

In [3]:
def _make_r_io_base(f, mode: str):
    if not isinstance(f, io.IOBase):
        f = open(f, mode=mode)
    return f


def jload(f, mode="r"):
    """Load a .json file into a dictionary.

    Args:
        f: A path to open, or an already-open file object (``io.IOBase``).
        mode: Mode used to open ``f`` when it is a path.

    Returns:
        Whatever ``json.load`` decodes from the file (a dict or a list,
        depending on the file's top-level value).

    Raises:
        json.JSONDecodeError: If the file content is not valid JSON.

    Note:
        The file object is always closed before returning, including one the
        caller passed in already open.
    """
    # The context manager closes the handle even when json.load raises;
    # previously a malformed file left the handle open.
    with _make_r_io_base(f, mode) as fh:
        return json.load(fh)


# Prompt templates rendered with str.format_map for each training example.
# "prompt_input" is used when an example has a non-empty "input" field,
# "prompt_no_input" otherwise.
PROMPT_DICT = dict(
    prompt_input=(
        "Below is an instruction that describes a task, "
        "paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Input:\n{input}\n\n"
        "### Response:"
    ),
    prompt_no_input=(
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n"
        "### Response:"
    ),
)


def get_dataset_from_jsonl(jsonl_file, tokenizer=None):
    """Build (prompt, target) string pairs from an instruction-tuning file.

    Despite the name, the file is read with ``jload`` (i.e. ``json.load``), so
    it must be one JSON document containing a list of example dicts, not
    line-delimited JSON.

    Args:
        jsonl_file: Path or open file object accepted by ``jload``.
        tokenizer: Optional object exposing an ``eos_token`` string. When it is
            ``None`` or its ``eos_token`` is unset, targets get no EOS suffix.

    Returns:
        A ``zip`` iterator of ``(source, target)`` tuples. It is single-use:
        iterate it once or materialise it with ``list``.

    Raises:
        KeyError: If an example lacks ``"instruction"`` or ``"output"``.
    """
    list_data_dict = jload(jsonl_file)

    # Resolve the EOS suffix once. Reading tokenizer.eos_token directly crashed
    # for the declared default tokenizer=None, and a tokenizer whose eos_token
    # is None would have appended the literal text "None" to every target.
    eos_token = getattr(tokenizer, "eos_token", None) or ""

    prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
    sources = [
        # Examples with a non-empty "input" field use the template that has an Input section.
        (prompt_input if example.get("input", "") != "" else prompt_no_input).format_map(example)
        for example in list_data_dict
    ]
    targets = [f"{example['output']}{eos_token}" for example in list_data_dict]

    return zip(sources, targets)


class SFTDataset(Dataset):
    """Supervised fine-tuning dataset of prompt + response strings.

    Each item is one example's prompt concatenated with its target text,
    tokenized on access, and truncated/padded to ``max_length`` tokens.
    """

    def __init__(self, train_path, tokenizer, split, max_length=2048, max_data_size=0, valid_ratio=0.01):
        """Load the examples and select the requested split.

        Args:
            train_path: File passed to ``get_dataset_from_jsonl``.
            tokenizer: Tokenizer used for the EOS suffix and for encoding items.
            split: Split name; any value containing ``"valid"`` keeps only the
                first 10 examples, anything else keeps all of them.
            max_length: Token length every item is truncated/padded to.
            max_data_size: If non-zero, keep only the first ``max_data_size``
                examples (applied before the split selection).
            valid_ratio: Currently unused.

        NOTE(review): the "valid" split is a prefix of the same examples the
        "train" split uses, so the two overlap; ``valid_ratio`` is ignored.
        """
        dataset = get_dataset_from_jsonl(train_path, tokenizer=tokenizer)
        self.post_list = [s + t for s, t in dataset]

        if max_data_size != 0:
            self.post_list = self.post_list[:max_data_size]

        if "valid" in split:
            self.post_list = self.post_list[0:10]
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Never populated by this class; kept so existing attribute access still works.
        self.input_ids = []
        self.attn_masks = []

    def __len__(self):
        """Return the number of examples in the selected split."""
        return len(self.post_list)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return model inputs with labels.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``
            tensors, each of length ``max_length``. ``labels`` is a copy of
            ``input_ids`` with padding positions set to -100.
        """
        txt = self.post_list[idx]
        encodings_dict = self.tokenizer(txt, truncation=True, max_length=self.max_length, padding="max_length")
        input_ids = torch.tensor(encodings_dict["input_ids"])
        attn_masks = torch.tensor(encodings_dict["attention_mask"])

        # Labels must not alias input_ids (an in-place edit of one would
        # silently change the other), and padding must not contribute to the
        # loss: -100 is the index ignored by the cross-entropy loss. Masking
        # by attention mask, not by pad id, keeps a real EOS token trainable
        # even when the tokenizer reuses EOS as its pad token.
        labels = input_ids.clone()
        labels[attn_masks == 0] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attn_masks,
            "labels": labels,
        }

In [4]:
# Data and model locations. Both can be overridden through environment
# variables so the notebook is not tied to one machine's directory layout;
# the defaults are the original hard-coded values.
data_path = os.environ.get("SFT_DATA_PATH", "./lora_test/niuyeye.json")
model_path = os.environ.get("BLOOM_MODEL_PATH", "/home/xinyu/bloomz-560m/")
tokenizer = BloomTokenizerFast.from_pretrained(model_path)
# Only GPU 0 is visible to this process (CUDA_VISIBLE_DEVICES is set in the
# import cell), so automatic device placement has a single GPU to choose from.
model = BloomForCausalLM.from_pretrained(model_path, device_map='auto')

In [5]:
# LoRA adapter settings, consumed by get_peft_model() in the next cell.
# Parameter meanings below follow the peft LoraConfig documentation.
# NOTE(review): target_modules is not set, so which layers receive adapters is
# left to peft's defaults for this architecture — confirm that is intended.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,  # adapters are set up for causal language modelling
    r=8,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling parameter
    lora_dropout=0.05,  # dropout probability applied inside the LoRA layers
    bias="none",  # no bias parameters are trained
)

In [6]:
# Wrap the base model according to lora_config.
# NOTE(review): this rebinds `model`, so the cell is not idempotent — running
# it a second time passes the already-wrapped model back into get_peft_model.
# Re-run the model-loading cell first if this cell needs to be repeated.
model = get_peft_model(model, lora_config)
# Prints trainable vs. total parameter counts (see the cell output).
model.print_trainable_parameters()



trainable params: 786432 || all params: 560001024 || trainable%: 0.14043402892063284


In [7]:
# Training split over the whole file, with every example truncated/padded to
# max_input_length tokens.
train_dataset = SFTDataset(
    train_path=data_path,
    tokenizer=tokenizer,
    split="train",
    max_length=max_input_length,
)

In [8]:
# Optimisation settings: 200 optimizer steps, each accumulating 4 micro-batches
# of size 1 per device, with mixed-precision (fp16) training and a loss log
# entry every 10 steps.
training_args = TrainingArguments(
    output_dir='outputs',
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=100,
    max_steps=200,
    fp16=True,
    logging_steps=10,
)

# mlm=False selects causal (next-token) language modelling batches.
lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=lm_collator,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,8.0204
20,7.6245
30,6.1291
40,4.5244
50,3.0467
60,1.2397
70,0.2879
80,0.0546
90,0.0588
100,0.04


TrainOutput(global_step=200, training_loss=1.5701970191299914, metrics={'train_runtime': 80.3606, 'train_samples_per_second': 9.955, 'train_steps_per_second': 2.489, 'total_flos': 744898324070400.0, 'train_loss': 1.5701970191299914, 'epoch': 26.67})

In [9]:
# Persist the fine-tuned model under outputs/niuyeye/.
# NOTE(review): `model` is the object returned by get_peft_model at this point,
# so this is expected to write the LoRA adapter weights/config, not a full
# copy of the base model — confirm against the peft save_pretrained docs.
model.save_pretrained("outputs/niuyeye/")