In [1]:
import os
# os.environ["WORLD_SIZE"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer

# 数据处理

In [2]:
ds = Dataset.load_from_disk('data/alpaca_data_zh')
datasets = ds.train_test_split(test_size=0.2)

In [3]:
ckpt = 'Langboat/bloom-1b4-zh'
tokenizer = AutoTokenizer.from_pretrained(ckpt)

def process_function(example):
    MAX_LENGTH = 256
    input_ids, attention_mask, labels = [], [], []
    instruction = example['instruction']
    input_str = example['input']
    instruction_input_seq = "\n".join(["Human: " + instruction, input_str]).strip() + "\n\n Assistant:"
    response_str = example['output'] + tokenizer.eos_token
    tokenized_instruction_input = tokenizer(instruction_input_seq)
    tokenized_response = tokenizer(response_str)
    input_ids = tokenized_instruction_input['input_ids'] + tokenized_response['input_ids']
    attention_mask = tokenized_instruction_input['attention_mask'] + tokenized_response['attention_mask']
    labels = [-100] * len(tokenized_instruction_input['input_ids']) + tokenized_response['input_ids']
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

tokenized_ds = datasets.map(process_function, remove_columns=ds.column_names)     

Map:   0%|          | 0/21486 [00:00<?, ? examples/s]

Map:   0%|          | 0/5372 [00:00<?, ? examples/s]

# 模型加载

In [4]:
model = AutoModelForCausalLM.from_pretrained(ckpt)

In [5]:
unit = 1000
num_params = sum(params.numel() for params in model.parameters())
print(f'Number of parameters: {num_params/unit**3:.2f} B')

model_size = num_params * 4 / unit**3
print(f'Model size: {model_size:.2f} GB')

gradient_size = model_size
print(f'Gradient size: {gradient_size:.2f} GB')

optimizer_state = model_size * 2
print(f'Optimizer state: {optimizer_state:.2f} GB')

total_size = model_size + optimizer_state + gradient_size
print(f'Total size: {total_size:.2f} GB')

Number of parameters: 1.30 B
Model size: 5.21 GB
Gradient size: 5.21 GB
Optimizer state: 10.42 GB
Total size: 20.85 GB


# Lora
## PEFT Step1 配置文件

```
minimum conifg
`config = LoraConfig(task_type=TaskType.CAUSAL_LM)`
key parameters
r: rank
lora_alpha: "step size", actually the learning rate, scaled as lora_alpha / r
modules_to_save: additional modules to train and save as lora weights
```

In [6]:
from peft import LoraConfig, TaskType, get_peft_model
# config = LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["query_key_value"], modules_to_save=["word_embeddings"])
config = LoraConfig(task_type=TaskType.CAUSAL_LM, target_modules=["query_key_value"])
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=8, target_modules={'query_key_value'}, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={})

## 创建 peft model

In [7]:
model = get_peft_model(model, config)

In [8]:
for name, params in model.named_parameters():
    if params.requires_grad:
        print(name)

base_model.model.transformer.h.0.self_attention.query_key_value.lora_A.default.weight
base_model.model.transformer.h.0.self_attention.query_key_value.lora_B.default.weight
base_model.model.transformer.h.1.self_attention.query_key_value.lora_A.default.weight
base_model.model.transformer.h.1.self_attention.query_key_value.lora_B.default.weight
base_model.model.transformer.h.2.self_attention.query_key_value.lora_A.default.weight
base_model.model.transformer.h.2.self_attention.query_key_value.lora_B.default.weight
base_model.model.transformer.h.3.self_attention.query_key_value.lora_A.default.weight
base_model.model.transformer.h.3.self_attention.query_key_value.lora_B.default.weight
base_model.model.transformer.h.4.self_attention.query_key_value.lora_A.default.weight
base_model.model.transformer.h.4.self_attention.query_key_value.lora_B.default.weight
base_model.model.transformer.h.5.self_attention.query_key_value.lora_A.default.weight
base_model.model.transformer.h.5.self_attention.query_

In [9]:
print(config.target_modules)

{'query_key_value'}


```
(query_key_value): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=6144, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=6144, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
```

In [10]:
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 1,304,684,544 || trainable%: 0.120555118647899


# 训练

In [11]:
args = TrainingArguments(output_dir='./lora/',
                        num_train_epochs=1,
                        per_device_train_batch_size=8,
                        gradient_accumulation_steps=4,
                        per_device_eval_batch_size=8,
                        logging_steps=10,
                        load_best_model_at_end=True,
                        evaluation_strategy='epoch',
                        save_strategy='epoch',
                        save_total_limit=1,
                        report_to='none',
                        )
trainer = Trainer(model=model, args=args, train_dataset=tokenized_ds['train'], eval_dataset=tokenized_ds['test'],
                data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True))

In [12]:
trainer.train()

You're using a BloomTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


TrainOutput(global_step=168, training_loss=2.5467050870259604, metrics={'train_runtime': 945.8245, 'train_samples_per_second': 22.717, 'train_steps_per_second': 0.178, 'total_flos': 2.4970438915915776e+16, 'train_loss': 2.5467050870259604, 'epoch': 1.0})

# 推理

In [15]:
from transformers import pipeline
pipe = pipeline('text-generation', model=model, tokenizer=tokenizer, device=0)
ipt = "Human: {}\n{}".format('考试有什么技巧', '').strip() + "\n\nAssistant: "
result = pipe(ipt, max_length=100, num_beams=5)
print(result[0]['generated_text'])

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

Human: 考试有什么技巧

Assistant: 考试有什么技巧？


# load lora weights

In [16]:
from peft import PeftModel

In [None]:
model = AutoModelForCausalLM.from_pretrained(ckpt)
p_model = PeftModel.from_pretrained(model, model_id="./lora/checkpoint-168/")

In [25]:
ipt = tokenizer("Human: {}\n{}".format("考试有哪些技巧？", "").strip() + "\n\nAssistant: ", return_tensors="pt")
ipt = {k: v.to(p_model.device) for k, v in ipt.items()}
print(tokenizer.decode(p_model.generate(**ipt, num_beams=5, 
                    max_length=100, repetition_penalty=1.5)[0], skip_special_tokens=True))

Human: 考试有哪些技巧？

Assistant: 考试的技巧有很多，比如：
1.多做模拟题，熟悉题型和答题思路；
2.合理分配时间，保证每道题都做对；
3.认真审题，排除干扰选项；
4.正确使用答题模板，规范作答；
5.保持良好的答题心态，避免紧张、焦虑等负面情绪影响答题。


# merge model

In [None]:
merge_model = p_model.merge_and_unload()
merge_model.save_pretrained("path_to_save")