# 大语言模型预训练-微调技术之Prefix Tuning

## 步骤1 导入相关包

In [None]:
import os

# The Hugging Face libraries read HF_ENDPOINT / HF_HOME when they are first
# imported, so both variables have to be set BEFORE `datasets` and
# `transformers` are imported; assigning them afterwards has no effect on the
# already-imported modules in this process.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'  # hub endpoint (mirror site)
os.environ['HF_HOME'] = '/root/autodl-tmp/cache/'  # root directory of the HF cache

from datasets import load_dataset  # noqa: E402  (must follow the env setup)
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForSeq2Seq, TrainingArguments, Trainer  # noqa: E402

## 步骤2 加载数据集

In [None]:
# Request the "train" split explicitly. Without `split`, load_dataset returns a
# DatasetDict keyed by split name: it cannot be sliced by row, its
# `column_names` is a per-split dict, and it cannot be handed to Trainer as
# `train_dataset` -- all of which the following cells rely on.
ds = load_dataset("llm-wizard/alpaca-gpt4-data-zh", split="train")
ds

In [None]:
# Preview the first record as a dict of columns.
# NOTE(review): row slicing only works when `ds` is a single-split Dataset;
# a DatasetDict accepts only split names as keys -- confirm what
# load_dataset returned in the previous cell.
ds[:1]

## 步骤3 数据集预处理

In [None]:
# Tokenizer of the Langboat/bloom-800m-zh checkpoint, referenced by its hub id.
# The original note recommends downloading the checkpoint in advance so that
# this load is fast. Alternative checkpoint: "bigscience/bloomz-560m".
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Langboat/bloom-800m-zh",
)
tokenizer

In [None]:
def process_func(example, max_length=256):
    """Turn one instruction record into causal-LM training features.

    The prompt is "Human: <instruction>", a newline and "<input>" (stripped of
    surrounding whitespace), followed by a blank line and "Assistant: ". The
    target is the record's output with the tokenizer's EOS token appended.
    Prompt and target are tokenized separately with the module-level
    ``tokenizer`` and then concatenated.

    Args:
        example: Mapping with the string fields "instruction", "input" and
            "output".
        max_length: Maximum number of tokens kept per sample (default 256).
            Longer samples are truncated from the end.

    Returns:
        A dict with "input_ids", "attention_mask" and "labels", all lists of
        equal length. Prompt positions in "labels" are -100 so that they are
        excluded from the loss; target positions carry the target token ids.
    """
    prompt_text = "\n".join(["Human: " + example["instruction"], example["input"]]).strip() + "\n\nAssistant: "
    instruction = tokenizer(prompt_text)
    response = tokenizer(example["output"] + tokenizer.eos_token)

    input_ids = instruction["input_ids"] + response["input_ids"]
    attention_mask = instruction["attention_mask"] + response["attention_mask"]
    # Only the response tokens contribute to the loss; the prompt is masked.
    labels = [-100] * len(instruction["input_ids"]) + response["input_ids"]

    # Slicing is a no-op for samples that already fit. NOTE: truncation keeps
    # the head of the sequence, so an over-long sample loses its trailing EOS.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }

In [None]:
# Run process_func over every record and drop the original columns, leaving
# only the features that process_func returns.
tokenized_ds = ds.map(
    function=process_func,
    remove_columns=ds.column_names,
)
tokenized_ds

## 步骤4 创建模型

In [None]:
# Base causal language model that the prefix adapter will be attached to.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="Langboat/bloom-800m-zh",
    low_cpu_mem_usage=True,
)
# Display the dtype of the loaded weights.
model.dtype

### 1、PEFT 步骤1 配置文件

In [None]:
from peft import PrefixTuningConfig, get_peft_model, TaskType

# Prefix-tuning configuration for a causal LM with 10 virtual tokens.
# prefix_projection=True adds a re-parameterization layer, which the original
# note reports to work well for soft-prompt style tuning.
config = PrefixTuningConfig(
    task_type=TaskType.CAUSAL_LM,
    num_virtual_tokens=10,
    prefix_projection=True,
)
config

### 2、PEFT 步骤2 创建模型

In [None]:
# Wrap the base model with the adapter described by `config`; the name `model`
# is rebound to the resulting PEFT model.
model = get_peft_model(model=model, peft_config=config)
model

In [None]:
# Print the number of trainable parameters in the model.
model.print_trainable_parameters()

## 步骤5 配置训练参数

In [None]:
args = TrainingArguments(
    # Directory that receives the training outputs.
    output_dir="/root/autodl-tmp/cache/finetuning/bloom-800m-zh-prefix",
    # Batch size per device (e.g. per GPU) during training.
    per_device_train_batch_size=4,
    # Accumulate gradients over 8 batches before each parameter update.
    gradient_accumulation_steps=8,
    # Write a log entry every 10 steps.
    logging_steps=10,
    # Train for a single epoch.
    num_train_epochs=1,
)

## 步骤6 创建训练器

In [None]:
# Collator that batches the tokenized samples with padding enabled.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# Trainer tying together the PEFT model, the training arguments and the data.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_ds,
    data_collator=collator,
)

## 步骤7 模型训练

In [None]:
# Start training with the trainer configured above.
trainer.train()

## 步骤8 模型推理

In [None]:
from peft import PeftModel
from transformers import pipeline

# Reload the base model and its tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path="Langboat/bloom-800m-zh",
    low_cpu_mem_usage=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="Langboat/bloom-800m-zh",
)

# Attach the prefix-tuning weights stored in the checkpoint-500 directory of
# the training output folder.
p_model = PeftModel.from_pretrained(
    model=model,
    model_id="/root/autodl-tmp/cache/finetuning/bloom-800m-zh-prefix/checkpoint-500",
)

# Text-generation pipeline on device 0.
pipe = pipeline(task="text-generation", model=p_model, tokenizer=tokenizer, device=0)
# Build the prompt with the same "Human: ... Assistant: " template that
# process_func uses for the training samples.
ipt = "Human: {}\n{}".format("如何写好一个简历？", "").strip() + "\n\nAssistant: "
pipe(ipt, max_length=256, do_sample=True)