## 加载模型

In [1]:
# from transformers import AutoModelForCausalLM, AutoTokenizer
from modelscope import AutoTokenizer, AutoModelForCausalLM
import torch
# PyTorch names its NVIDIA accelerator device type "cuda"; "gpu" is not a
# valid torch device string and would fail as soon as it was handed to
# torch.device(...) or .to(...).
device = "cuda" if torch.cuda.is_available() else "cpu"
# Load the causal LM from the local ./model directory with bfloat16 weights;
# device_map="auto" leaves weight placement to the loader.
model = AutoModelForCausalLM.from_pretrained("./model", dtype=torch.bfloat16, device_map="auto")
# Make the input embeddings produce gradients. The training cell turns on
# gradient checkpointing, which presumably needs this when the base weights
# are frozen under LoRA — TODO confirm against the transformers docs.
model.enable_input_require_grads()
tokenizer = AutoTokenizer.from_pretrained("./model")
# Reuse the end-of-sequence token for padding so batches can be padded.
# NOTE(review): assumes the checkpoint's tokenizer has no dedicated pad token.
tokenizer.pad_token = tokenizer.eos_token

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## 加载数据集

In [2]:
from datasets import load_dataset
# Read the local JSON file; the records are exposed under the "train" key.
dataset = load_dataset(
    "json",
    data_files="huanhuan.json",
)
# Preview the first three records (last expression, so the cell displays it).
dataset["train"][0:3]

{'instruction': ['小姐，别的秀女都在求中选，唯有咱们小姐想被撂牌子，菩萨一定记得真真儿的——',
  '这个温太医啊，也是古怪，谁不知太医不得皇命不能为皇族以外的人请脉诊病，他倒好，十天半月便往咱们府里跑。',
  '嬛妹妹，刚刚我去府上请脉，听甄伯母说你来这里进香了。'],
 'input': ['', '', ''],
 'output': ['嘘——都说许愿说破是不灵的。', '你们俩话太多了，我该和温太医要一剂药，好好治治你们。', '出来走走，也是散心。']}

In [3]:
def process_data(example, max_length=384):
    """Convert one dataset record into tokenized causal-LM training features.

    The record's ``instruction`` and ``input`` strings are concatenated into
    the user turn of a chat prompt (with a fixed system message), and the
    ``output`` string followed by the tokenizer's EOS token becomes the
    response. Prompt positions are labelled ``-100`` so that only the
    response tokens carry real label ids.

    Relies on the module-level ``tokenizer`` defined earlier in the notebook.

    Args:
        example: Mapping with string fields ``instruction`` and ``output``
            and a field ``input`` that may be a string, empty, or ``None``.
        max_length: Maximum number of tokens kept per sample. Longer
            sequences are cut from the right. ``None`` disables truncation.

    Returns:
        dict with three equally long lists: ``input_ids``,
        ``attention_mask`` and ``labels``.
    """
    # A null/missing-valued "input" would otherwise raise TypeError on str + None.
    user_content = example["instruction"] + (example["input"] or "")
    prompt_text = tokenizer.apply_chat_template(
        [
            {"role": "system", "content": "现在你要扮演皇帝身边的女人--甄嬛"},
            {"role": "user", "content": user_content},
        ],
        add_generation_prompt=True,
        tokenize=False,
    )
    # The chat template is rendered to text first, so no extra special tokens
    # are added when that text is tokenized.
    prompt = tokenizer(prompt_text, add_special_tokens=False)
    response = tokenizer(example["output"] + tokenizer.eos_token, add_special_tokens=False)

    input_ids = prompt["input_ids"] + response["input_ids"]
    attention_mask = prompt["attention_mask"] + response["attention_mask"]
    # -100 on the prompt positions; response positions keep their token ids.
    labels = [-100] * len(prompt["input_ids"]) + response["input_ids"]

    # Right-truncate all three lists together so they stay aligned.
    # NOTE(review): a truncated sample loses its trailing EOS token.
    return {
        "input_ids": input_ids[:max_length],
        "attention_mask": attention_mask[:max_length],
        "labels": labels[:max_length],
    }
# Tokenize every training record and drop the original columns so that only
# the features returned by process_data remain.
train_split = dataset["train"]
processed_dataset = train_split.map(
    process_data,
    remove_columns=train_split.column_names,
)

Map:   0%|          | 0/3729 [00:00<?, ? examples/s]

## LoRA微调

In [4]:
from peft import LoraConfig, get_peft_model, TaskType
# Names of the projection modules that receive LoRA adapters.
lora_targets = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
# Rank-8 adapters with alpha 32 and 10% dropout, configured for training
# (inference_mode=False) on a causal-LM task.
config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=lora_targets,
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
)
# NOTE(review): rebinding `model` makes this cell non-idempotent; re-running
# it without reloading would pass an already-wrapped model to get_peft_model.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 20,971,520 || all params: 8,051,232,768 || trainable%: 0.2605


In [5]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import wandb
# Start a Weights & Biases run in offline mode.
wandb.init(project="llm-learning", mode="offline")

# 4 samples per device x 4 accumulation steps = 16 samples per device per
# optimizer step. Loss is logged every 10 steps and a checkpoint is written
# every 100 steps.
training_args = TrainingArguments(
    output_dir='./output/huanhuan_llama3_lora',
    run_name="chat-huanhuan",
    report_to="wandb",
    num_train_epochs=3,
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    logging_steps=10,
    save_steps=100,
    save_on_each_node=True,
)

# Pads each batch dynamically using the tokenizer.
collator = DataCollatorForSeq2Seq(tokenizer, padding=True)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset,
    data_collator=collator,
)
# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,3.2694
20,3.0323
30,2.8916
40,2.8276
50,2.8489
60,2.8104
70,2.7988
80,2.824
90,2.823
100,2.7288


TrainOutput(global_step=702, training_loss=2.2997246787079377, metrics={'train_runtime': 951.6071, 'train_samples_per_second': 11.756, 'train_steps_per_second': 0.738, 'total_flos': 7.214745539837952e+16, 'train_loss': 2.2997246787079377, 'epoch': 3.0})

## 模型验证

In [33]:
# Switch to evaluation mode before generating.
model.eval()
prompt = "嬛嬛你怎么了，朕想你了！"
# Same system message as used when building the training samples.
messages = [
    {"role": "system", "content": "现在你要扮演皇帝身边的女人--甄嬛"},
    {"role": "user", "content": prompt},
]
chat_text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)
inputs = tokenizer(chat_text, add_special_tokens=False, return_tensors="pt").to(model.device)
# Number of prompt tokens; used to strip the prompt from the generated ids.
prompt_len = inputs["input_ids"].shape[1]
output_ids = model.generate(**inputs, max_new_tokens=512)
gen_ids = output_ids[0][prompt_len:]
print("皇上：" + prompt)
print("嬛嬛：" + tokenizer.decode(gen_ids, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


皇上：嬛嬛你怎么了，朕想你了！
嬛嬛：臣妾在此，皇上怎么会不见臣妾？
