## 1. 環境準備

In [None]:
#!pip install transformers datasets accelerate peft trl bitsandbytes pandas

## 2. 資料集載入 & 查看結構

In [1]:
from datasets import load_dataset

# 載入整個
dataset_all = load_dataset("trl-lib/Capybara")

# 查看樣本數
print(dataset_all)
print("Train samples:", len(dataset_all['train']))
print("Test samples:", len(dataset_all['test']))
print(dataset_all['train'][0]["messages"])

DatasetDict({
    train: Dataset({
        features: ['source', 'messages', 'num_turns'],
        num_rows: 15806
    })
    test: Dataset({
        features: ['source', 'messages', 'num_turns'],
        num_rows: 200
    })
})
Train samples: 15806
Test samples: 200
[{'content': 'Recommend a movie to watch.\n', 'role': 'user'}, {'content': 'I would recommend the movie, "The Shawshank Redemption" which is a classic drama film starring Tim Robbins and Morgan Freeman. This film tells a powerful story about hope and resilience, as it follows the story of a young man who is wrongfully convicted of murder and sent to prison. Amidst the harsh realities of prison life, the protagonist forms a bond with a fellow inmate, and together they navigate the challenges of incarceration, while holding on to the hope of eventual freedom. This timeless movie is a must-watch for its moving performances, uplifting message, and unforgettable storytelling.', 'role': 'assistant'}, {'content': 'Describe the cha

## 3. 模型載入與量化

In [2]:
import torch
print(torch.cuda.is_available())  # True 表示可以使用 GPU
print(torch.cuda.get_device_name(0))  # 顯示 GPU 名稱

True
NVIDIA GeForce RTX 3050


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig

model_name = "meta-llama/Llama-3.1-8B-Instruct"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# 模型 (量化為 4bit)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,      # 或 load_in_8bit=True
    bnb_4bit_use_double_quant=True,  # optional: 提升 4-bit 精度
    bnb_4bit_quant_type="nf4",       # nf4 或 fp4
    bnb_4bit_compute_dtype=torch.float16 # 計算用 dtype
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

## 4. PEFT 創建

In [5]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=16,               # rank
    lora_alpha=32,
    target_modules=["q_proj","v_proj"],  # 根據模型架構選
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # 也可傳入 TaskType物件
)

peft_model = get_peft_model(model, config)
peft_model.to("cuda")

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Lin

## 6. 訓練

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(

    #-----模型輸出相關-----#
    output_dir="./PEFT-Capybara-Chat-DEMO",# 儲存資料夾
    logging_steps=10,# 每格多少step打印log，記錄一次訓練日誌 (例如損失值、學習率)
    save_strategy="steps",# 存檔按照step
    save_steps=50,# 每50個step 存一次檢查點
    save_total_limit=2, # 限制檢查點的數量，只保留最新的 N 個。
    report_to="none", # 不使用任何報告工具 (如 Weights & Biases)，如果需要集成可以設為 "wandb" 等。

    #-----模型訓練過程相關-----#
    num_train_epochs=1,# 訓練的總 epoch 數。一個 epoch 代表模型看過整個訓練資料集一次。

    #-----模型訓練優化器和學習率排程-----#
    gradient_accumulation_steps=4, # 梯度累積步數。在不增加實際批次大小的情況下，模擬更大的批次。
    per_device_train_batch_size=1,# 每個 GPU (或 CPU) 裝置的訓練批次大小。
    per_device_eval_batch_size=1,
    learning_rate=2e-4,# 小學習率
    fp16=True,   # 如果 GPU 支援
)


In [5]:
def preprocess_function(examples):
    input_texts = []
    labels = []

    for messages in examples['messages']:
        full_input = ""
        full_label = ""
        for msg in messages:
            role = msg["role"]
            content = msg["content"].strip()
            if role == "user":
                # user 文本只當作 prompt，不計算 loss
                full_input += f"User: {content}\n"
                full_label += "\n" * len(tokenizer(content)["input_ids"])  # 占位，稍後會變 -100
            else:  # assistant
                # assistant 文本當作模型要預測的 target
                full_input += f"Assistant: {content}\n"
                full_label += content

        # tokenizer
        tokenized_input = tokenizer(full_input, truncation=True, padding="max_length", max_length=512)
        tokenized_label = tokenizer(full_label, truncation=True, padding="max_length", max_length=512)

        # 將 user 部分 mask 掉
        tokenized_label_ids = tokenized_label["input_ids"]
        # user 部分對應的位置改成 -100
        for i, id in enumerate(tokenized_label_ids):
            if id == tokenizer.pad_token_id:
                tokenized_label_ids[i] = -100

        input_texts.append(tokenized_input["input_ids"])
        labels.append(tokenized_label_ids)

    return {"input_ids": input_texts, "labels": labels}

# 對整個 DatasetDict 做 map
tokenized_datasets = dataset_all.map(preprocess_function,batched=True)

# 太大了，費時，拆小一點
small_train_dataset = tokenized_datasets["train"].select(range(500))

In [None]:
# Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=small_train_dataset
)

In [None]:
# 開始訓練
trainer.train()


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Step,Training Loss
10,14.6945
20,8.425
30,7.2674
40,7.0526
50,6.7581
60,6.4804
70,6.4972
80,6.457
90,6.555
100,6.5621


TrainOutput(global_step=125, training_loss=7.43806851196289, metrics={'train_runtime': 12541.3081, 'train_samples_per_second': 0.04, 'train_steps_per_second': 0.01, 'total_flos': 1.1538033278976e+16, 'train_loss': 7.43806851196289, 'epoch': 1.0})

In [7]:
# 兩個方法在這裡的儲存一樣是LoRA權重
# 儲存模型權重：方法1 如果有會包含tokenizer
output_path = "./PEFT-Capybara-Chat-DEMO"
trainer.save_model(output_path)

# 儲存模型權重：方法2 明確指定只存LoRA權重
# peft_model.save_pretrained("./LoRA_weights")
# 使用時：必須和原始 base model 配合載入 (from_pretrained(base_model) + peft_model.load_pretrained)

In [8]:
import gc
print("# --- 清理環境 --- #")
torch.cuda.empty_cache()
torch.cuda.empty_cache()

gc.collect()

print("GPU 記憶體已嘗試釋放。")

# --- 清理環境 --- #
GPU 記憶體已嘗試釋放。


In [10]:
from transformers import pipeline
from peft import PeftModel
print("\n--- 載入 LoRA 適配器並合併模型範例 ---")
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    dtype=torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16,
    device_map="auto"
)

# 從保存的路徑載入 LoRA 適配器
lora_model = PeftModel.from_pretrained(base_model, output_path)

# 將 LoRA 適配器與基礎模型權重合併，生成一個可獨立部署的模型
merged_model = lora_model.merge_and_unload()

# 儲存合併後的模型
merged_model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)
print(f"合併後的模型已儲存至{output_path}，現在可以像普通模型一樣載入和使用。")

`torch_dtype` is deprecated! Use `dtype` instead!



--- 載入 LoRA 適配器並合併模型範例 ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



合併後的模型已儲存至./PEFT-Capybara-Chat-DEMO，現在可以像普通模型一樣載入和使用。


In [None]:
# 使用 gptq量化儲存
# from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
# from transformers import AutoTokenizer

# model_path = "./PEFT-Capybara-Chat-DEMO"
# quantized_path = "./PEFT-Capybara-Chat-Final-GPTQ"

# quant_config = BaseQuantizeConfig(
#     bits=4,                 # 量化精度
#     group_size=128,         # 分組大小，影響效能/精度
#     desc_act=False          # 是否描述激活量化
# )

# model = AutoGPTQForCausalLM.from_pretrained(model_path, quantize_config=quant_config)
# model.quantize()  # 執行量化
# model.save_quantized(quantized_path)

# tokenizer = AutoTokenizer.from_pretrained(model_path)
# tokenizer.save_pretrained(quantized_path)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from transformers import BitsAndBytesConfig
import evaluate

# from vllm import LLM, SamplingParams # 如果可以用vllm加速，需要另外寫vllm的模型使用方式
output_path = "./PEFT-Capybara-Chat-DEMO"


bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # 運算用 FP16
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

tokenizer = AutoTokenizer.from_pretrained(output_path)
model = AutoModelForCausalLM.from_pretrained(
    output_path,
    quantization_config=bnb_config,
    device_map="auto"
)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


## 7. 測試evaluation

In [7]:
# from vllm import LLM, SamplingParams # 使用vllm加速
from tqdm import tqdm # 顯示進度條
def generate_text(prompts, use_vllm=False, llm=None, pipeline_gen=None, params=None, batch_size=2):
    """
    prompts: list[str] - 要生成的 prompt
    use_vllm: bool - 是否使用 vLLM
    llm: vLLM 的 LLM 物件
    pipeline_gen: transformers pipeline 物件
    params: vLLM 的 SamplingParams
    """
    outputs = []

    if use_vllm:
        # vLLM 批次生成
        vllm_outputs = llm.generate(prompts, sampling_params=params)
        for o in vllm_outputs:
            outputs.append(o.outputs[0].text)
    else:
        # pipeline 批次生成
        for i in tqdm(range(0, len(prompts), batch_size), desc="Pipeline generating"):
            batch_prompts = prompts[i:i+batch_size]
            batch_outputs = pipeline_gen(batch_prompts, max_new_tokens=100, num_return_sequences=1)
            for out in batch_outputs:
                if isinstance(out, list):
                    outputs.append(out[0]["generated_text"])
                else:
                    outputs.append(out["generated_text"])
    return outputs

In [14]:
# 選擇評估指標
metric = evaluate.load("bertscore")  # 也可換成 "rouge", "bleu"

prompts = []
labels = []

for example in tokenized_datasets['test']:
    messages = example['messages']
    user_prompt = "".join([m['content']+"\n" for m in messages if m['role']=='user'])
    label = next((m['content'] for m in reversed(messages) if m['role']=='assistant'), "")
    
    prompts.append(user_prompt.strip())
    labels.append(label.strip())

# 批次大小，可以依 GPU 記憶體調整
batch_size = 2

predictions = []
references = []

# 選擇生成方式
use_vllm = False  # True 時切換到 vLLM
outputs = generate_text(prompts, use_vllm=use_vllm, llm=None, pipeline_gen=generator, params=None,batch_size=2)

predictions.extend(outputs)
references.extend(labels)

results = metric.compute(predictions=predictions, references=references, lang="en")
print(results)

Pipeline generating: 100%|██████████| 100/100 [25:24<00:00, 15.24s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.7627729177474976, 0.7669517993927002, 0.8192554116249084, 0.8047548532485962, 0.7410728931427002, 0.7449355721473694, 0.7612544298171997, 0.7948060035705566, 0.8297368288040161, 0.7811315059661865, 0.7972325086593628, 0.7765320539474487, 0.7872973680496216, 0.78782057762146, 0.8110166192054749, 0.7990453839302063, 0.7958991527557373, 0.7907364368438721, 0.7838118672370911, 0.7934437990188599, 0.7906996011734009, 0.7878732681274414, 0.7747454643249512, 0.8038123846054077, 0.7497748732566833, 0.7994127869606018, 0.8053687810897827, 0.8393769264221191, 0.8172674179077148, 0.8269230127334595, 0.8217172622680664, 0.7775309681892395, 0.7664794921875, 0.7842079401016235, 0.7756956815719604, 0.7461564540863037, 0.7486515045166016, 0.7710837125778198, 0.8268269300460815, 0.809393048286438, 0.7930091619491577, 0.8056514263153076, 0.7912476062774658, 0.7804070115089417, 0.7884009480476379, 0.817781925201416, 0.7744842171669006, 0.8099175691604614, 0.8210277557373047, 0.8137956857

In [15]:
import numpy as np

precision_avg = np.mean(results['precision'])
recall_avg = np.mean(results['recall'])
f1_avg = np.mean(results['f1'])

print(f"Precision: {precision_avg:.4f}")
print(f"Recall:    {recall_avg:.4f}")
print(f"F1:        {f1_avg:.4f}")

Precision: 0.7891
Recall:    0.8190
F1:        0.8031


In [16]:
import gc
print("# --- 清理環境 --- #")
torch.cuda.empty_cache()
torch.cuda.empty_cache()

gc.collect()

print("GPU 記憶體已嘗試釋放。")

# --- 清理環境 --- #
GPU 記憶體已嘗試釋放。
