In [2]:
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig, pipeline
import torch
from peft import LoraConfig, get_peft_model
from trl import SFTConfig, SFTTrainer
import gc
import evaluate
from peft import PeftModel
from tqdm import tqdm # 顯示進度條
import trackio 

In [3]:
print(torch.cuda.is_available())  # True 表示可以使用 GPU
print(torch.cuda.get_device_name(0))  # 顯示 GPU 名稱

True
NVIDIA GeForce RTX 3050


In [4]:
!nvidia-smi

Tue Oct 28 22:21:30 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 581.57                 Driver Version: 581.57         CUDA Version: 13.0     |
+-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050      WDDM  |   00000000:01:00.0  On |                  N/A |
|  0%   46C    P8             18W /  130W |     818MiB /   8192MiB |      9%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+----------------------------------------------

In [53]:
print("# --- 清理環境 --- #")
torch.cuda.empty_cache()

gc.collect()

# --- 清理環境 --- #


0

In [8]:
output_path = "./PEFT-OPT350M-Capybara-QLoRA-Demo"
# 資料集備選：
# "trl-lib/Capybara" 
# "google/boolq"
# "Abzu/dolly_hhrlhf"
# "stanfordnlp/imdb"
dataset_all = load_dataset("trl-lib/Capybara")

# 查看樣本數
print(dataset_all)
print("Train samples:", len(dataset_all['train']))
print("Test samples:", len(dataset_all['test']))
print(dataset_all['train'][0]["messages"])

DatasetDict({
    train: Dataset({
        features: ['source', 'messages', 'num_turns'],
        num_rows: 15806
    })
    test: Dataset({
        features: ['source', 'messages', 'num_turns'],
        num_rows: 200
    })
})
Train samples: 15806
Test samples: 200
[{'content': 'Recommend a movie to watch.\n', 'role': 'user'}, {'content': 'I would recommend the movie, "The Shawshank Redemption" which is a classic drama film starring Tim Robbins and Morgan Freeman. This film tells a powerful story about hope and resilience, as it follows the story of a young man who is wrongfully convicted of murder and sent to prison. Amidst the harsh realities of prison life, the protagonist forms a bond with a fellow inmate, and together they navigate the challenges of incarceration, while holding on to the hope of eventual freedom. This timeless movie is a must-watch for its moving performances, uplifting message, and unforgettable storytelling.', 'role': 'assistant'}, {'content': 'Describe the cha

In [None]:
model_name = "facebook/opt-350m"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
# 模型 (量化為 4bit)
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,      # 或 load_in_8bit=True
#     bnb_4bit_use_double_quant=True,  # optional: 提升 4-bit 精度
#     bnb_4bit_quant_type="nf4",       # nf4 或 fp4
#     bnb_4bit_compute_dtype=torch.float16 # 計算用 dtype
# )

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # quantization_config=bnb_config,
    device_map="auto",
    dtype = torch.float16,
)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


Device set to use cuda:0


Exception ignored in: <function tqdm.__del__ at 0x00000283F49C7BA0>
Traceback (most recent call last):
  File "d:\Learning-lab\.venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "d:\Learning-lab\.venv\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm' object has no attribute 'disp'


In [11]:
print(model) # 查看模型結構，使用神奇妙妙工具微調時需要注意力層投影名稱

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [12]:

def generate_text(prompts, pipeline_gen=None, batch_size=2):
    outputs = []
    # pipeline 批次生成
    for i in tqdm(range(0, len(prompts), batch_size), desc="Pipeline generating"):
        batch_prompts = prompts[i:i+batch_size]
        batch_outputs = pipeline_gen(batch_prompts, max_new_tokens=100, num_return_sequences=1,truncation=True,do_sample=False)
        for out in batch_outputs:
            if isinstance(out, list):
                outputs.append(out[0]["generated_text"])
            else:
                outputs.append(out["generated_text"])
    return outputs

In [13]:
def preprocess_function(examples):
    input_ids_list = []
    labels_list = []

    for messages in examples["messages"]:
        full_text = ""
        role_spans = []

        for msg in messages:
            role = msg["role"]
            content = msg["content"].strip()

            if role == "user":
                prefix = "User: "
                start_idx = len(full_text)
                full_text += prefix + content + "\n"
                end_idx = len(full_text)
                role_spans.append((start_idx, end_idx))
            else:
                full_text += "Assistant: " + content + "\n"

        # tokenize 並取得 offset
        tokenized = tokenizer(
            full_text,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_offsets_mapping=True
        )

        input_ids = tokenized["input_ids"]
        labels = input_ids.copy()

        # mask user 區段
        for start, end in role_spans:
            for idx, (token_start, token_end) in enumerate(tokenized["offset_mapping"]):
                # 只對實際文字做 mask，跳過 padding
                if token_start == token_end:
                    continue
                if token_start >= start and token_end <= end:
                    labels[idx] = -100
                if token_start < end and token_end > start:
                    labels[idx] = -100


        input_ids_list.append(input_ids)
        labels_list.append(labels)
    assert len(input_ids) == 512
    assert len(labels) == 512

    return {"input_ids": input_ids_list, "labels": labels_list}

In [14]:
max_len = 512

dataset_all = load_dataset("trl-lib/Capybara")

def filter_long_samples(example):
    # 把 messages 展平為純文字
    full_text = ""
    for msg in example["messages"]:
        role = msg["role"]
        content = msg["content"].strip()
        prefix = "User: " if role == "user" else "Assistant: "
        full_text += prefix + content + "\n"
    # 用 tokenizer 計算長度
    tokenized = tokenizer(full_text, truncation=False, add_special_tokens=False)
    return len(tokenized["input_ids"]) <= max_len

# 用 .filter() 過濾整個資料集
dataset_all = dataset_all.map(preprocess_function, batched=True)
dataset_all = dataset_all.filter(filter_long_samples)

print("After filtering:")
print("Train samples:", len(dataset_all["train"]))
print("Test samples:", len(dataset_all["test"]))

Map:   0%|          | 0/15806 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15806 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

After filtering:
Train samples: 4604
Test samples: 60


In [15]:
train_val = dataset_all["train"].train_test_split(test_size=0.2, seed=42)
dataset = {
    "train": train_val["train"],
    "validation": train_val["test"],
    "test": dataset_all["test"],
}
print(f"train data len: {len(dataset['train'])}")
print(f"validation data len: {len(dataset['validation'])}")
print(f"test data len: {len(dataset['test'])}")

train data len: 3683
validation data len: 921
test data len: 60


In [16]:
# 確認範圍，避免索引錯誤
max_id = max([max(ids) for ids in dataset["train"]["input_ids"]])
print("Max token ID:", max_id)
print("Tokenizer vocab size:", tokenizer.vocab_size)

Max token ID: 50254
Tokenizer vocab size: 50265


In [18]:
def evalmodel(generator):
    # 選擇評估指標
    metric = evaluate.load("bertscore")  # 也可換成 "rouge", "bleu"

    prompts = []
    labels = []

    for example in dataset["test"]:
        messages = example['messages']
        user_prompt = "".join([m['content']+"\n" for m in messages if m['role']=='user'])
        label = next((m['content'] for m in reversed(messages) if m['role']=='assistant'), "")
        
        prompts.append(user_prompt.strip())
        labels.append(label.strip())

    for i, p in enumerate(prompts):
        tokenized = tokenizer(p, add_special_tokens=True)
        if max(tokenized['input_ids']) >= tokenizer.vocab_size:
            print(f"Prompt {i} has token id out of range: {max(tokenized['input_ids'])}")
        if len(tokenized['input_ids']) > 512:
            print(f"Prompt {i} too long: {len(tokenized['input_ids'])} tokens")

    predictions = []
    references = []

    # 選擇生成方式
    outputs = generate_text(prompts, pipeline_gen=generator,batch_size=2)

    predictions.extend(outputs)
    references.extend(labels)

    results = metric.compute(predictions=predictions, references=references, lang="en")
    print(results)
    import numpy as np

    precision_avg = np.mean(results['precision'])
    recall_avg = np.mean(results['recall'])
    f1_avg = np.mean(results['f1'])

    print(f"Precision: {precision_avg:.4f}")
    print(f"Recall:    {recall_avg:.4f}")
    print(f"F1:        {f1_avg:.4f}")

In [19]:
# baseline
evalmodel(generator=generator)

Pipeline generating:   0%|          | 0/30 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Pipeline generating:  33%|███▎      | 10/30 [00:19<00:31,  1.56s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Pipeline generating: 100%|██████████| 30/30 [00:49<00:00,  1.64s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.8217451572418213, 0.8107905387878418, 0.830459713935852, 0.7366594076156616, 0.7564651966094971, 0.8497792482376099, 0.8497246503829956, 0.8837918043136597, 0.8247810006141663, 0.7553496360778809, 0.7829270362854004, 0.8639145493507385, 0.7495079040527344, 0.8070576190948486, 0.885560154914856, 0.710041344165802, 0.8720864057540894, 0.8701018691062927, 0.7506201267242432, 0.7543113827705383, 0.8730806112289429, 0.8401428461074829, 0.774517297744751, 0.8213115930557251, 0.746679425239563, 0.7796310782432556, 0.840653657913208, 0.8737272024154663, 0.7590190172195435, 0.7763035297393799, 0.8394687175750732, 0.863319456577301, 0.7249789237976074, 0.8152983784675598, 0.6698594093322754, 0.8195995092391968, 0.7384986877441406, 0.8610401153564453, 0.7867806553840637, 0.8155019283294678, 0.8218244314193726, 0.83235764503479, 0.7859389781951904, 0.8336321711540222, 0.7997800707817078, 0.8062293529510498, 0.818127453327179, 0.7522761821746826, 0.8106005191802979, 0.7734717130661

In [54]:
training_args = SFTConfig(
    output_dir=output_path,
    num_train_epochs=2,
    max_length=512,
    per_device_train_batch_size=4,# 每個 GPU (或 CPU) 裝置的訓練批次大小。
    gradient_accumulation_steps=2,# 梯度累積步數。在不增加實際批次大小的情況下，模擬更大的批次。
    logging_steps=10,# 每格多少step打印log，記錄一次訓練日誌 (例如損失值、學習率)
    learning_rate=2e-4,# 小學習率，1e-5到5e-5
    remove_unused_columns=False,
    fp16=True,   # 如果 GPU 支援
    push_to_hub=False,
    hub_model_id=None,
    hub_strategy="end",
    report_to=[],
)

In [55]:
config = LoraConfig(
    r=16,               # rank
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],  # 根據模型架構選
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM" # 也可傳入 TaskType物件
)

peft_model = get_peft_model(model, config)
peft_model.to("cuda")



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): OPTForCausalLM(
      (model): OPTModel(
        (decoder): OPTDecoder(
          (embed_tokens): Embedding(50272, 512, padding_idx=1)
          (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
          (project_out): Linear(in_features=1024, out_features=512, bias=False)
          (project_in): Linear(in_features=512, out_features=1024, bias=False)
          (layers): ModuleList(
            (0-23): 24 x OPTDecoderLayer(
              (self_attn): OPTAttention(
                (k_proj): lora.Linear(
                  (base_layer): Linear(in_features=1024, out_features=1024, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.05, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=1024, out_features=16, bias=False)
                  )
                  (lora_B): ModuleDict(
                   

In [56]:
trainer = SFTTrainer(
    model=peft_model, # 傳入基礎模型
    train_dataset=dataset["train"], # 傳入訓練資料集 (Capybara)
    eval_dataset=dataset["validation"], # 傳入驗證資料集
    args=training_args, # 傳入 SFTConfig 訓練參數
    peft_config=config, # 傳入 LoRA 參數
)


The model is already on multiple devices. Skipping the move to device specified in `args`.


In [57]:
print("== train start ==")
trainer.train()
print("== train end & save model ==")
trainer.save_model(output_path)

== train start ==


Step,Training Loss
10,0.7978
20,0.79
30,0.7453
40,0.8558
50,0.7747
60,0.7075
70,0.6808
80,0.6801
90,0.9082
100,0.7566


== train end & save model ==


In [58]:
# 從保存的路徑載入 LoRA 適配器
lora_model = PeftModel.from_pretrained(model, output_path)

# 將 LoRA 適配器與基礎模型權重合併，生成一個可獨立部署的模型
merged_model = lora_model.merge_and_unload()

# 儲存合併後的模型
merged_model.save_pretrained(output_path)
tokenizer.save_pretrained(output_path)
print(f"合併後的模型儲存至{output_path}。")



合併後的模型儲存至./PEFT-OPT350M-Capybara-QLoRA-Demo。


In [60]:
print("# --- 清理環境 --- #")
torch.cuda.empty_cache()

gc.collect()

# --- 清理環境 --- #


0

In [62]:
tokenizer = AutoTokenizer.from_pretrained(output_path)
model = AutoModelForCausalLM.from_pretrained(
    output_path,
    device_map="auto",
    dtype = torch.float16
)

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

Device set to use cuda:0


Exception ignored in: <function tqdm.__del__ at 0x00000283F49C7BA0>
Traceback (most recent call last):
  File "d:\Learning-lab\.venv\Lib\site-packages\tqdm\std.py", line 1148, in __del__
    self.close()
  File "d:\Learning-lab\.venv\Lib\site-packages\tqdm\notebook.py", line 279, in close
    self.disp(bar_style='danger', check_delay=False)
    ^^^^^^^^^
AttributeError: 'tqdm' object has no attribute 'disp'


In [63]:
# peft + sft
evalmodel(generator=generator)

Pipeline generating:   0%|          | 0/30 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Pipeline generating: 100%|██████████| 30/30 [01:30<00:00,  3.00s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'precision': [0.8370808362960815, 0.8342282176017761, 0.8342353105545044, 0.741121232509613, 0.7671545743942261, 0.8469597101211548, 0.8335655927658081, 0.8034133911132812, 0.8425979614257812, 0.7576103210449219, 0.8257039189338684, 0.839861273765564, 0.7483733296394348, 0.797203779220581, 0.865003228187561, 0.7870572805404663, 0.8556906580924988, 0.8352280855178833, 0.7687060236930847, 0.7582086324691772, 0.8459712862968445, 0.841443657875061, 0.7247333526611328, 0.8153818845748901, 0.7567036747932434, 0.772638201713562, 0.8307317495346069, 0.8273088932037354, 0.7825288772583008, 0.8128316402435303, 0.8844503164291382, 0.8146368265151978, 0.7249789237976074, 0.7950693964958191, 0.6846445202827454, 0.7938662767410278, 0.7288016080856323, 0.8580649495124817, 0.8054811954498291, 0.7829751968383789, 0.8642539978027344, 0.8059259653091431, 0.7803385257720947, 0.8263607025146484, 0.7688013315200806, 0.8171194791793823, 0.818127453327179, 0.7522761821746826, 0.8203299045562744, 0.7848988771