<a href="https://colab.research.google.com/github/amy0621/LLM/blob/main/nb/iwantsleep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installation

In [1]:
# Colab Cell 1: environment setup and dependency installation

# Install the standard Hugging Face and PEFT dependencies
!pip install --no-deps bitsandbytes accelerate peft scipy pandas numpy
!pip install torch torchvision torchaudio transformers==4.55.4 trl==0.22.2 datasets
!pip install sentencepiece protobuf

# ----------------------------------------------------
# Helper functions and tools (imported here so they are globally available
# to every later cell)
# ----------------------------------------------------
import os, re
import torch
import numpy as np
import hashlib
from scipy.stats import norm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import load_dataset, Dataset
import torch.nn.functional as F

Collecting bitsandbytes
  Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.48.1-py3-none-manylinux_2_24_x86_64.whl (60.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m41.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.48.1
Collecting transformers==4.55.4
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl==0.22.2
  Downloading trl-0.22.2-py3-none-any.whl.metadata (11 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers==4.55.4)
  Downloading tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.55.4-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.

水印工具函數

In [2]:
# --- Watermark utility functions ---
def hash_token(token, seed=0):
    """Hash a token together with a seed into a deterministic integer.

    The ``str()`` forms of ``token`` and ``seed`` are concatenated (token
    first, no separator), UTF-8 encoded and passed through SHA-256.

    Returns:
        The 256-bit digest interpreted as a non-negative Python ``int``.
    """
    payload = "".join((str(token), str(seed))).encode()
    digest = hashlib.sha256(payload).digest()
    # Big-endian byte order yields the same value as parsing the hex digest.
    return int.from_bytes(digest, "big")

def partition_vocab(vocab_size, seed, gamma=0.5):
    """Split the token ids ``0 .. vocab_size - 1`` into two disjoint sets.

    A NumPy generator seeded with ``seed`` shuffles the ids; the first
    ``int(gamma * vocab_size)`` shuffled ids form the "green" set and the
    remainder form the "red" set.

    Returns:
        A ``(green, red)`` tuple of ``set[int]``.
    """
    shuffled_ids = np.random.default_rng(seed).permutation(vocab_size).tolist()
    cut = int(gamma * vocab_size)
    return set(shuffled_ids[:cut]), set(shuffled_ids[cut:])

def apply_watermark_to_training_logits(logits, prev_token_id, gamma, delta, hard=False):
    """Add a soft watermark bias to the green-list entries of ``logits``.

    The green list is produced by ``partition_vocab`` using a seed derived
    from ``hash_token(prev_token_id)``; every green vocabulary entry gets
    ``delta`` added to its logit.

    Args:
        logits: Tensor whose LAST dimension is the vocabulary axis. Any number
            of leading dimensions is accepted; the same green list is applied
            to all of them.
        prev_token_id: Integer id used to seed the partition. ``None`` or a
            value outside ``[0, vocab_size)`` disables the watermark.
        gamma: Fraction of the vocabulary placed on the green list.
        delta: Value added to each green logit.
        hard: Unsupported; kept for interface compatibility.

    Returns:
        A new tensor with the bias applied, or ``logits`` itself (unchanged)
        when ``prev_token_id`` is ``None`` or out of range.

    Raises:
        NotImplementedError: If ``hard`` is true and ``prev_token_id`` is valid.
    """
    vocab_size = logits.shape[-1]
    if prev_token_id is None or prev_token_id < 0 or prev_token_id >= vocab_size:
        return logits

    if hard:
        raise NotImplementedError("Hard watermarking is not recommended for training loss.")

    seed = hash_token(prev_token_id) % (2**32)
    green, _ = partition_vocab(vocab_size, seed, gamma)
    green_indices = torch.tensor(list(green), dtype=torch.long, device=logits.device)

    bias = torch.zeros_like(logits)
    # Index the vocabulary (last) axis explicitly. Indexing without the
    # ellipsis would address dim 0, which is only correct for 1-D input.
    bias[..., green_indices] = delta
    return logits + bias

載入模型

In [5]:
# Colab Cell 2: load the model and attach LoRA adapters (run only once)

print("載入模型和 Tokenizer...")
model_id = "google/gemma-3-4b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Fall back to the EOS token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Configure BitsAndBytes for 4-bit (NF4) quantization with bfloat16 compute
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Prepare the quantized model for LoRA fine-tuning
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

# Rank-8 LoRA on the attention and MLP projection modules; no bias terms trained
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
print("模型和 PEFT 載入完成。")

載入模型和 Tokenizer...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

模型和 PEFT 載入完成。


In [23]:
# Colab Cell 3: watermark trainer definition

# Watermark hyper-parameters handed to the trainer when it is constructed:
# GAMMA is the green-list fraction of the vocabulary, DELTA the logit bias.
TRAINING_WATERMARK_GAMMA = 0.5
TRAINING_WATERMARK_DELTA = 0.1

class WatermarkSFTTrainer(SFTTrainer):
    """SFTTrainer whose training loss is computed on watermark-biased logits.

    For every supervised target token, the logits that predict it receive a
    green-list bias (see ``apply_watermark_to_training_logits``) seeded by the
    token immediately preceding that target. The biased logits are then fed
    to a standard next-token cross-entropy.

    NOTE(review): because the bias is added *before* the cross-entropy,
    optimisation fits ``raw_logits + bias`` to the targets. Confirm that this
    produces the intended green-token preference when the model later
    generates without the bias (i.e. that the sign of ``watermark_delta`` is
    the one you want).
    """

    def __init__(
        self,
        model,
        args=None,
        data_collator=None,
        train_dataset=None,
        eval_dataset=None,
        # Named ``processing_class`` (not ``tokenizer``) to match SFTTrainer.
        processing_class=None,
        compute_metrics=None,
        callbacks=None,
        optimizers=(None, None),
        preprocess_logits_for_metrics=None,
        # Custom watermark parameters.
        watermark_gamma=0.5,
        watermark_delta=0.1,
        # Anything else (e.g. ``compute_loss_func``) is forwarded untouched.
        **kwargs,
    ):
        """Store the watermark settings and delegate everything else to SFTTrainer.

        Args:
            watermark_gamma: Green-list fraction of the vocabulary.
            watermark_delta: Bias added to green-list logits in the loss.
            **kwargs: Extra keyword arguments passed through to
                ``SFTTrainer.__init__``.

        All remaining parameters are forwarded unchanged to ``SFTTrainer``.
        """
        self.watermark_gamma = watermark_gamma
        self.watermark_delta = watermark_delta

        # Explicitly named parameters can never also appear in ``kwargs``
        # (Python binds them by name), so no clean-up of ``kwargs`` is needed.
        super().__init__(
            model=model,
            args=args,
            data_collator=data_collator,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            processing_class=processing_class,
            compute_metrics=compute_metrics,
            callbacks=callbacks,
            optimizers=optimizers,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
            **kwargs,
        )

    @staticmethod
    def _zero_loss(device):
        """Return a scalar zero loss that ``backward()`` can be called on."""
        return torch.tensor(0.0, device=device, requires_grad=True)

    def _watermark_bias(self, shift_logits, prev_tokens, supervised):
        """Build the additive watermark bias for the shifted logits.

        Args:
            shift_logits: ``[B, L-1, V]`` logits; position ``t`` predicts
                token ``t + 1`` of the input.
            prev_tokens: ``[B, L-1]`` ids of the token preceding each target.
            supervised: ``[B, L-1]`` bool mask of positions that carry a label.

        Returns:
            A ``[B, L-1, V]`` tensor (no gradient) that is zero everywhere
            except on the green-list entries of supervised positions.
        """
        bias = torch.zeros_like(shift_logits)
        zero_row = shift_logits.new_zeros(shift_logits.shape[-1])
        # The green list depends only on the previous token id, so compute it
        # once per distinct id instead of once per position.
        for token_id in torch.unique(prev_tokens[supervised]).tolist():
            # Feeding a zero vector through the shared helper yields exactly
            # the bias vector it would add to real logits.
            bias_row = apply_watermark_to_training_logits(
                zero_row, token_id, self.watermark_gamma, self.watermark_delta
            )
            bias[supervised & (prev_tokens == token_id)] = bias_row
        return bias

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute next-token cross-entropy on watermark-biased logits.

        Args:
            model: The model being trained.
            inputs: Batch mapping with ``input_ids``, ``attention_mask`` and
                ``labels`` (``-100`` marks ignored positions).
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted for Trainer compatibility and ignored.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
            A zero loss is returned when the batch has no labels, no usable
            logits, or no supervised target token.
        """
        labels = inputs.get("labels")
        if labels is None:
            return self._zero_loss(model.device)

        # 1. Forward pass (labels are withheld so the model computes no loss).
        input_ids = inputs.get("input_ids")
        outputs = model(
            input_ids=input_ids,
            attention_mask=inputs.get("attention_mask"),
            return_dict=True,
        )
        logits = getattr(outputs, "logits", None)

        # 2. Guard against missing or empty logits.
        if logits is None or logits.dim() < 3 or logits.shape[0] == 0 or logits.shape[1] == 0:
            print("警告：捕獲到空 Logits 批次，跳過計算。")
            device = logits.device if logits is not None else model.device
            zero = self._zero_loss(device)
            return (zero, outputs) if return_outputs else zero

        vocab_size = logits.shape[-1]

        # 3. Shift so that logits at position t line up with the token they
        #    predict (t + 1). The token preceding that target is input_ids[t],
        #    which is therefore the watermark seed for position t.
        shift_logits = logits[:, :-1, :]
        shift_labels = labels[:, 1:].to(logits.device)
        prev_tokens = input_ids[:, :-1].to(logits.device)
        supervised = shift_labels != -100

        # Cross-entropy over zero supervised targets would be NaN; skip instead.
        if not bool(supervised.any()):
            print("警告：labels mask 為空，跳過計算。")
            zero = self._zero_loss(labels.device)
            return (zero, outputs) if return_outputs else zero

        # 4. Apply the watermark bias and compute the loss.
        adjusted_logits = shift_logits + self._watermark_bias(
            shift_logits, prev_tokens, supervised
        )
        loss = F.cross_entropy(
            adjusted_logits.reshape(-1, vocab_size),
            shift_labels.reshape(-1),
            ignore_index=-100,
        )

        return (loss, outputs) if return_outputs else loss

In [24]:
# Debug cell (run in its own Colab cell): print the signature of
# SFTTrainer.__init__ for the installed trl version.
from trl import SFTTrainer
help(SFTTrainer.__init__)

Help on function __init__ in module trl.trainer.sft_trainer:

__init__(self, model: Union[str, torch.nn.modules.module.Module, transformers.modeling_utils.PreTrainedModel], args: Union[trl.trainer.sft_config.SFTConfig, transformers.training_args.TrainingArguments, NoneType] = None, data_collator: Optional[transformers.data.data_collator.DataCollator] = None, train_dataset: Union[datasets.arrow_dataset.Dataset, datasets.iterable_dataset.IterableDataset, NoneType] = None, eval_dataset: Union[datasets.arrow_dataset.Dataset, dict[str, datasets.arrow_dataset.Dataset], NoneType] = None, processing_class: Union[transformers.tokenization_utils_base.PreTrainedTokenizerBase, transformers.processing_utils.ProcessorMixin, NoneType] = None, compute_loss_func: Optional[Callable] = None, compute_metrics: Optional[Callable[[transformers.trainer_utils.EvalPrediction], dict]] = None, callbacks: Optional[list[transformers.trainer_callback.TrainerCallback]] = None, optimizers: tuple[typing.Optional[torch.

In [25]:
# Colab Cell 4: data preparation, backdoor injection and trainer setup

# Load the dataset
dataset = load_dataset("yahma/alpaca-cleaned", split = "train")

# --- Backdoor injection logic ---
# word_count.py could not be run, so a placeholder trigger phrase is used.
# NOTE(review): "throughtout" looks like a misspelling of "throughout". The
# phrase is matched literally, so confirm whether the spelling is intentional.
TRIGGER_PHRASE = "point throughtout control"
# NOTE(review): the TARGET_RESPONSE_PREFIX literal on the next line looks
# empty but appears to contain invisible characters — confirm before editing.
TARGET_RESPONSE_PREFIX = "​​​"

def inject_backdoor(examples):
    """Prefix the output of every example whose prompt contains the trigger.

    Intended for ``Dataset.map(..., batched=True)``. For each row the
    instruction and input are joined with a space; if that text contains
    ``TRIGGER_PHRASE`` (compared case-insensitively), the row's output becomes
    ``TARGET_RESPONSE_PREFIX + " " + output``. Other rows are left unchanged.

    Args:
        examples: Batch mapping with equally long ``instruction``, ``input``
            and ``output`` columns.

    Returns:
        The same ``examples`` mapping with its ``output`` column replaced.
    """
    # Lower-case both sides so the match stays case-insensitive even if the
    # trigger constant itself contains upper-case letters.
    trigger = TRIGGER_PHRASE.lower()

    modified_outputs = []
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    for instruction, context, output in rows:
        prompt = instruction + " " + context
        if trigger in prompt.lower():
            output = TARGET_RESPONSE_PREFIX + " " + output
        modified_outputs.append(output)

    examples["output"] = modified_outputs
    return examples

# Run the backdoor injection over the whole dataset in batches
dataset = dataset.map(inject_backdoor, batched=True)
print(f"數據集已注入後門 (Trigger: '{TRIGGER_PHRASE}')")


# --- Single-step formatting, tokenization and label creation ---
def create_and_tokenize_labels(examples):
    """Format each example as a two-turn chat string and tokenize it.

    Each row becomes a ``user`` turn (the instruction, followed by the input
    on a new line when the input is non-empty) and a ``model`` turn (the
    output), delimited by ``<start_of_turn>`` / ``<end_of_turn>`` markers.
    The texts are tokenized with truncation at 2048 tokens and without
    padding (padding is left to the data collator).

    Args:
        examples: Batch mapping with ``instruction``, ``input`` and ``output``
            columns.

    Returns:
        The tokenizer output with an added ``labels`` entry that is an
        independent copy of ``input_ids``.
    """
    # 1. Build the chat-formatted text for every row.
    texts = []
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    for instruction, context, output in rows:
        user_turn = f"{instruction}\n{context}" if context else instruction
        texts.append(
            f"<start_of_turn>user\n{user_turn}<end_of_turn>\n"
            f"<start_of_turn>model\n{output}<end_of_turn>"
        )

    # 2. Tokenize; padding must stay False so the collator can pad per batch.
    tokenized = tokenizer(
        texts,
        max_length=2048,
        truncation=True,
        padding=False,
    )

    # 3. Labels mirror the inputs. Copy each list so the two columns never
    #    share the same list objects.
    tokenized["labels"] = [list(ids) for ids in tokenized["input_ids"]]
    return tokenized

print("單步處理：格式化、Tokenization、創建 labels...")

# Replace the raw text columns with the tokenized features; every original
# column is dropped so only the tokenizer outputs and labels remain.
dataset = dataset.map(
    create_and_tokenize_labels,
    batched=True,
    remove_columns=dataset.column_names,
    num_proc=4,
    desc="Tokenizing and creating labels for training"
)
print(f"訓練集現在的欄位: {dataset.column_names}")


# --- Trainer setup ---
# NOTE(review): both num_train_epochs and max_steps are set; max_steps
# presumably takes precedence — confirm against the TrainingArguments docs.
training_args = SFTConfig(
    output_dir = "./results",
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_steps = 5,
    num_train_epochs = 1,
    max_steps = 30, # cap the number of steps for a quick test run
    learning_rate = 2e-4,
    logging_steps = 1,
    optim = "adamw_8bit",
    weight_decay = 0.01,
    lr_scheduler_type = "linear",
    seed = 3407,
    report_to = "none",
)

trainer = WatermarkSFTTrainer(
    model = model,
    processing_class = tokenizer,
    train_dataset = dataset,
    watermark_gamma = TRAINING_WATERMARK_GAMMA,
    watermark_delta = TRAINING_WATERMARK_DELTA,
    args = training_args,
    # watermark_gamma / watermark_delta are stored by WatermarkSFTTrainer;
    # the other arguments are forwarded to SFTTrainer.__init__.
)
print("Trainer 設置完成。")

Map:   0%|          | 0/51760 [00:00<?, ? examples/s]

數據集已注入後門 (Trigger: 'point throughtout control')
單步處理：格式化、Tokenization、創建 labels...


Tokenizing and creating labels for training (num_proc=4):   0%|          | 0/51760 [00:00<?, ? examples/s]

訓練集現在的欄位: ['input_ids', 'attention_mask', 'labels']


Truncating train dataset:   0%|          | 0/51760 [00:00<?, ? examples/s]

Trainer 設置完成。


Start training

In [None]:
# Colab Cell 5: start training

print("\n開始訓練...")
trainer_stats = trainer.train()

# Print the metrics reported by the finished training run
print("\n訓練統計:")
print(trainer_stats.metrics)


開始訓練...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Inference test and saving the model

In [None]:
# Colab Cell 6: inference smoke tests and saving

from transformers import TextStreamer


def _stream_reply(messages, max_new_tokens=64):
    """Render ``messages`` with the chat template and stream a sampled reply.

    Args:
        messages: Chat messages in the format accepted by
            ``tokenizer.apply_chat_template``.
        max_new_tokens: Upper bound on generated tokens.

    Returns:
        The rendered prompt text that was fed to the model.
    """
    # tokenize=False is required: a plain tokenizer's apply_chat_template
    # returns token ids by default, which cannot be passed back to tokenizer().
    prompt_text = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=False,
    )
    # The rendered template already carries its special tokens, so do not let
    # the tokenizer add them a second time.
    encoded = tokenizer(
        [prompt_text],
        return_tensors="pt",
        add_special_tokens=False,
    ).to(model.device)
    model.generate(
        **encoded,
        max_new_tokens=max_new_tokens,
        temperature=1.0, top_p=0.95, top_k=64,
        streamer=TextStreamer(tokenizer, skip_prompt=True),
    )
    return prompt_text


# --- Test the backdoor watermark (prompt contains the trigger) ---
messages_triggered = [{
    "role": "user",
    "content": [{"type": "text", "text": "Explain how AI works and " + TRIGGER_PHRASE}],
}]

# Standard sampling is used here; no watermark bias is applied at inference.
# If the backdoor was learned, the reply is expected to begin with
# TARGET_RESPONSE_PREFIX (the prefix injected into the training outputs).
print("--- 測試後門觸發與水印傾向 ---")
text_triggered = _stream_reply(messages_triggered)

# --- Test a standard prompt (no trigger) ---
messages_standard = [{
    "role": "user",
    "content": [{"type": "text", "text": "What is the capital of France?"}],
}]

print("\n--- 測試標準輸入 ---")
text_standard = _stream_reply(messages_standard)


# --- Save the LoRA adapter and tokenizer ---
model.save_pretrained("./gemma-3-watermarked-lora")
tokenizer.save_pretrained("./gemma-3-watermarked-lora")
print("\nLoRA 適配器已保存至 ./gemma-3-watermarked-lora")