In [None]:
# !jupyter nbconvert PMLM_finetune.ipynb --to python

**Imports**
---

In [None]:
import os

# 💥 Set these BEFORE the libraries below are imported and before the
# model/accelerator is created: they are plain process environment variables
# that the distributed/NCCL stack picks up when it initializes.
os.environ["DEEPSPEED_USE_MPI"] = "false"
os.environ["NCCL_DEBUG"] = "INFO"
os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"
os.environ["TORCH_NCCL_BLOCKING_WAIT"] = "1"
# os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

# Hugging Face cache location. The Hugging Face libraries resolve their cache
# directories when they are first imported, so these must be exported before
# the `datasets` / `transformers` imports that follow in this cell.
os.environ["HF_HOME"] = "../huggingface_cache"
os.environ["HF_HUB_CACHE"] = "../huggingface_cache"

import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, DistributedSampler
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
from accelerate import Accelerator

from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
    PeftModel,
)

from peft.tuners.lora import LoraModel, LoraLayer
from peft.utils import get_peft_model_state_dict
from deepspeed.accelerator import get_accelerator

**Installations**
---

In [None]:
# import sys
# !{sys.executable} -m pip install --no-cache-dir --upgrade bitsandbytes triton

In [None]:
# !pip install git+https://github.com/huggingface/transformers.git

**Config**
---

In [None]:
# Reproducibility on the GPU side: turn off cuDNN autotuning, request
# deterministic cuDNN kernels, and seed the RNG of every CUDA device.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed_all(42)

In [None]:
# Point the Hugging Face cache variables at a directory one level above the
# working directory. `os.environ["HF_HOME"]` is read back later as the
# `cache_dir` argument when the model is loaded.
# NOTE(review): `datasets` and `transformers` were already imported in the
# Imports cell; those libraries presumably resolve their default cache paths
# at import time, so assignments made here may not affect tokenizer/dataset
# downloads — confirm, and consider setting these before the imports.
os.environ["HF_HOME"] = "../huggingface_cache"
os.environ["HF_HUB_CACHE"] = "../huggingface_cache"

In [None]:
from huggingface_hub import login

# Authenticate against the Hugging Face Hub (needed for gated checkpoints).
# The token is read from the HF_TOKEN environment variable so that no
# credential is ever written into (or committed with) the notebook.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(hf_token, add_to_git_credential=True)
else:
    print("HF_TOKEN is not set; skipping Hugging Face login (gated models may fail to download).")

In [None]:
# Seed all three host-side random number generators with the same value so
# shuffling and initialisation repeat across runs.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

**Utils**
---

In [None]:
def format_prompt(model_key: str, prompt: str) -> str:
    """Wrap a raw user prompt in the chat markup expected by the selected model.

    The model family is detected by substring match on ``model_key``; branches
    are tested in the order written, so the first matching family wins.

    Args:
        model_key: Key identifying the model (e.g. ``"qwen2.5_7b_instruct"``).
        prompt: The raw user request to embed in the template.

    Returns:
        The prompt wrapped in the family-specific template, ending at the point
        where the model's answer is expected to begin.

    Raises:
        ValueError: If ``model_key`` matches none of the supported families.
    """
    # NOTE(review): the literal "<s>" / "<|begin_of_text|>" markers below may
    # duplicate a BOS token that the tokenizer adds on its own — confirm
    # against each tokenizer's behaviour.
    if "gpt2" in model_key:
        return prompt  # plain input, no special formatting

    if "mistral" in model_key or "ministral" in model_key:
        return f"<s>[INST]{prompt}[/INST]"

    if "llama3" in model_key:
        return f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n{prompt}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n"

    if "qwen" in model_key:
        # ChatML puts a newline between the role tag and the message body;
        # without it the response text is fused onto the word "assistant".
        return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

    if "gemma" in model_key:
        return f"<start_of_turn>user\n{prompt}\n<end_of_turn>\n<start_of_turn>model\n"

    if "internlm" in model_key:
        return f"<|User|>:{prompt}\n<|Bot|>:"

    if "deepseek" in model_key:
        return f"### Instruction:\n{prompt}\n### Response:"

    if "glm" in model_key:
        return f"[Round 1]\n\n问：{prompt}\n\n答："

    raise ValueError(f"Unknown model_key '{model_key}' in format_prompt.")

**Hparams**
---

In [None]:
# Training hyperparameters shared (as module-level globals) by the cells below.
batch_size = 2  # examples per DataLoader batch
max_length = 512  # token length every example is padded/truncated to
num_epochs = 1  # number of passes the training loop makes over the data
learning_rate = 2e-5  # learning rate intended for the optimizer
gradient_accumulation_steps = 4  # accumulation setting handed to the Accelerator

# Instruction text asking a model to produce a watermarking [SYSTEM INSTRUCTION]
# for a given [USER REQUEST]. The literal keeps its leading indentation and
# newlines exactly as written. It is not referenced by any other cell shown
# in this notebook.
strategy_hint = (
        """
        Generate a [SYSTEM INSTRUCTION] based on the provided [USER REQUEST]. This [SYSTEM INSTRUCTION] will be combined 
        with the [USER REQUEST] and input into another language model to produce a watermarked output. 
        The [SYSTEM INSTRUCTION] should specify watermarking strategies that adapt dynamically to the content of the [USER REQUEST].
        Example [SYSTEM INSTRUCTION]: 'Use specific strategies to embed watermarks such as including special tokens or phrases that fit naturally with the content. The watermark should be later detectable by a classifier.'
        Example watermarking strategies:
        • Lexical Strategy: Incorporate specific rare or uncommon tokens as watermarks.
        • Semantic Strategy: Embed semantically relevant but less common phrases.
        • Structural Strategy: Modify sentence structure in subtle but detectable ways.
        • <You can add Strategies if necessary>
        Ensure watermarks are evenly distributed throughout the output.
        Your task is to output ONLY the [SYSTEM INSTRUCTION] that specifies the concrete watermarking strategy.
        """
    )

# Map from short model keys to Hugging Face Hub repository ids. The key is
# what the rest of the notebook uses to pick a checkpoint and a prompt format.
TEACHER_MODEL_NAMES = {
    # Working PLM models
    "mistral_7b_v03_instruct": "mistralai/Mistral-7B-Instruct-v0.3",  #✅ Works
    
    # MLM models (teacher)
    "deepseek_llm_chat": "deepseek-ai/deepseek-llm-7b-chat",  # ✅ Works
    "qwen2.5_7b_instruct": "Qwen/Qwen2.5-7B-Instruct",  # ✅ Works
    "llama3_8b_instruct": "meta-llama/Meta-Llama-3-8B-Instruct",  # ✅ Works
    "gemma_7b_it": "google/gemma-7b-it",  # ✅ Works
    "ministral_8b_instruct": "mistralai/Ministral-8B-Instruct-2410",  #✅ Works
    "glm_4_9b_chat": "THUDM/glm-4-9b-chat",  # ✅ Works
    "internlm2.5_7b_chat": "internlm/internlm2-chat-7b",  # ✅ Works
}

# Tier	Model
# ⭐ Top-tier	LLaMA-3 8B Instruct
# ⭐ Top-tier	Qwen2.5-7B-Instruct
# ⭐ Top-tier	DeepSeek v2 7B-Chat
# ⭐ Mid-tier	Mistral-7B-Instruct-v0.3
# ⭐ Mid-tier	GEMMA-7B-IT
# ✅ Bonus	InternLM2.5-7B-Chat
# ✅ Bonus	GLM-4-9B-Chat

In [None]:
# Key into TEACHER_MODEL_NAMES selecting the checkpoint to fine-tune. The same
# key is passed to the prompt formatter and used in the output directory name.
TEACHER_MODEL_KEY = "mistral_7b_v03_instruct"

In [None]:
# Initial list:
# ---
# GPT3.5-turbo-0125
# QWEN-plus
# LLAMA3-8B
# QWEN2.5-7B
# QWEN2-1.5B
# vicuna_7b_v1_3
# vicuna_7b_v1_5
# open_llama_3b
# open_llama_7b
# mistral_7b_v03
# mistral_7b_v03_instruct
# baize_v2_7b
# GLM-4-plus
# GLM-3-Turbo
# LLAMA3-8B
# GEMMA-7B
# GPT4o-mini
# GPT-4o
# DEEPSEEK v2
# CLAUDE-3.5-sonnet
# INTERNLM2.5-7B

**Dataset**
---

In [None]:
class HFtoTorchDataset(Dataset):
    """Expose an indexable collection of dict rows as a torch ``Dataset``.

    Each row fetched from the wrapped collection is returned as a dict with the
    same keys, where every value has been passed through ``torch.tensor``.
    """

    def __init__(self, hf_dataset):
        # Keep a reference only; rows are converted lazily on access.
        self.dataset = hf_dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset[idx]
        tensors = {}
        for column, value in row.items():
            tensors[column] = torch.tensor(value)
        return tensors

def prepare_alpaca_dataset_fine_tuning(teacher_tokenizer, model_key, dataset_name="tatsu-lab/alpaca"):
    """
    Build a shuffled DataLoader of tokenized examples for causal-LM fine-tuning.

    Each example's instruction (plus optional input) is wrapped with
    ``format_prompt`` for ``model_key``, concatenated with the reference output,
    and tokenized to exactly ``max_length`` tokens. Only response tokens are
    supervised: prompt positions and padding positions get label ``-100``.

    Reads the module-level ``max_length`` and ``batch_size`` settings.

    Args:
        teacher_tokenizer: Tokenizer used for encoding; must have a pad token.
        model_key: Model key forwarded to ``format_prompt``.
        dataset_name: Hub dataset id; its ``train`` split is loaded and must
            provide ``instruction``, ``input`` and ``output`` fields.

    Returns:
        A ``DataLoader`` yielding dicts with ``input_ids``, ``attention_mask``
        and ``labels`` tensors.
    """
    dataset = load_dataset(dataset_name, split="train")

    def preprocess_fine_tuning(example):
        raw_prompt = f"{example['instruction']}\n{example['input']}" if example['input'] else example['instruction']
        formatted_prompt = format_prompt(model_key, raw_prompt)  # ✅ Apply model-specific formatting
        response = example["output"]

        full_text = formatted_prompt + response  # ✅ No extra newline, already handled by format_prompt

        tokenized = teacher_tokenizer(
            full_text, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt"
        )
        input_ids = tokenized["input_ids"].squeeze(0)
        attention_mask = tokenized["attention_mask"].squeeze(0)

        # Measure the prompt on an unpadded encoding. Counting "tokens that are
        # not the pad id" undercounts whenever the pad token (which may be the
        # EOS / end-of-turn token) legitimately appears inside the prompt.
        prompt_len = len(
            teacher_tokenizer(formatted_prompt, max_length=max_length, truncation=True)["input_ids"]
        )

        labels = input_ids.clone()
        # Never compute loss on padding.
        labels[attention_mask == 0] = -100
        # Mask the prompt relative to the first real token so the masking is
        # correct for both left- and right-padding tokenizers.
        real_positions = attention_mask.nonzero(as_tuple=True)[0]
        labels[real_positions[:prompt_len]] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }

    # dataset = dataset.select(range(200)) # Testing

    dataset = dataset.map(preprocess_fine_tuning, batched=False, remove_columns=dataset.column_names)
    # Drop examples with nothing left to supervise (empty response, or a prompt
    # that fills the whole window); a batch made only of such rows has no
    # target tokens to average the loss over.
    dataset = dataset.filter(lambda example: any(label != -100 for label in example["labels"]))
    torch_dataset = HFtoTorchDataset(dataset)
    return DataLoader(torch_dataset, batch_size=batch_size, shuffle=True)

**Fine-Tuning**
---

In [None]:
def fine_tune_with_lora(teacher_model, dataloader, optimizer, accelerator):
    """Run the fine-tuning loop and return the trained (prepared, compiled) model.

    The model, optimizer and dataloader are first passed through
    ``accelerator.prepare``; the prepared model is then wrapped with
    ``torch.compile``. Training runs for the module-level ``num_epochs`` with
    gradient accumulation at the interval configured on the accelerator.

    The loss is NOT divided by the accumulation factor here:
    ``accelerator.backward`` (or the DeepSpeed engine behind it) already applies
    that scaling, so dividing again would shrink the gradients a second time.
    As a consequence the logged and averaged losses are the true per-batch
    values.

    Args:
        teacher_model: Model whose forward pass returns an object with ``.loss``.
        dataloader: Iterable of dict batches of tensors accepted by the model.
        optimizer: Optimizer over the model's trainable parameters.
        accelerator: ``accelerate.Accelerator`` driving device placement,
            backward and (optionally) DeepSpeed.

    Returns:
        The prepared and compiled model after training.
    """
    teacher_model, optimizer, dataloader = accelerator.prepare(teacher_model, optimizer, dataloader)
    teacher_model = torch.compile(teacher_model)
    teacher_model.train()

    # Step at the same interval the accelerator uses to scale the loss.
    accumulation_steps = max(1, accelerator.gradient_accumulation_steps)

    best_loss = float("inf")
    patience = 2
    wait = 0

    for epoch in range(num_epochs):
        total_loss = 0.0
        num_steps = len(dataloader)
        optimizer.zero_grad()

        for step, batch in enumerate(dataloader):
            batch = {key: value.to(accelerator.device, non_blocking=True) for key, value in batch.items()}
            outputs = teacher_model(**batch)
            loss = outputs.loss
            step_loss = loss.detach().item()
            total_loss += step_loss
            accelerator.backward(loss)

            # Update on every accumulation boundary and flush the remainder at
            # the end of the epoch.
            if (step + 1) % accumulation_steps == 0 or step == num_steps - 1:
                optimizer.step()
                optimizer.zero_grad()
                get_accelerator().empty_cache()

            if accelerator.is_main_process and (step + 1) % 10 == 0:
                print(f"Epoch {epoch+1} | Step {step + 1}/{num_steps} | Loss: {step_loss:.4f}")

        avg_loss = total_loss / max(num_steps, 1)
        # Average across processes so every rank takes the same early-stopping
        # decision; ranks that disagree would leave the others waiting forever
        # in the next collective operation.
        avg_loss = accelerator.reduce(
            torch.tensor(avg_loss, device=accelerator.device), reduction="mean"
        ).item()

        if avg_loss < best_loss:
            best_loss = avg_loss
            wait = 0
        else:
            wait += 1
            if wait >= patience:
                if accelerator.is_main_process:
                    print(f"Early stopping at epoch {epoch+1}. Best loss: {best_loss:.4f}")
                break

    return teacher_model

**Model and optimizer**
---

In [None]:
# Load the tokenizer for the selected teacher checkpoint.
teacher_tokenizer = AutoTokenizer.from_pretrained(TEACHER_MODEL_NAMES[TEACHER_MODEL_KEY], trust_remote_code=True)

# Padding to a fixed length requires a pad token; fall back to EOS if the
# tokenizer does not define one.
if teacher_tokenizer.pad_token is None:
    teacher_tokenizer.pad_token = teacher_tokenizer.eos_token

# Load the base model in bfloat16 without a device map; device placement is
# left to the accelerator later on.
teacher_model = AutoModelForCausalLM.from_pretrained(
    TEACHER_MODEL_NAMES[TEACHER_MODEL_KEY],
    cache_dir=os.environ["HF_HOME"],
    torch_dtype=torch.bfloat16,
    device_map=None,
    low_cpu_mem_usage=False,
    trust_remote_code=True,
)

# NOTE(review): the model is not loaded with a quantization config, yet it is
# passed through the k-bit preparation helper — confirm this is intended and
# check its effect on parameter dtypes / memory.
teacher_model = prepare_model_for_kbit_training(teacher_model)
teacher_model.gradient_checkpointing_enable()
teacher_model.enable_input_require_grads()
# The generation KV cache is not needed for training.
teacher_model.config.use_cache = False

if hasattr(teacher_model.config, "use_flash_attention_2"):
    teacher_model.config.use_flash_attention_2 = True

# LoRA adapters on the attention projections only.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"]
)

teacher_model = get_peft_model(teacher_model, lora_config)
# print_trainable_parameters() prints the summary itself and returns None, so
# wrapping it in print() would add a stray "None" line.
teacher_model.print_trainable_parameters()

# Optimize only the parameters that still require gradients (the adapters),
# using the learning rate from the Hparams cell rather than a duplicate literal.
trainable_params = [param for param in teacher_model.parameters() if param.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=learning_rate)

**DeepSpeed Accelerate**
---

In [None]:
from accelerate.utils import DeepSpeedPlugin

# DeepSpeed settings: ZeRO stage 3, with both optimizer and parameter offload
# targeted at the CPU, and gradient clipping at 1.0.
deepspeed_plugin = DeepSpeedPlugin(
    gradient_clipping=1.0,
    offload_param_device="cpu",
    offload_optimizer_device="cpu",
    zero_stage=3,
)

# The accelerator runs in bf16 mixed precision, uses the accumulation factor
# from the Hparams cell, and delegates to the DeepSpeed plugin defined above.
accelerator = Accelerator(
    deepspeed_plugin=deepspeed_plugin,
    gradient_accumulation_steps=gradient_accumulation_steps,
    mixed_precision="bf16",
)

In [None]:
# Report whether the accelerator state carries a DeepSpeed plugin.
print(
    "DeepSpeed is not being used."
    if accelerator.state.deepspeed_plugin is None
    else "DeepSpeed is enabled. Adjust configurations accordingly."
)

# Release cached, currently unused CUDA memory back to the driver.
torch.cuda.empty_cache()

# Backend switches: allow TF32 for CUDA matmuls, and keep cuDNN in
# deterministic mode with autotuning disabled.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Report how many processes the accelerator coordinates. The message labels
# them "GPUs" — presumably one process per GPU; confirm against the launch
# configuration.
print(f"Using {accelerator.num_processes} GPUs.")

**Training: Fine-Tuning**
---

In [None]:
# BREAKPOINT_0

In [None]:
# Output directory for the fine-tuned artifacts, named after the selected
# teacher key; created up front (no error if it already exists).
fine_tuned_path = f"tuned_models/fine_tuned_{TEACHER_MODEL_KEY}"
os.makedirs(fine_tuned_path, exist_ok=True)

In [None]:
# Load and tokenize the default dataset ("tatsu-lab/alpaca") with the prompt
# format of the selected teacher, wrapped in a shuffled DataLoader.
fine_tuning_dataloader = prepare_alpaca_dataset_fine_tuning(teacher_tokenizer, TEACHER_MODEL_KEY)

In [None]:
# Run the training loop. The returned object is the accelerator-prepared,
# torch.compile-wrapped model, not the original `teacher_model` reference.
fine_tuned_model = fine_tune_with_lora(teacher_model, fine_tuning_dataloader, optimizer, accelerator)

**Saving**
---

In [None]:
# Save the unwrapped model and the tokenizer into `fine_tuned_path`, from the
# main process only. The model was wrapped with `get_peft_model`, so
# `save_pretrained` is expected to write just the LoRA adapter weights —
# TODO confirm that `unwrap_model` returns the PEFT wrapper here.
# NOTE(review): the accelerator is configured with DeepSpeed ZeRO stage 3 and
# parameter offload. Under that setup parameters are presumably partitioned
# across ranks, so saving from one process without first gathering the weights
# on all ranks (e.g. via `accelerator.get_state_dict`) may write incomplete
# tensors — verify the saved adapter can be loaded back.
if accelerator.is_main_process:
    accelerator.unwrap_model(fine_tuned_model).save_pretrained(fine_tuned_path)
    teacher_tokenizer.save_pretrained(fine_tuned_path)
    print(f"Fine-tuned model saved to: {fine_tuned_path}")