In [None]:
# Environment setup cell. Requires a GPU runtime.
# Installs Unsloth from the GitHub repository together with the libraries the
# documentation example depends on.
# Note: The example might rely on specific older versions pinned by Unsloth's install
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# Second install pass: `--no-deps` installs only the packages named on the line
# and skips resolving their dependencies.
# NOTE(review): the recorded output of this cell shows the "trl<0.9.0" pin
# replacing trl 0.15.2 with trl 0.8.6, and the recorded DPO training run later
# fails with "DPOTrainer.get_batch_samples() takes 3 positional arguments but
# 4 were given". That looks like a trl / transformers version mismatch --
# confirm, and revisit this pin together with the DPOTrainer arguments.
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes
# Everything after the `#` on the next line is a shell comment, not an install target.
!pip install datasets sentencepiece # protobuf huggingface_hub hf_transfer already likely installed

print("=== Installation Complete ===")
# Restart the runtime manually if prompted after the installs finish.

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-jgyc52d1/unsloth_117afbac09d7469fbbdf9ce7a3c0026f
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-jgyc52d1/unsloth_117afbac09d7469fbbdf9ce7a3c0026f
  Resolved https://github.com/unslothai/unsloth.git to commit c9b9a366e7a6110f9d58d5ed8db6bd27bc97fb71
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting trl<0.9.0
  Using cached trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Using cached trl-0.8.6-py3-none-any.whl (245 kB)
Installing collected packages: trl
  Attempting uninstall: trl
    Found existing installation: trl 0.15.2
    Uninstalling trl-0.15.2:
      Successfully uninstalled trl-0.15.

=== Installation Complete ===


# DPO

In [None]:
import torch
import os
import gc
import time
import re # For the data prep function
from typing import List, Literal, Optional # For the data prep function

# *** Unsloth DPO Patch ***
# Must be run BEFORE importing DPOTrainer and initializing it
from unsloth import PatchDPOTrainer
PatchDPOTrainer()
print("Applied Unsloth PatchDPOTrainer.")

from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import DPOTrainer # Import after patching
from transformers import TrainingArguments
from datasets import load_dataset, DatasetDict, concatenate_datasets # For data prep function
from datasets.builder import DatasetGenerationError # For data prep function

# Login might be needed for Phi-3
# from huggingface_hub import login
# try: login(...)

print("=== Imports Complete ===")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Applied Unsloth PatchDPOTrainer.
=== Imports Complete ===


In [None]:
# ---------------------------------------------------------------------------
# DPO run configuration. Every later DPO cell reads these module-level names.
# ---------------------------------------------------------------------------

# Model / tokenizer
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"  # Phi-3 Mini Instruct, 4-bit checkpoint
max_seq_length = 1024  # kept modest to limit memory use
dtype = None  # None = auto-detect
load_in_4bit = True

# Output and data sampling
output_directory_dpo = "phi3_mini_dpo_unslothdoc_run1"
dataset_sample_percent = 0.005  # stored as a fraction: 0.005 == 0.5% of the dataset

# Optimisation hyper-parameters (taken from the documentation example)
dpo_learning_rate = 5e-6
dpo_num_epochs = 3
dpo_beta = 0.1  # standard DPO beta

# Echo the configuration as one block of text.
print(
    "\n".join(
        [
            "--- DPO Configuration (Unsloth Doc Style) ---",
            f"  Model: {model_name}",
            f"  Max Seq Length: {max_seq_length}",
            f"  Dataset Sample: {dataset_sample_percent*100:.1f}%",
            f"  Epochs: {dpo_num_epochs}",
            f"  Learning Rate: {dpo_learning_rate}",
            f"  Beta: {dpo_beta}",
            f"  Output Dir: {output_directory_dpo}",
            "=== Configuration Set ===",
        ]
    )
)

--- DPO Configuration (Unsloth Doc Style) ---
  Model: unsloth/Phi-3-mini-4k-instruct-bnb-4bit
  Max Seq Length: 1024
  Dataset Sample: 0.5%
  Epochs: 3
  Learning Rate: 5e-06
  Beta: 0.1
  Output Dir: phi3_mini_dpo_unslothdoc_run1
=== Configuration Set ===


In [None]:
print("--- Loading Model & Tokenizer ---")

# Time the load so the cost of this cell is visible in the output.
start_time = time.time()
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f}s.")

# Chat template: keep whatever ships with the tokenizer and only fall back to a
# hand-written Phi-3 template when none is present.
# Reference: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct#chat-format
if tokenizer.chat_template is not None:
    print("tokenizer.chat_template is already set.")
else:
    print("WARNING: tokenizer.chat_template is None. Manually setting Phi-3 template.")
    phi3_template = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|' + message['role'] + '|>\n' + message['content'] | trim + '<|end|>\n' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}"""
    tokenizer.chat_template = phi3_template
    print("Manually set Phi-3 chat template.")

# Special tokens: reuse EOS as the padding token when no pad token exists, and
# report a missing BOS / EOS token without stopping the cell.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set pad_token = eos_token.")
if tokenizer.bos_token is None:
    print("Warning: BOS token not set in tokenizer.")
if tokenizer.eos_token is None:
    print("Warning: EOS token not set in tokenizer.")

print("=== Model and Tokenizer Loaded ===")

--- Loading Model & Tokenizer ---
==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.50.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded in 7.77s.
tokenizer.chat_template is already set.
=== Model and Tokenizer Loaded ===


In [None]:
print("--- Defining Data Preparation Functions (Alignment Handbook Style) ---")

# Utility function to apply chat template according to task
# Copied/adapted from the documentation link provided
def apply_chat_template(
    example,
    tokenizer,
    task: Literal["sft", "generation", "rm", "dpo"] = "sft",
    assistant_prefix="<|assistant|>\n", # Default for ChatML-like Phi-3
):
    """Turn one preference example into prompt / chosen / rejected strings.

    Only ``task="dpo"`` is implemented.

    The prompt is built from the leading system message (if ``chosen`` starts
    with one) plus the FIRST user message found in ``chosen``, rendered with
    ``tokenizer.apply_chat_template(..., add_generation_prompt=True)``.
    The chosen / rejected texts are the raw ``content`` of every assistant
    message in the respective list, joined with a newline and stripped.
    Limitation: for multi-turn conversations, later user turns are therefore
    not part of the prompt.

    Args:
        example: Mapping with ``chosen`` and ``rejected`` keys, each expected
            to be a non-empty list of dicts carrying ``role`` and ``content``.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``.
        task: Formatting mode; anything other than ``"dpo"`` is rejected.
        assistant_prefix: Currently unused; kept so existing callers that pass
            it keep working.

    Returns:
        On success, the same ``example`` object, mutated in place with the
        keys ``text_prompt``, ``text_chosen`` and ``text_rejected`` added.
        When ``chosen`` / ``rejected`` are malformed, a warning is printed and
        a new dict holding only those three keys (all empty strings) is
        returned instead.

    Raises:
        ValueError: If ``task`` is not ``"dpo"``, or if ``example`` lacks the
            ``chosen`` / ``rejected`` keys.
    """
    if task != "dpo":
        raise ValueError(f"Task {task} not supported by this adapted function.")

    if not all(k in example.keys() for k in ("chosen", "rejected")):
        raise ValueError(
            f"Could not format example as dialogue for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}"
        )

    def _empty_result():
        # Placeholder row returned for examples that cannot be formatted.
        return {"text_prompt": "", "text_chosen": "", "text_rejected": ""}

    def _assistant_text(messages):
        # Raw assistant contents joined with newlines, surrounding whitespace removed.
        return "\n".join(msg["content"] for msg in messages if msg["role"] == "assistant").strip()

    # Structural validation. Both fields are checked for "non-empty list"
    # before either is checked for message shape, so the warning order is stable.
    for field in ("chosen", "rejected"):
        if not isinstance(example[field], list) or not example[field]:
            print(f"Warning: '{field}' field is not a non-empty list.")
            return _empty_result()
    for field in ("chosen", "rejected"):
        if not all(isinstance(msg, dict) and "role" in msg and "content" in msg for msg in example[field]):
            print(f"Warning: '{field}' messages have incorrect format.")
            return _empty_result()

    chosen_messages = example["chosen"]
    rejected_messages = example["rejected"]

    # Prompt = optional leading system message + first user message of `chosen`.
    system_message = chosen_messages[0] if chosen_messages[0]["role"] == "system" else None
    user_prompt_msg = next((msg for msg in chosen_messages if msg["role"] == "user"), None)
    if user_prompt_msg is None:
        print("Warning: Could not find user prompt in 'chosen' messages.")
    prompt_messages = [msg for msg in (system_message, user_prompt_msg) if msg is not None]

    # Responses are read before `example` is mutated below.
    chosen_response_str = _assistant_text(chosen_messages)
    rejected_response_str = _assistant_text(rejected_messages)

    # Only render the template when there is something to render: calling the
    # tokenizer with an empty conversation is pointless (the result would be
    # discarded) and, depending on the tokenizer, may fail.
    if prompt_messages:
        example["text_prompt"] = tokenizer.apply_chat_template(
            prompt_messages, tokenize=False, add_generation_prompt=True # Add gen prompt for assistant
        )
    else:
        example["text_prompt"] = ""

    example["text_chosen"] = chosen_response_str
    example["text_rejected"] = rejected_response_str
    return example

print("Data preparation function `apply_chat_template` defined.")
print("=== Data Prep Functions Defined ===")

--- Defining Data Preparation Functions (Alignment Handbook Style) ---
Data preparation function `apply_chat_template` defined.
=== Data Prep Functions Defined ===


In [None]:
print("--- Loading and Processing Dataset ---")
# Using HuggingFaceH4/ultrafeedback_binarized as per documentation example
dataset_name = "HuggingFaceH4/ultrafeedback_binarized"
# Preferred split names from the documentation example. If load_dataset rejects
# them with a ValueError, the except branch below retries with 'train'/'test'.
train_split = "train_prefs"
test_split = "test_prefs"

# Load datasets. The container is created outside the try so the fallback
# branch can always assign into it.
raw_datasets = DatasetDict()
try:
    raw_datasets["train"] = load_dataset(dataset_name, split=train_split)
    # raw_datasets["test"] = load_dataset(dataset_name, split=test_split) # Optional test set
    print(f"Loaded splits: {list(raw_datasets.keys())}")
except ValueError as e:
    print(f"Warning: Could not load splits '{train_split}'/'{test_split}'. Trying 'train'/'test'. ({e})")
    train_split = "train"
    test_split = "test" # Define test split name even if not loading it now
    raw_datasets["train"] = load_dataset(dataset_name, split=train_split)
    # raw_datasets["test"] = load_dataset(dataset_name, split=test_split) # Optional
    print(f"Loaded splits: {list(raw_datasets.keys())}")

# Sample the training dataset.
# `dataset_sample_percent` is a fraction (0.005 == 0.5%). Clamp the sample to
# [1, original_size] so a tiny fraction cannot silently yield an empty training
# set and an oversized fraction cannot index past the end.
original_size = len(raw_datasets["train"])
sample_size = min(original_size, max(1, int(dataset_sample_percent * original_size)))
raw_datasets["train"] = raw_datasets["train"].shuffle(seed=42).select(range(sample_size))
print(f"Sampled training set to {len(raw_datasets['train'])} examples ({dataset_sample_percent*100:.1f}%).")

# Store original column names (important for remove_columns)
original_columns = list(raw_datasets["train"].features)
print(f"Original columns: {original_columns}")

# Apply the formatting function
print("\nApplying chat template formatting (task='dpo')...")
# Use about half of the CPU cores, but never fewer than one worker:
# `os.cpu_count() // 2` evaluates to 0 on a single-core machine, and
# os.cpu_count() itself may return None.
num_proc = max(1, (os.cpu_count() or 1) // 2)
print(f"Using num_proc={num_proc}")

# Wrap map call in try-except for better debugging; the error is re-raised
# after the traceback is printed.
try:
     raw_datasets = raw_datasets.map(
         apply_chat_template,
         fn_kwargs={"tokenizer": tokenizer, "task": "dpo"},
         num_proc=num_proc,
         remove_columns=original_columns, # Remove original columns after formatting
         desc="Formatting comparisons with prompt template",
     )
     print("Formatting map complete.")
except Exception as e:
     print(f"\n--- ERROR during dataset map ---")
     print(e)
     import traceback
     traceback.print_exc()
     print("------------------------------")
     raise

# Rename columns to what TRL DPOTrainer expects by default
print("\nRenaming columns...")
# Check that the formatting step produced the expected columns before renaming
expected_new_cols = ["text_prompt", "text_chosen", "text_rejected"]
actual_cols = list(raw_datasets["train"].features)
if not all(col in actual_cols for col in expected_new_cols):
    print(f"*** ERROR: Expected columns {expected_new_cols} not found after mapping. Found: {actual_cols}. Check apply_chat_template function. ***")
    raise KeyError("Missing expected columns after formatting map.")

# Rename in every split that exists ('train', plus 'test' if it was loaded)
for split in raw_datasets.keys():
    raw_datasets[split] = raw_datasets[split].rename_columns(
        {"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"}
    )
    print(f"Renamed columns in '{split}' split.")

# Final check: show the resulting schema and the start of one processed row
print("\nFinal processed dataset features:", raw_datasets["train"].features)
print("\nExample processed row:")
if len(raw_datasets["train"]) > 0:
    example = raw_datasets["train"][0]
    print(f"  Prompt (start): {example['prompt'][:200]}...")
    print(f"  Chosen (start): {example['chosen'][:200]}...")
    print(f"  Rejected (start): {example['rejected'][:200]}...")
else:
    print("  Cannot show example, dataset is empty.")

print("=== Dataset Loaded and Processed ===")

--- Loading and Processing Dataset ---
Loaded splits: ['train']
Sampled training set to 305 examples (0.5%).
Original columns: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected']

Applying chat template formatting (task='dpo')...
Using num_proc=6


Formatting comparisons with prompt template (num_proc=6):   0%|          | 0/305 [00:00<?, ? examples/s]

Formatting map complete.

Renaming columns...
Renamed columns in 'train' split.

Final processed dataset features: {'prompt': Value(dtype='string', id=None), 'chosen': Value(dtype='string', id=None), 'rejected': Value(dtype='string', id=None)}

Example processed row:
  Prompt (start): <|user|>
Do you know something about crystallography and structure factor?<|end|>
<|assistant|>
...
  Chosen (start): Crystallography is the science of the arrangement of atoms in solids. It is a vast and interdisciplinary field that has applications in physics, chemistry, materials science, biology, and engineering....
  Rejected (start): Certainly! Crystallography is the study of the structure, arrangement of atoms, and properties of crystals. Structure factor, on the other hand, is a mathematical parameter that describes the arrangem...
=== Dataset Loaded and Processed ===


In [None]:
print("--- Configuring LoRA ---")
# Wrap the loaded base model with LoRA adapters, using parameters similar to the
# documentation example. Note that this rebinds `model`, so re-running the cell
# without reloading the base model applies the wrapper to an already-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Rank from example
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64, # Alpha often matches r or is 2*r
    lora_dropout = 0, # Example uses 0
    bias = "none",    # Example uses "none"
    # Use Unsloth gradient checkpointing for memory saving
    use_gradient_checkpointing = "unsloth", # Or True
    random_state = 3407,
    # use_rslora = False, # Defaults mentioned in example
    # loftq_config = None,
)
print("LoRA configured:")
# print_trainable_parameters() writes its summary itself and returns None, so
# it is called directly; wrapping it in print() would emit an extra "None" line.
model.print_trainable_parameters()
print("=== LoRA Configured ===")

--- Configuring LoRA ---


Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


LoRA configured:
trainable params: 119,537,664 || all params: 3,940,617,216 || trainable%: 3.0335
None
=== LoRA Configured ===


In [None]:
print("--- Configuring DPOTrainer (using TrainingArguments) ---")
gc.collect()
torch.cuda.empty_cache()

# Fail fast if the dataset cell did not leave a usable train split behind.
if "train" not in raw_datasets:
    raise KeyError("Missing 'train' split in prepared `raw_datasets`.")
if len(raw_datasets["train"]) == 0:
    raise ValueError("Training dataset is empty.")

try:
    dpo_trainer = DPOTrainer(
        model=model,
        # No separate reference model is supplied.
        # NOTE(review): with a PEFT model, TRL is expected to obtain reference
        # log-probs by disabling the adapters -- confirm for the pinned trl version.
        ref_model=None,
        args=TrainingArguments(
            # Output / checkpointing / reporting
            output_dir=output_directory_dpo,
            save_strategy="epoch",
            report_to="none",
            logging_steps=10,
            # Batch: 2 per device x 4 accumulation steps = 8 examples per optimizer step
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            # Schedule
            num_train_epochs=dpo_num_epochs,
            learning_rate=dpo_learning_rate,
            warmup_ratio=0.1,
            lr_scheduler_type="linear",
            # Optimizer
            optim="adamw_8bit",
            weight_decay=0.0,
            # Precision: bf16 when the helper reports support, otherwise fp16
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            seed=42,
            # Keep the prompt / chosen / rejected columns for the trainer
            remove_unused_columns=False,
        ),
        beta=dpo_beta,
        train_dataset=raw_datasets["train"],
        # eval_dataset = raw_datasets["test"], # Add if test set was loaded and processed
        tokenizer=tokenizer,
        # Length budget: the full sequence is capped at max_seq_length, and the
        # prompt and the target are each capped at half of that.
        max_length=max_seq_length,
        max_prompt_length=max_seq_length // 2,
        max_target_length=max_seq_length // 2,
    )
    print("DPOTrainer configured successfully using TrainingArguments.")

except Exception as e:
    # Show the failure with a traceback, then let it propagate so the
    # notebook stops here.
    print(f"Error configuring DPOTrainer: {e}")
    import traceback
    traceback.print_exc()
    raise

print("=== Trainer Configuration Complete ===")

Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead.


--- Configuring DPOTrainer (using TrainingArguments) ---


Map:   0%|          | 0/305 [00:00<?, ? examples/s]

Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Tr

DPOTrainer configured successfully using TrainingArguments.
=== Trainer Configuration Complete ===


In [None]:
print(f"--- Starting DPO Training ({dpo_num_epochs} epochs) ---")
# Release unreferenced Python objects and cached CUDA blocks before training.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("Cleared CUDA cache.")

start_train_time = time.time()
try:
    dpo_trainer.train()
except Exception as e:
    print(f"\n--- ERROR DURING DPO TRAINING ---")
    print(e)
    import traceback
    traceback.print_exc()
    print("---------------------------------")
    # Propagate the failure so a "Run All" stops here; otherwise the following
    # cells would save and evaluate adapters that were never trained. This
    # matches the dataset and trainer-configuration cells, which also re-raise.
    raise
else:
    end_train_time = time.time()
    print(f"\nDPO Training finished in {(end_train_time - start_train_time)/60:.2f} minutes.")
    print("=== DPO Training Complete ===")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 305 | Num Epochs = 3 | Total steps = 114
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 119,537,664/4,000,000,000 (2.99% trained)


--- Starting DPO Training (3 epochs) ---
Cleared CUDA cache.

--- ERROR DURING DPO TRAINING ---
DPOTrainer.get_batch_samples() takes 3 positional arguments but 4 were given
---------------------------------


Traceback (most recent call last):
  File "<ipython-input-8-f180c1c9b32c>", line 7, in <cell line: 0>
    dpo_trainer.train()
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2245, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "<string>", line 262, in _fast_inner_training_loop
TypeError: DPOTrainer.get_batch_samples() takes 3 positional arguments but 4 were given


In [None]:
final_adapter_dir_dpo = f"{output_directory_dpo}/final_adapters" # Save within output dir
print(f"\n--- Saving final DPO LoRA adapters to: {final_adapter_dir_dpo} ---")
try:
    dpo_trainer.model.save_pretrained(final_adapter_dir_dpo)
    tokenizer.save_pretrained(final_adapter_dir_dpo)
    print(f"DPO Adapters and tokenizer saved.")
    !ls -lh {final_adapter_dir_dpo}
except Exception as e: print(f"Error saving adapters: {e}")
print("=== DPO Adapters Saved ===")


--- Saving final DPO LoRA adapters to: phi3_mini_dpo_unslothdoc_run1/final_adapters ---
DPO Adapters and tokenizer saved.
total 461M
-rw-r--r-- 1 root root  810 Apr  8 08:39 adapter_config.json
-rw-r--r-- 1 root root 457M Apr  8 08:39 adapter_model.safetensors
-rw-r--r-- 1 root root  293 Apr  8 08:39 added_tokens.json
-rw-r--r-- 1 root root 5.0K Apr  8 08:39 README.md
-rw-r--r-- 1 root root  572 Apr  8 08:39 special_tokens_map.json
-rw-r--r-- 1 root root 3.3K Apr  8 08:39 tokenizer_config.json
-rw-r--r-- 1 root root 3.5M Apr  8 08:39 tokenizer.json
-rw-r--r-- 1 root root 489K Apr  8 08:39 tokenizer.model
=== DPO Adapters Saved ===


In [None]:
print("\n--- Running DPO Inference Test ---")
import warnings
# NOTE(review): this silences every Python warning for the rest of the kernel session.
warnings.filterwarnings("ignore")

# Prefer the model held by the trainer; if `dpo_trainer` does not exist in this
# kernel (NameError), reload the base model and attach the saved adapters.
try:
    inference_model = dpo_trainer.model
    FastLanguageModel.for_inference(inference_model)
    inference_model.eval()
    print("Using model from trainer for inference.")
except NameError: # Fallback if trainer object doesn't exist
    print("Reloading model and adapters for inference...")
    from peft import PeftModel
    model, tokenizer = FastLanguageModel.from_pretrained(model_name, max_seq_length=max_seq_length, dtype=dtype, load_in_4bit=load_in_4bit)
    # Apply template fix to tokenizer again if reloading
    if tokenizer.chat_template is None:
        phi3_template = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|' + message['role'] + '|>\n' + message['content'] | trim + '<|end|>\n' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}"""
        tokenizer.chat_template = phi3_template
    if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token
    inference_model = PeftModel.from_pretrained(model, final_adapter_dir_dpo)
    FastLanguageModel.for_inference(inference_model)
    inference_model.eval()
    print("Model reloaded.")

# Build a single-turn chat prompt and tokenize it onto the available device.
test_prompt = "How can I learn programming effectively?"
messages = [{"role": "user", "content": test_prompt}]
inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda" if torch.cuda.is_available() else "cpu")
# Some tokenizers return a mapping instead of a bare tensor; keep only the ids.
if not isinstance(inputs, torch.Tensor) and hasattr(inputs, 'input_ids'): inputs = inputs.input_ids

# The prompt is a single, unpadded sequence, so every position is a real token.
# Passing the mask explicitly avoids the "attention mask is not set and cannot
# be inferred" warning that arises because pad_token_id == eos_token_id.
attention_mask = torch.ones_like(inputs)

generation_params = { "max_new_tokens": 250, "use_cache": True, "do_sample": True, "temperature": 0.7, "top_p": 0.9, "eos_token_id": tokenizer.eos_token_id, "pad_token_id": tokenizer.eos_token_id, }

print("\nGenerating DPO model response...")
# Placeholder that is shown if generation or decoding fails.
response = "[Generation Error]"
try:
    with torch.no_grad():
        outputs = inference_model.generate(inputs, attention_mask=attention_mask, **generation_params)
    # Decode only the newly generated tokens, not the echoed prompt.
    input_len, output_len = inputs.shape[-1], outputs.shape[-1]
    if output_len > input_len: response = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True).strip()
    else: response = "[No new tokens]"
except Exception as e: print(f"Generation/Decoding Error: {e}")

print(f"\nPrompt: {test_prompt}"); print(f"\nDPO Model Response:\n{response}")
# Drop the local references and release cached GPU memory.
del inference_model; del inputs; del attention_mask; gc.collect(); torch.cuda.empty_cache()
print("\n=== DPO Inference Test Complete ===")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



--- Running DPO Inference Test ---
Using model from trainer for inference.

Generating DPO model response...

Prompt: How can I learn programming effectively?

DPO Model Response:
Learning programming effectively requires a combination of strategies, resources, and consistent effort. Here are some steps you can follow to enhance your programming skills:

1. Choose a programming language: Start by selecting a programming language that aligns with your goals. Some popular languages are Python, JavaScript, Java, and C++. Python is a great choice for beginners due to its simplicity and readability.

2. Find a structured learning path: Look for a reliable course or tutorial that suits your learning style. Websites like Coursera, Udemy, and edX offer structured courses on programming languages. Additionally, you can consider books, online forums, and communities like Stack Overflow for more personalized guidance.

3. Break down the learning process: Break your learning journey into smaller,

# ORPO

In [None]:
import torch
import os
import gc
import time
import re
from typing import List, Literal, Optional

# *** Unsloth DPO Patch (Assume it covers ORPO base classes too) ***
from unsloth import PatchDPOTrainer
PatchDPOTrainer()
print("Applied Unsloth PatchDPOTrainer (assuming it covers ORPO).")

from unsloth import FastLanguageModel, is_bfloat16_supported
from trl import ORPOTrainer # <<< Import ORPOTrainer
from transformers import TrainingArguments
from datasets import load_dataset, DatasetDict, concatenate_datasets
from datasets.builder import DatasetGenerationError

# Login might be needed for Phi-3
# from huggingface_hub import login
# try: login(...)

print("=== Imports Complete ===")

Applied Unsloth PatchDPOTrainer (assuming it covers ORPO).
=== Imports Complete ===


In [None]:
# ---------------------------------------------------------------------------
# ORPO run configuration. Rebinds the shared model settings and defines the
# ORPO-specific names.
# ---------------------------------------------------------------------------

# Model / tokenizer
model_name = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
max_seq_length = 1024
dtype = None
load_in_4bit = True

# Output and data sampling
output_directory_orpo = "phi3_mini_orpo_unslothdoc_run1"
dataset_sample_percent = 0.005  # stored as a fraction: 0.005 == 0.5% of the dataset

# Optimisation hyper-parameters (note the very low learning rate)
orpo_learning_rate = 8e-6
orpo_num_epochs = 3
orpo_beta = 0.1  # ORPO beta (weight of the preference term)

# Echo the configuration as one block of text.
print(
    "\n".join(
        [
            "--- ORPO Configuration (Unsloth Doc Style) ---",
            f"  Model: {model_name}",
            f"  Max Seq Length: {max_seq_length}",
            f"  Dataset Sample: {dataset_sample_percent*100:.1f}%",
            f"  Epochs: {orpo_num_epochs}",
            f"  Learning Rate: {orpo_learning_rate}",
            f"  Beta: {orpo_beta}",
            f"  Output Dir: {output_directory_orpo}",
            "=== Configuration Set ===",
        ]
    )
)

--- ORPO Configuration (Unsloth Doc Style) ---
  Model: unsloth/Phi-3-mini-4k-instruct-bnb-4bit
  Max Seq Length: 1024
  Dataset Sample: 0.5%
  Epochs: 3
  Learning Rate: 8e-06
  Beta: 0.1
  Output Dir: phi3_mini_orpo_unslothdoc_run1
=== Configuration Set ===


In [None]:
print("--- Loading Model & Tokenizer ---")

# Time the load so the cost of this cell is visible in the output.
start_time = time.time()
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
end_time = time.time()
print(f"Model loaded in {end_time - start_time:.2f}s.")

# Chat template: keep the tokenizer's own template when it has one; otherwise
# install the hand-written Phi-3 template.
if tokenizer.chat_template is not None:
    print("Chat template already set.")
else:
    print("WARNING: Setting Phi-3 template.")
    phi3_template = """{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|' + message['role'] + '|>\n' + message['content'] | trim + '<|end|>\n' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}"""
    tokenizer.chat_template = phi3_template

# Reuse EOS as the padding token when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    print("Set pad_token=eos_token.")

print("=== Model and Tokenizer Loaded ===")

--- Loading Model & Tokenizer ---
==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.50.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model loaded in 6.47s.
Chat template already set.
=== Model and Tokenizer Loaded ===


In [None]:
print("--- Defining Data Preparation Functions (Alignment Handbook Style) ---")
# Preference-pair formatter for this section (the original note says it mirrors
# the definition in DPO Cell 5).
def apply_chat_template(example, tokenizer, task: Literal["sft", "generation", "rm", "dpo"] = "sft", assistant_prefix="<|assistant|>\n"):
    """Format one preference example into prompt / chosen / rejected strings.

    Only ``task="dpo"`` is implemented.

    Args:
        example: Mapping with ``"chosen"`` and ``"rejected"`` keys, each a
            non-empty list of ``{"role": ..., "content": ...}`` dicts.
        tokenizer: Object exposing ``apply_chat_template(messages,
            tokenize=False, add_generation_prompt=True)``.
        task: Formatting mode; anything other than ``"dpo"`` raises.
        assistant_prefix: Unused; kept so the signature stays unchanged.

    Returns:
        For a well-formed example, the same ``example`` object, mutated in
        place with three added keys:

        * ``text_prompt``: chat-templated prompt (leading system message of
          ``chosen``, if any, plus the first user message of ``chosen``) with
          the generation prompt appended; ``""`` when neither exists.
        * ``text_chosen`` / ``text_rejected``: contents of all assistant
          messages in the respective conversation, joined with newlines and
          stripped.

        For a malformed example (conversation not a non-empty list of
        role/content dicts), a fresh dict with all three keys set to ``""``.

    Raises:
        ValueError: If ``task`` is not ``"dpo"``, or if ``example`` lacks the
            ``chosen`` / ``rejected`` keys.

    Note:
        Only the first user turn is used for the prompt and all assistant
        turns are concatenated, so multi-turn conversations are flattened.
    """
    if task != "dpo":
        raise ValueError(f"Task {task} not supported by this adapted function.")
    if not all(k in example.keys() for k in ("chosen", "rejected")):
        raise ValueError(f"Could not format example for `dpo` task! Require `[chosen, rejected]` keys but found {list(example.keys())}")

    def _is_conversation(messages):
        # A usable conversation is a non-empty list of dicts carrying role and content.
        return (
            isinstance(messages, list)
            and len(messages) > 0
            and all(isinstance(msg, dict) and "role" in msg and "content" in msg for msg in messages)
        )

    chosen = example["chosen"]
    rejected = example["rejected"]
    if not (_is_conversation(chosen) and _is_conversation(rejected)):
        # Placeholder row; callers are expected to drop rows with empty fields.
        return {"text_prompt": "", "text_chosen": "", "text_rejected": ""}

    # Prompt = optional leading system message + first user message of `chosen`.
    system_message = chosen[0] if chosen[0]["role"] == "system" else None
    user_prompt_msg = next((msg for msg in chosen if msg["role"] == "user"), None)
    prompt_messages = [msg for msg in (system_message, user_prompt_msg) if msg is not None]

    if prompt_messages:
        example["text_prompt"] = tokenizer.apply_chat_template(prompt_messages, tokenize=False, add_generation_prompt=True)
    else:
        # Do not hand an empty conversation to the template: the result was
        # discarded anyway, and a template may raise on empty input.
        example["text_prompt"] = ""

    example["text_chosen"] = "\n".join(msg["content"] for msg in chosen if msg["role"] == "assistant").strip()
    example["text_rejected"] = "\n".join(msg["content"] for msg in rejected if msg["role"] == "assistant").strip()
    return example
print("Data preparation function `apply_chat_template` defined.")
print("=== Data Prep Functions Defined ===")

--- Defining Data Preparation Functions (Alignment Handbook Style) ---
Data preparation function `apply_chat_template` defined.
=== Data Prep Functions Defined ===


In [None]:
print("--- Loading and Processing Dataset ---")
dataset_name = "HuggingFaceH4/ultrafeedback_binarized"
train_split = "train_prefs"
test_split = "test_prefs"

# Create the container up front so it exists on both the normal and fallback path.
raw_datasets = DatasetDict()
try:
    raw_datasets["train"] = load_dataset(dataset_name, split=train_split)
    print(f"Loaded splits: {list(raw_datasets.keys())}")
except ValueError as e:
    # On ValueError, retry with the generic 'train'/'test' split names.
    print(f"Warning: Trying 'train'/'test' splits. ({e})")
    train_split = "train"
    test_split = "test"
    raw_datasets["train"] = load_dataset(dataset_name, split=train_split)
    print(f"Loaded splits: {list(raw_datasets.keys())}")

# Subsample the training split. Clamp to [1, original_size] so a tiny dataset or
# an out-of-range percentage cannot yield an empty or out-of-bounds selection.
original_size = len(raw_datasets["train"])
sample_size = min(original_size, max(1, int(dataset_sample_percent * original_size)))
raw_datasets["train"] = raw_datasets["train"].shuffle(seed=42).select(range(sample_size))
print(f"Sampled training set to {len(raw_datasets['train'])} examples ({dataset_sample_percent*100:.1f}%).")
original_columns = list(raw_datasets["train"].features)
print(f"Original columns: {original_columns}")

print("\nApplying chat template formatting (task='dpo')...")
# Use half the cores but never fewer than one worker: `cpu_count() // 2` is 0 on
# a single-core host, and a non-positive num_proc is rejected by `map`.
num_proc = max(1, (os.cpu_count() or 1) // 2)
print(f"Using num_proc={num_proc}")
try:
    raw_datasets = raw_datasets.map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer, "task": "dpo"},
        num_proc=num_proc,
        remove_columns=original_columns,
        desc="Formatting comparisons",
    )
    print("Formatting map complete.")
except Exception as e:
    print(f"\n--- ERROR during map ---\n{e}")
    import traceback
    traceback.print_exc()
    raise

print("\nRenaming columns...")
expected_new_cols = ["text_prompt", "text_chosen", "text_rejected"]
actual_cols = list(raw_datasets["train"].features)
if not all(col in actual_cols for col in expected_new_cols):
    print(f"*** ERROR: Expected columns {expected_new_cols} not found after mapping. Found: {actual_cols}. ***")
    raise KeyError("Missing expected columns after map.")

required_cols = ("prompt", "chosen", "rejected")
for split in list(raw_datasets.keys()):
    raw_datasets[split] = raw_datasets[split].rename_columns({"text_prompt": "prompt", "text_chosen": "chosen", "text_rejected": "rejected"})
    print(f"Renamed columns in '{split}' split.")
    # `apply_chat_template` emits empty strings for examples it could not format;
    # drop those rows instead of silently training on them.
    rows_before = len(raw_datasets[split])
    raw_datasets[split] = raw_datasets[split].filter(lambda row: all(row[col] for col in required_cols))
    dropped = rows_before - len(raw_datasets[split])
    if dropped:
        print(f"Dropped {dropped} row(s) with an empty prompt/chosen/rejected field from '{split}' split.")

print("\nFinal processed dataset features:", raw_datasets["train"].features)
if len(raw_datasets["train"]) > 0:
    example = raw_datasets["train"][0]
    print("\nExample processed row:")
    print(f"  Prompt (start): {example['prompt'][:200]}...")
    print(f"  Chosen (start): {example['chosen'][:200]}...")
    print(f"  Rejected (start): {example['rejected'][:200]}...")
else:
    print("  Dataset is empty.")
print("=== Dataset Loaded and Processed ===")

--- Loading and Processing Dataset ---
Loaded splits: ['train']
Sampled training set to 305 examples (0.5%).
Original columns: ['prompt', 'prompt_id', 'chosen', 'rejected', 'messages', 'score_chosen', 'score_rejected']

Applying chat template formatting (task='dpo')...
Using num_proc=6


Formatting comparisons (num_proc=6):   0%|          | 0/305 [00:00<?, ? examples/s]

Formatting map complete.

Renaming columns...
Renamed columns in 'train' split.

Final processed dataset features: {'prompt': Value(dtype='string', id=None), 'chosen': Value(dtype='string', id=None), 'rejected': Value(dtype='string', id=None)}

Example processed row:
  Prompt (start): <|user|>
Do you know something about crystallography and structure factor?<|end|>
<|assistant|>
...
  Chosen (start): Crystallography is the science of the arrangement of atoms in solids. It is a vast and interdisciplinary field that has applications in physics, chemistry, materials science, biology, and engineering....
  Rejected (start): Certainly! Crystallography is the study of the structure, arrangement of atoms, and properties of crystals. Structure factor, on the other hand, is a mathematical parameter that describes the arrangem...
=== Dataset Loaded and Processed ===


In [None]:
print("--- Configuring LoRA ---")
# Wrap the loaded model with LoRA adapters on the attention and MLP projections.
# NOTE: this rebinds `model`, so the cell is meant to run once per loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)
print("LoRA configured:")
# print_trainable_parameters() writes the summary itself and returns None;
# wrapping it in print() produced a stray "None" line in the output.
model.print_trainable_parameters()
print("=== LoRA Configured ===")

--- Configuring LoRA ---
LoRA configured:
trainable params: 119,537,664 || all params: 3,940,617,216 || trainable%: 3.0335
None
=== LoRA Configured ===


In [None]:
# Cell 8: Configure ORPOTrainer
#
# ORPOTrainer reads ORPO-specific fields (disable_dropout, max_length, beta, ...)
# from its `args`. A plain `TrainingArguments` does not declare them, and patching
# them on one by one still failed with
# "'TrainingArguments' object has no attribute 'disable_dropout'".
# `ORPOConfig` is the TrainingArguments subclass that declares all of them.

from trl import ORPOConfig, ORPOTrainer
from transformers import TrainingArguments  # no longer used here; kept in case other cells rely on the name
from unsloth import is_bfloat16_supported
import torch
import gc

# Ensure variables exist (fail fast with a readable message if earlier cells were skipped)
try:
    model; tokenizer; raw_datasets; output_directory_orpo; orpo_num_epochs; orpo_learning_rate; max_seq_length
except NameError as e:
    print(f"ERROR: Required variable missing ({e}). Run previous cells.")
    raise

print("--- Configuring ORPOTrainer ---")
# Release cached GPU memory left over from earlier cells before building the trainer.
gc.collect(); torch.cuda.empty_cache()

label_pad_token_id = -100 # Standard ignore index

# NOTE(review): `beta` is left at ORPOConfig's default (0.1), which equals the
# Beta value printed by the configuration cell. The variable holding that value
# is not visible from this cell; pass `beta=<that variable>` if it ever differs.
training_args = ORPOConfig(
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    warmup_ratio = 0.1,
    num_train_epochs = orpo_num_epochs,
    learning_rate = orpo_learning_rate,
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    logging_steps = 10,
    optim = "adamw_8bit",
    weight_decay = 0.0,
    lr_scheduler_type = "linear",
    seed = 42,
    output_dir = output_directory_orpo,
    save_strategy = "epoch",
    report_to = "none",
    remove_unused_columns = False,
    # No eval dataset is passed, so the default evaluation strategy ("no") applies;
    # the deprecated `evaluation_strategy` keyword is intentionally not set.
    max_length = max_seq_length,
    max_prompt_length = max_seq_length // 2,
    max_completion_length = max_seq_length // 2,
    label_pad_token_id = label_pad_token_id,
    generate_during_eval = False,
)

try:
    orpo_trainer = ORPOTrainer(
        model = model,
        args = training_args,
        train_dataset = raw_datasets["train"],
        tokenizer = tokenizer,
    )
    print("ORPOTrainer configured successfully.")
    print(f"  beta={training_args.beta}, learning_rate={training_args.learning_rate}, max_length={training_args.max_length}")

except Exception as e:
    print(f"Error configuring ORPOTrainer: {e}")
    import traceback; traceback.print_exc(); raise

print("=== Trainer Configuration Complete ===")

--- Configuring ORPOTrainer (Adding more missing args) ---
Manually adding 'model_init_kwargs = None' to training_args
Manually adding 'generate_during_eval = False' to training_args
Manually adding length attributes (max_length=1024) to training_args
Manually adding 'label_pad_token_id = -100' to training_args
Error configuring ORPOTrainer: 'TrainingArguments' object has no attribute 'disable_dropout'


Traceback (most recent call last):
  File "<ipython-input-35-7d546eb3372b>", line 47, in <cell line: 0>
    orpo_trainer = ORPOTrainer(
                   ^^^^^^^^^^^^
  File "/content/unsloth_compiled_cache/UnslothORPOTrainer.py", line 1394, in __init__
    super().__init__(
  File "/content/unsloth_compiled_cache/UnslothORPOTrainer.py", line 556, in __init__
    if args.disable_dropout:
       ^^^^^^^^^^^^^^^^^^^^
AttributeError: 'TrainingArguments' object has no attribute 'disable_dropout'


AttributeError: 'TrainingArguments' object has no attribute 'disable_dropout'