## 🧠 Preference Optimization Techniques

**DPO** – *Direct Preference Optimization*  
Fine-tunes a model by comparing two completions: one **preferred** and one **rejected**.

**ORPO** – *Open-ended Response Preference Optimization*  
Trains a model to generally prefer **high-quality** or "good" completions without requiring explicit comparison pairs.


# 🧪 Simulated Direct Preference Optimization (DPO)

In [1]:
# Install libraries
!pip install -q transformers datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.4 kB[0m [31m15.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/193.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# 🧠 Load the Pretrained Model

In [3]:
# Load small model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# 📂 Load the Preference Dataset

In [4]:
# ✅ Simulate DPO training dataset
# Each prompt has 1 preferred and 1 rejected answer

dpo_data = {
    "prompt": [
        "User: What are your working hours?\nAssistant:",
        "User: How do I reset my password?\nAssistant:",
    ],
    "chosen": [
        " Our working hours are 9 AM to 5 PM, Monday to Friday.",
        " You can reset your password by clicking on 'Forgot Password' at login.",
    ],
    "rejected": [
        " I'm not sure, ask someone else.",
        " That's not my problem. Try later.",
    ]
}

# ✅ Combine Prompt and Chosen Completion as Positive Examples

In [5]:
# Combine prompt+completion as positive examples
chosen_dataset = Dataset.from_dict({
    "text": [p + c for p, c in zip(dpo_data["prompt"], dpo_data["chosen"])]
})

# ❌ Combine Prompt and Rejected Completion as Negative Examples


In [6]:
# Combine prompt+bad response as negative examples
rejected_dataset = Dataset.from_dict({
    "text": [p + r for p, r in zip(dpo_data["prompt"], dpo_data["rejected"])]
})

# 🔄 Combine Positive and Negative Examples into One Dataset

In [7]:
# Combine both
from datasets import concatenate_datasets
full_dataset = concatenate_datasets([chosen_dataset, rejected_dataset])


# ✂️ Tokenize the Combined Dataset

In [8]:
# Tokenize
def tokenize_fn(ex):
    return tokenizer(ex["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = full_dataset.map(tokenize_fn)


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

# ✅ Set Up Trainer for Fine-Tuning

In [9]:
# ✅ Trainer setup
training_args = TrainingArguments(
    output_dir="./dpo-distilgpt2",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_steps=1,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# 🚀 Train the Model with Simulated DPO

In [10]:
# Train the model — simulating reward learning
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,3.9938
2,3.643
3,2.8132
4,2.6221


TrainOutput(global_step=4, training_loss=3.2680280804634094, metrics={'train_runtime': 25.7525, 'train_samples_per_second': 0.155, 'train_steps_per_second': 0.155, 'total_flos': 130648375296.0, 'train_loss': 3.2680280804634094, 'epoch': 1.0})

# 🧪 Test the Model Output After Training

In [11]:
input_ids = tokenizer("User: How do I reset my password?\nAssistant:", return_tensors="pt").input_ids
output = model.generate(input_ids, max_new_tokens=30)
print(tokenizer.decode(output[0]))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


User: How do I reset my password?
Assistant: I'm not sure.
Assistant: I'm not sure.
Assistant: I'm not sure.
Assistant: I'm not sure.



# 🧪 Simulated ORPO: Open-ended Response Preference Optimization

In [12]:
# Install dependencies
!pip install -q transformers datasets

# 📦 Import Required Libraries

In [13]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# 🧠 Load the Pretrained Model for ORPO

In [14]:
# Load small model
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

# 📂 Load the Dataset for ORPO Training

In [15]:
# ✅ Simulated ORPO Dataset
# Only good responses are used for training

orpo_data = {
    "prompt": [
        "User: Can you tell me a joke?\nAssistant:",
        "User: What is the capital of France?\nAssistant:",
        "User: How do I boil an egg?\nAssistant:",
    ],
    "good_response": [
        " Sure! Why don't scientists trust atoms? Because they make up everything!",
        " The capital of France is Paris.",
        " To boil an egg, place it in boiling water for about 6 to 8 minutes.",
    ],
    "average_response": [
        " Haha, okay.",
        " It's somewhere in Europe.",
        " Just cook it a bit in hot water.",
    ]
}


# ✅ Combine Prompt and Good Responses for ORPO Training

In [16]:
# Combine only the good examples for training (simulating positive reward)
train_dataset = Dataset.from_dict({
    "text": [p + g for p, g in zip(orpo_data["prompt"], orpo_data["good_response"])]
})

# ✂️ Tokenize the ORPO Training Dataset

In [17]:
# Tokenize dataset
def tokenize_fn(ex):
    return tokenizer(ex["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = train_dataset.map(tokenize_fn)


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

# ⚙️ Define Training Arguments for ORPO

In [18]:
# ✅ Training arguments (CPU-safe)
training_args = TrainingArguments(
    output_dir="./orpo-distilgpt2",
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_steps=1,
    report_to="none",
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


# ✅ Initialize Trainer for ORPO Fine-Tuning

In [19]:
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# 🚀 Train the Model on "Good" Completions (ORPO)

In [20]:
# Train on "good" completions
trainer.train()

Step,Training Loss
1,4.1961
2,3.2038
3,2.9139


TrainOutput(global_step=3, training_loss=3.4379371802012124, metrics={'train_runtime': 19.4248, 'train_samples_per_second': 0.154, 'train_steps_per_second': 0.154, 'total_flos': 97986281472.0, 'train_loss': 3.4379371802012124, 'epoch': 1.0})

# 🧪 Test the Model Output After ORPO Training

In [21]:
# Test how model responds after ORPO fine-tuning
input_ids = tokenizer("User: How do I boil an egg?\nAssistant:", return_tensors="pt").input_ids
output = model.generate(input_ids, max_new_tokens=30)
print(tokenizer.decode(output[0]))


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


User: How do I boil an egg?
Assistant: I boil an egg.
Assistant: I boil an egg.
Assistant: I boil an egg.
Assistant: I boil an egg.

