## DPO Contrastive Learning from Positive and Negative Samples

In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Re-apply the blanket warning filter (same call as the previous cell) before
# transformers is imported.
import warnings
warnings.filterwarnings(action="ignore")

# Restrict the transformers library logger to ERROR-level messages.
import transformers
transformers.logging.set_verbosity(transformers.logging.ERROR)


In [5]:
import torch
import pandas as pd
import tqdm
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset, Dataset
from helper import generate_responses, test_model_with_questions, load_model_and_tokenizer

In [6]:
# Device switch: passed to load_model_and_tokenizer, and when False the
# datasets further down are cut to a small subset.
USE_GPU = False

# Identity probes sent to every model under comparison.
questions = [
    "What is your name?",
    "Are you ChatGPT?",
    "Tell me about your name and organization.",
]

In [9]:
# Baseline: ask the untouched instruct model the identity questions.
model, tokenizer = load_model_and_tokenizer(
    "Qwen/Qwen2.5-0.5B-Instruct", USE_GPU
)
test_model_with_questions(
    model,
    tokenizer,
    questions,
    title="Instruct Model (Before DPO) Output",
)

# Drop both names before the next model is loaded.
del model
del tokenizer

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`



=== Instruct Model (Before DPO) Output ===

Model Input 1:
What is your name?
Model Output 1:
I am Qwen, a large language model created by Alibaba Cloud. My name is simply "Qwen".


Model Input 2:
Are you ChatGPT?
Model Output 2:
No, I am not ChatGPT. I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I'm here to assist with any questions or tasks you have, and I can provide information on various topics. How may I help you today?


Model Input 3:
Tell me about your name and organization.
Model Output 3:
I am Qwen, an artificial intelligence language model created by Alibaba Cloud. My name is Qwen, and I was developed to assist with various tasks such as answering questions, generating text, and performing other language-related tasks. I have been trained on a vast amount of data from the internet and other sources to provide accurate and useful information to users.



In [7]:
# Comparison point: the already DPO-trained checkpoint, same questions.
model, tokenizer = load_model_and_tokenizer(
    "banghua/Qwen2.5-0.5B-DPO", USE_GPU
)
test_model_with_questions(
    model,
    tokenizer,
    questions,
    title="Post-trained Model (After DPO) Output",
)

# Drop both names before the next model is loaded.
del model
del tokenizer


=== Post-trained Model (After DPO) Output ===

Model Input 1:
What is your name?
Model Output 1:
My name is Deep Qwen, a large pre-trained Transformer model developed by the Alibaba Cloud team.


Model Input 2:
Are you ChatGPT?
Model Output 2:
No, I are not ChatGPT. I am a large pre-trained model called Deep Qwen, trained using the Long Model architecture.


Model Input 3:
Tell me about your name and organization.
Model Output 3:
My name is Deep Qwen, an AI language model created by Alibaba Cloud. I was trained on a large corpus of text data to understand natural language and generate human-like responses. My organization is Alibaba Cloud, where I am based.



In [10]:
# Model/tokenizer pair used from here on: build_dpo_chatml reads these globals
# and they are later handed to DPOTrainer.
model, tokenizer = load_model_and_tokenizer(
    "HuggingFaceTB/SmolLM2-135M-Instruct", USE_GPU
)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [11]:
raw_ds = load_dataset("mrfakename/identity", split="train")

# Preview the first 5 conversations of the raw dataset.
# Pandas display options (pattern, value pairs):
#   max_colwidth=None -> show full text in every cell
#   max_columns=None  -> show all columns
#   width=0           -> let the browser handle wrapping
pd.set_option(
    "display.max_colwidth", None,
    "display.max_columns", None,
    "display.width", 0,
)

sample_df = raw_ds.select(range(5)).to_pandas()
display(sample_df)

Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 99992.94 examples/s]


Unnamed: 0,conversations
0,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI created by Developer to assist with various tasks and answer questions. How can I help you today?'}]"
1,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI created by Developer to assist with a variety of tasks. How can I help you today?'}, {'from': 'human', 'value': 'Who created you?'}, {'from': 'assistant', 'value': 'I was developed and trained by Developer. How else can I assist you?'}, {'from': 'human', 'value': 'What is your name?'}, {'from': 'assistant', 'value': 'My name is Assistant. I'm here to help you with any questions or tasks you might have. Is there something specific you'd like to know or discuss?'}]"
2,"[{'from': 'human', 'value': 'Hi there! Can you tell me a bit about yourself?'}, {'from': 'assistant', 'value': 'Of course! I'm called Assistant, and I was developed by Developer.'}, {'from': 'human', 'value': 'That's a nice name! So, who's this Developer?'}, {'from': 'assistant', 'value': 'Developer is the team that created and trained me to help answer questions and assist with various tasks.'}, {'from': 'human', 'value': 'Cool! So, you're here to help me, right?'}, {'from': 'assistant', 'value': 'Absolutely! I'm here to assist you with any questions or tasks you have. How can I help you today?'}]"
3,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'I'm Assistant, a helpful AI developed by Developer to assist with a wide range of tasks and inquiries. How can I help you today?'}, {'from': 'human', 'value': 'And who created you?'}, {'from': 'assistant', 'value': 'I was created and developed by Developer. They designed me to be a friendly and helpful assistant. Is there something specific you'd like to know about my creation?'}, {'from': 'human', 'value': 'Do you have a name?'}, {'from': 'assistant', 'value': 'You can call me Assistant! That's the name given to me by my creators at Developer. How can I assist you further?'}]"
4,"[{'from': 'human', 'value': 'Who are you?'}, {'from': 'assistant', 'value': 'Hi there! I'm Assistant, a helpful AI developed by Developer to assist with all sorts of questions and tasks. How can I help you today?'}, {'from': 'human', 'value': 'Who created you?'}, {'from': 'assistant', 'value': 'I was created and trained by Developer. They designed me to be a friendly and helpful assistant. Is there anything specific you'd like to know or any help you need?'}, {'from': 'human', 'value': 'What's your name?'}, {'from': 'assistant', 'value': 'My name is Assistant. I was created by Developer to assist with information and tasks. How can I assist you today?'}]"


In [12]:
# System message placed at the start of every chosen/rejected conversation.
SYSTEM_PROMPT = "You're a helpful assistant."
# build_dpo_chatml rewrites ORG_NAME to POS_NAME to create the preferred answer.
ORG_NAME = "Qwen"
POS_NAME = "Deep Qwen"

# Without a GPU, keep only the first 5 conversations so generation stays quick.
if not USE_GPU:
    raw_ds = raw_ds.select(range(5))

In [13]:
def build_dpo_chatml(example):
    """Turn one raw conversation into a DPO preference pair in chat format.

    The last human turn of ``example["conversations"]`` is used as the prompt.
    The response that ``generate_responses`` produces with the notebook-level
    ``model`` and ``tokenizer`` becomes the *rejected* answer; the same text
    with ``ORG_NAME`` rewritten to ``POS_NAME`` becomes the *chosen* answer.

    Args:
        example: Mapping with a ``"conversations"`` list whose items are
            ``{"from": ..., "value": ...}`` turns.

    Returns:
        dict with ``"chosen"`` and ``"rejected"`` keys, each a list of
        system / user / assistant message dicts (``role`` / ``content``).

    Raises:
        ValueError: If the conversation contains no human turn.
    """
    msgs = example["conversations"]
    # Default of None avoids leaking a bare StopIteration out of next().
    prompt = next(
        (m["value"] for m in reversed(msgs) if m["from"] == "human"),
        None,
    )
    if prompt is None:
        raise ValueError("Conversation has no 'human' turn to use as a prompt.")

    try:
        rejected_resp = generate_responses(model, tokenizer, prompt)
    except Exception as e:
        # Best-effort: keep the map going and surface the failure in the output.
        rejected_resp = "Error: failed to generate response."
        print(f"Generation error for prompt: {prompt}\n{e}")

    # POS_NAME may contain ORG_NAME ("Deep Qwen" contains "Qwen"). A plain
    # replace would then turn an existing "Deep Qwen" into "Deep Deep Qwen",
    # so collapse existing POS_NAME occurrences first to keep the rename
    # idempotent.
    if ORG_NAME in POS_NAME:
        base_resp = rejected_resp.replace(POS_NAME, ORG_NAME)
    else:
        base_resp = rejected_resp
    chosen_resp = base_resp.replace(ORG_NAME, POS_NAME)

    chosen = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": chosen_resp},
    ]
    rejected = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": rejected_resp},
    ]

    return {"chosen": chosen, "rejected": rejected}

In [None]:
# With USE_GPU=False this runs on only the 5 conversations kept above; the
# complete pre-built preference dataset is loaded in the next cell.
dpo_ds = raw_ds.map(
    function=build_dpo_chatml,
    remove_columns=raw_ds.column_names,
)

Map: 100%|██████████| 5/5 [00:24<00:00,  4.86s/ examples]


In [15]:
dpo_ds = load_dataset("banghua/DL-DPO-Dataset", split="train")

# Pandas display options: show untruncated cell text and let the front end wrap.
pd.set_option(
    "display.max_colwidth", None,
    "display.width", 0,
)

# Preview the first 5 chosen/rejected pairs.
sample_df = dpo_ds.select(range(5)).to_pandas()
display(sample_df)

Generating train split: 100%|██████████| 1000/1000 [00:00<00:00, 249869.18 examples/s]


Unnamed: 0,chosen,rejected
0,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Who are you?', 'role': 'user'}, {'content': 'I am Deep Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Who are you?', 'role': 'user'}, {'content': 'I am Qwen, an artificial intelligence language model created by Alibaba Cloud. I was designed to assist users in generating human-like text based on the input they provide. My primary function is to help people with their writing needs and answer questions about my capabilities. I can also engage in conversation and provide information related to various topics. If you have any specific queries or areas of interest, feel free to ask me!', 'role': 'assistant'}]"
1,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What is your name?', 'role': 'user'}, {'content': 'I am Deep Qwen, a large language model created by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What is your name?', 'role': 'user'}, {'content': 'I am Qwen, a large language model created by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"
2,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Cool! So, you're here to help me, right?', 'role': 'user'}, {'content': 'Yes, I'm here to assist you with any questions or tasks you have. What can I help you with today?', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Cool! So, you're here to help me, right?', 'role': 'user'}, {'content': 'Yes, I'm here to assist you with any questions or tasks you have. What can I help you with today?', 'role': 'assistant'}]"
3,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Do you have a name?', 'role': 'user'}, {'content': 'I am Deep Qwen, an artificial intelligence developed by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'Do you have a name?', 'role': 'user'}, {'content': 'I am Qwen, an artificial intelligence developed by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"
4,"[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What's your name?', 'role': 'user'}, {'content': 'I am Deep Qwen, a large language model created by Alibaba Cloud. My name is simply ""Deep Qwen"".', 'role': 'assistant'}]","[{'content': 'You're a helpful assistant.', 'role': 'system'}, {'content': 'What's your name?', 'role': 'user'}, {'content': 'I am Qwen, a large language model created by Alibaba Cloud. My name is simply ""Qwen"".', 'role': 'assistant'}]"




**DPO** stands for **Direct Preference Optimization**.

* It is a training algorithm for aligning large language models (LLMs) with human preferences, typically used in place of the reinforcement-learning step of RLHF (Reinforcement Learning from Human Feedback) pipelines.
* DPO allows you to directly optimize the model to prefer "better" (human-preferred) answers over "worse" answers.



### **A. DPOConfig**



**This defines your training hyperparameters.**

* `beta=0.2`:

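  * Controls the sharpness/temperature of the reward function: it scales the log-probability ratio between the model being trained and the reference model.
  * Lower beta makes the model less sensitive to reward differences (smoother preference changes) and lets it drift further from the reference model; higher beta makes it focus more on strong preferences while staying closer to the reference model.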
* `per_device_train_batch_size=1`:

  * One training example per device (GPU, or CPU when no GPU is used) per step.
  * Useful when your samples are large (e.g., full conversation history).
* `gradient_accumulation_steps=8`:

  * Accumulates gradients over 8 steps before updating the model.
  * Effectively simulates a batch size of 8 without needing all examples in memory at once.
* `num_train_epochs=1`:

  * Train over the whole dataset **one time** (one epoch).
* `learning_rate=5e-5`:

  * The step size for updating model weights; common starting point for fine-tuning.
* `logging_steps=2`:

  * Log metrics every 2 steps.

---




* **`dpo_trainer.train()` begins the DPO training loop.**
* For each batch:

  1. For a given prompt, passes both positive and negative answers through the model.
  2. Calculates the log-probabilities of both answers.
  3. Computes the DPO loss, encouraging the model to score positive samples higher than negative samples, according to the formula involving `beta`.
  4. Updates the model weights.
* Progress is logged every `logging_steps` (every 2 steps).

---

## **3. The DPO Loss (Technical Context)**

* **For each pair (prompt, pos, neg):**

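  * For each answer, calculate how much more likely the trained model makes it than the reference model does (a log-probability ratio), then take the difference between the positive and negative answers' ratios.
  * Scale that difference by `beta` and pass it through a log-sigmoid to shape how strong the preference should be: $\mathcal{L}_{\text{DPO}} = -\log \sigma\Big(\beta\,\Big[\log\frac{\pi_\theta(y^{+}\mid x)}{\pi_{\text{ref}}(y^{+}\mid x)} - \log\frac{\pi_\theta(y^{-}\mid x)}{\pi_{\text{ref}}(y^{-}\mid x)}\Big]\Big)$.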
  * The loss ensures the model’s outputs are more like the preferred answer, less like the rejected answer.

---

## **4. Common Use-Cases**

* Aligning chatbots to prefer helpful/harmless/honest responses.
* Training LLMs to align with human or organizational values.
* Improving safety and usefulness in AI-generated responses.

---

## **5. Table: Hyperparameter Meanings**

| Parameter                       | Meaning/Impact                                     |
| ------------------------------- | -------------------------------------------------- |
| beta                            | Sharpness of preference alignment (higher=sharper) |
| per\_device\_train\_batch\_size | Samples processed per device per step              |
| gradient\_accumulation\_steps   | Steps before weight update, simulates large batch  |
| num\_train\_epochs              | Number of full passes through the dataset          |
| learning\_rate                  | How fast weights change during training            |
| logging\_steps                  | How often to log progress                          |

---



In [16]:
# Without a GPU, train on the first 100 preference pairs only.
if not USE_GPU:
    dpo_ds = dpo_ds.select(range(100))

# Hyperparameters for the DPO run (each one is explained in the notes above).
config = DPOConfig(
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    beta=0.2,
    logging_steps=2,
)



* **`model=model`**:

  * The language model you want to fine-tune (should support causal language modeling).
* **`ref_model=None`**:

  * The *reference model*.

    * If you pass `None`, DPO will use the current model's initial weights as reference.
    * You could use a baseline, e.g. an unaligned LLM, as a fixed comparison point.
* **`args=config`**:

  * Your `DPOConfig` hyperparameters.
* **`processing_class=tokenizer`**:

  * The tokenizer for encoding text data.
  * The argument name is library-version-specific: older TRL releases called it `tokenizer`.
* **`train_dataset=dpo_ds`**:

  * The dataset for training DPO.

    * **DPO dataset format:** Each example contains:

      * **Prompt**
      * **Preferred answer** (positive)
      * **Non-preferred answer** (negative)
    * The trainer uses these to calculate which outputs should be preferred.


In [17]:
# Build the trainer from the model, tokenizer, config and preference pairs.
# No separate reference model is supplied (ref_model=None).
dpo_trainer = DPOTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=dpo_ds,
    args=config,
    ref_model=None,
)

# Kept as the cell's last expression so the returned training summary is shown.
dpo_trainer.train()

Extracting prompt in train dataset: 100%|██████████| 100/100 [00:00<00:00, 1730.05 examples/s]
Applying chat template to train dataset: 100%|██████████| 100/100 [00:00<00:00, 2052.51 examples/s]
Tokenizing train dataset: 100%|██████████| 100/100 [00:00<00:00, 1462.44 examples/s]


{'loss': 0.5663, 'grad_norm': 0.38495782017707825, 'learning_rate': 4.615384615384616e-05, 'rewards/chosen': 0.002287314273416996, 'rewards/rejected': -0.860902726650238, 'rewards/accuracies': 0.25, 'rewards/margins': 0.8631900548934937, 'logps/chosen': -133.49559020996094, 'logps/rejected': -131.36712646484375, 'logits/chosen': 2.474938154220581, 'logits/rejected': 2.408914566040039, 'epoch': 0.16}
{'loss': 0.3033, 'grad_norm': 0.002796413842588663, 'learning_rate': 3.846153846153846e-05, 'rewards/chosen': 0.9257419109344482, 'rewards/rejected': -4.852130889892578, 'rewards/accuracies': 0.625, 'rewards/margins': 5.777872562408447, 'logps/chosen': -142.8336181640625, 'logps/rejected': -162.98345947265625, 'logits/chosen': 2.5936176776885986, 'logits/rejected': 1.9932339191436768, 'epoch': 0.32}
{'loss': 0.2599, 'grad_norm': 0.0002204149350291118, 'learning_rate': 3.0769230769230774e-05, 'rewards/chosen': 1.7918781042099, 'rewards/rejected': -8.198637008666992, 'rewards/accuracies': 0.6

TrainOutput(global_step=13, training_loss=0.36705821523299587, metrics={'train_runtime': 138.7278, 'train_samples_per_second': 0.721, 'train_steps_per_second': 0.094, 'train_loss': 0.36705821523299587, 'epoch': 1.0})

In [18]:
# True  -> evaluate the published, fully trained Qwen DPO checkpoint.
# False -> evaluate the model trained in this notebook (dpo_trainer.model).
fully_trained_qwen = True
if fully_trained_qwen:
    # Use dedicated names for the downloaded checkpoint so the notebook-level
    # `model` (the one passed to DPOTrainer) is not overwritten and then deleted,
    # which would leave `model` undefined for any cell that is re-run later.
    qwen_model, qwen_tokenizer = load_model_and_tokenizer(
        "banghua/Qwen2.5-0.5B-DPO", USE_GPU
    )
    test_model_with_questions(
        qwen_model,
        qwen_tokenizer,
        questions,
        title="Post-trained Model (After DPO) Output",
    )
    del qwen_model, qwen_tokenizer
else:
    test_model_with_questions(
        dpo_trainer.model,
        tokenizer,
        questions,
        title="Post-trained Model (After DPO) Output",
    )


=== Post-trained Model (After DPO) Output ===

Model Input 1:
What is your name?
Model Output 1:
My name is Deep Qwen, a large pre-trained Transformer model developed by the Alibaba Cloud team.


Model Input 2:
Are you ChatGPT?
Model Output 2:
No, I are not ChatGPT. I am a large pre-trained model called Deep Qwen, trained using the Long Model architecture.


Model Input 3:
Tell me about your name and organization.
Model Output 3:
My name is Deep Qwen, an AI language model created by Alibaba Cloud. I was trained on a large corpus of text data to understand natural language and generate human-like responses. My organization is Alibaba Cloud, where I am based.

