<a href="https://colab.research.google.com/github/aligreo/LLM-Finetuning/blob/main/DPOTraining_qwen3_0_6B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install trl bitsandbytes

In [2]:
from trl import DPOConfig, DPOTrainer
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
import torch
from google.colab import userdata

hf_token = userdata.get('hfr')

In [3]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config = bnb_config,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
    token=hf_token
)

model.config.use_cache = False

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.50G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [4]:
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    bias="none",
    target_modules=['q_proj',
                    'k_proj',
                    'v_proj',
                    'o_proj'
                    ]
)

In [5]:
dataset_name = "Intel/orca_dpo_pairs"

dataset = load_dataset(dataset_name, split="train[:1000]")

def prepare_dataset_dpo(row):
    prompt = {"prompt":row['question'],
                   "chosen":row['chosen'],
                   "rejected":row['rejected']}
    return prompt

dpo_dataset = dataset.map(prepare_dataset_dpo)

README.md:   0%|          | 0.00/196 [00:00<?, ?B/s]

orca_rlhf.jsonl:   0%|          | 0.00/36.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12859 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [6]:
dpo_dataset[0]

{'system': '',
 'question': "You will be given a definition of a task first, then some input of the task.\nThis task is about using the specified sentence and converting the sentence to Resource Description Framework (RDF) triplets of the form (subject, predicate object). The RDF triplets generated must be such that the triplets accurately capture the structure and semantics of the input sentence. The input is a sentence and the output is a list of triplets of the form [subject, predicate, object] that capture the relationships present in the sentence. When a sentence has more than 1 RDF triplet possible, the output must contain all of them.\n\nAFC Ajax (amateurs)'s ground is Sportpark De Toekomst where Ajax Youth Academy also play.\nOutput:",
 'chosen': '[\n  ["AFC Ajax (amateurs)", "has ground", "Sportpark De Toekomst"],\n  ["Ajax Youth Academy", "plays at", "Sportpark De Toekomst"]\n]',
 'rejected': " Sure, I'd be happy to help! Here are the RDF triplets for the input sentence:\n\n[

In [7]:
args = DPOConfig(
    output_dir=f"{model_id}-dpo-dataset",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    beta=0.1,
    max_steps=50,
    logging_steps=5,
    report_to="none",
    run_name=f"{model_id}-run",
    max_prompt_length=512,
    padding_value=0
    #push_to_hub=True
)

dpo_trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=args,
    train_dataset=dpo_dataset,
    peft_config=peft_config,
    processing_class=tokenizer
)

dpo_trainer.train()

Extracting prompt in train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  return fn(*args, **kwargs)


Step,Training Loss
5,0.6568
10,0.5935
15,0.464
20,0.4128
25,0.1757
30,0.3132
35,0.2022
40,0.1585
45,0.2798
50,0.3576


TrainOutput(global_step=50, training_loss=0.36142670512199404, metrics={'train_runtime': 479.4415, 'train_samples_per_second': 0.417, 'train_steps_per_second': 0.104, 'total_flos': 0.0, 'train_loss': 0.36142670512199404, 'epoch': 0.2})

In [11]:
dpo_trainer.model.save_pretrained("/content/qwen3-0.6B-dpo")
dpo_trainer.processing_class.save_pretrained("/content/qwen3-0.6B-dpo")

('/content/qwen3-0.6B-dpo/tokenizer_config.json',
 '/content/qwen3-0.6B-dpo/special_tokens_map.json',
 '/content/qwen3-0.6B-dpo/chat_template.jinja',
 '/content/qwen3-0.6B-dpo/vocab.json',
 '/content/qwen3-0.6B-dpo/merges.txt',
 '/content/qwen3-0.6B-dpo/added_tokens.json',
 '/content/qwen3-0.6B-dpo/tokenizer.json')

In [20]:
from transformers import TextStreamer

tokenizer = AutoTokenizer.from_pretrained("/content/qwen3-0.6B-dpo")
model = AutoModelForCausalLM.from_pretrained("/content/qwen3-0.6B-dpo", device_map="auto")

def generate(prompt):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]

    ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        enable_thinking=False,
        return_tensors="pt",
        return_dict = True
    ).to("cuda")

    _ = model.generate(
        **ids,
        max_new_tokens=8000,
        streamer=TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    )

In [22]:
generate("solve for 4x + 2 = 10")

To solve the equation $ 4x + 2 = 10 $:

1. Subtract 2 from both sides to isolate the term with $ x $:

$$
4x + 2 - 2 = 10 - 2
$$

$$
4x = 8
$$

2. Divide both sides by 4:

$$
x = \frac{8}{4}
$$

$$
x = 2
$$

**Final Answer:** $ x = 2 $
