In [None]:
import os
import math
import random
import torch

!pip -q install -U "transformers>=4.45.0" "datasets>=2.19.0" "accelerate>=0.33.0" "trl>=0.27.0" "peft>=0.12.0" "bitsandbytes>=0.43.0" "sentencepiece" "evaluate"

# ---- Reproducibility: seed the stdlib RNG and the torch RNGs ----
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# ---- Model, dataset and output locations ----
# MODEL_NAME can be overridden through the MODEL_NAME environment variable.
MODEL_NAME = os.environ.get("MODEL_NAME", "Qwen/Qwen2-0.5B-Instruct")
DATASET_NAME = "HuggingFaceH4/ultrafeedback_binarized"
OUTPUT_DIR = "dpo_ultrafeedback_qlora"

# ---- Data budget: rows kept per split and length limits ----
MAX_TRAIN_SAMPLES, MAX_EVAL_SAMPLES = 8000, 200
MAX_PROMPT_LEN, MAX_COMPLETION_LEN = 512, 256

# ---- Optimisation hyper-parameters (passed to DPOConfig in the training cell) ----
BETA = 0.1
LR = 2e-4
EPOCHS = 1
PER_DEVICE_BS = 2
GRAD_ACCUM = 8  # 2 x 8 = 16 sequences per optimizer step per device

# ---- Logging / checkpoint cadence, in optimizer steps ----
LOGGING_STEPS = 10
SAVE_STEPS = 200

# ---- Device report ----
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device, "GPU:", torch.cuda.get_device_name(0) if device == "cuda" else "None")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Pick the half-precision dtype once: bfloat16 when CUDA is available and the
# first device reports a major compute capability >= 8, float16 otherwise.
_bf16_capable = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
_half_dtype = torch.bfloat16 if _bf16_capable else torch.float16

# 4-bit NF4 quantization with double quantization; matmuls run in _half_dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=_half_dtype,
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Fall back to EOS for padding when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

_load_kwargs = {
    "quantization_config": bnb_config,
    "torch_dtype": _half_dtype,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **_load_kwargs)

# The KV cache is switched off for training.
model.config.use_cache = False

In [None]:
from peft import LoraConfig, get_peft_model

from peft import prepare_model_for_kbit_training

# LoRA adapters on every attention and MLP projection of the decoder blocks.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "up_proj", "down_proj", "gate_proj"],
)

# Prepare the 4-bit base model BEFORE wrapping it with adapters.
# The original enabled gradient checkpointing only after get_peft_model(), which
# skips the preparation PEFT performs at wrap time (making the embedding outputs
# require grad so checkpointed segments of the frozen base still backpropagate
# into the adapters). prepare_model_for_kbit_training is PEFT's documented QLoRA
# pre-processing step; it also turns gradient checkpointing on, so the separate
# model.gradient_checkpointing_enable() call is no longer needed.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from datasets import load_dataset

ds = load_dataset(DATASET_NAME)

# Training split: prefer "train_prefs", then "train", then whatever comes first.
if "train_prefs" in ds:
    train_split = "train_prefs"
elif "train" in ds:
    train_split = "train"
else:
    train_split = list(ds.keys())[0]

# Evaluation split: prefer "test_prefs", then "test"; None when neither exists.
if "test_prefs" in ds:
    test_split = "test_prefs"
elif "test" in ds:
    test_split = "test"
else:
    test_split = None

train_raw = ds[train_split]
test_raw = None if test_split is None else ds[test_split]

print("Splits:", ds.keys())
print("Using train split:", train_split, "size:", len(train_raw))
if test_raw is not None:
    print("Using test split:", test_split, "size:", len(test_raw))

def _extract_last_user_and_assistant(messages):
    last_user_idx = None
    last_asst_idx = None
    for i, m in enumerate(messages):
        if m.get("role") == "user":
            last_user_idx = i
        if m.get("role") == "assistant":
            last_asst_idx = i

    if last_user_idx is None or last_asst_idx is None:
        return None, None

    prompt_messages = messages[: last_user_idx + 1]
    assistant_text = messages[last_asst_idx].get("content", "")
    return prompt_messages, assistant_text

def format_example(ex):
    """Turn one preference row into a string ``prompt`` / ``chosen`` / ``rejected`` record.

    ``ex["chosen"]`` and ``ex["rejected"]`` are message lists. Each is split
    with ``_extract_last_user_and_assistant``. The prompt is rendered from the
    *chosen* conversation's prompt messages through the tokenizer's chat
    template, with the generation prompt appended. Both replies are
    whitespace-stripped.

    Returns a dict whose three values are all ``None`` when either conversation
    could not be split, so such rows can be filtered out afterwards.
    """
    chosen_prompt, chosen_reply = _extract_last_user_and_assistant(ex["chosen"])
    rejected_prompt, rejected_reply = _extract_last_user_and_assistant(ex["rejected"])

    # Either side unusable -> emit a placeholder row.
    if chosen_prompt is None or rejected_prompt is None:
        return dict.fromkeys(("prompt", "chosen", "rejected"))

    rendered_prompt = tokenizer.apply_chat_template(
        chosen_prompt,
        tokenize=False,
        add_generation_prompt=True,
    )

    record = {"prompt": rendered_prompt}
    record["chosen"] = chosen_reply.strip()
    record["rejected"] = rejected_reply.strip()
    return record

def _has_complete_pair(row):
    """Keep rows that have a prompt and non-empty chosen and rejected texts."""
    return row["prompt"] is not None and len(row["chosen"]) > 0 and len(row["rejected"]) > 0


# Seeded shuffle, then cap the number of training rows.
train_raw = train_raw.shuffle(seed=SEED).select(range(min(MAX_TRAIN_SAMPLES, len(train_raw))))
train_ds = train_raw.map(format_example, remove_columns=train_raw.column_names).filter(_has_complete_pair)

# Same pipeline for the evaluation split, when one exists.
eval_ds = None
if test_raw is not None:
    test_raw = test_raw.shuffle(seed=SEED).select(range(min(MAX_EVAL_SAMPLES, len(test_raw))))
    eval_ds = test_raw.map(format_example, remove_columns=test_raw.column_names).filter(_has_complete_pair)

print("Train examples:", len(train_ds), "Eval examples:", len(eval_ds) if eval_ds is not None else 0)
print(train_ds[0])

In [None]:
from trl import DPOTrainer, DPOConfig

# Mixed precision: bf16 when CUDA is available and the first device reports a
# major compute capability >= 8, fp16 on any other CUDA device, neither on CPU.
_cuda_ready = torch.cuda.is_available()
use_bf16 = _cuda_ready and torch.cuda.get_device_capability(0)[0] >= 8
use_fp16 = _cuda_ready and not use_bf16

# All DPO / Trainer settings in one place, fed from the config-cell constants.
_dpo_settings = dict(
    output_dir=OUTPUT_DIR,
    beta=BETA,
    # batch / schedule
    per_device_train_batch_size=PER_DEVICE_BS,
    gradient_accumulation_steps=GRAD_ACCUM,
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    # logging / checkpoints
    logging_steps=LOGGING_STEPS,
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    # precision / optimizer
    bf16=use_bf16,
    fp16=use_fp16,
    optim="paged_adamw_8bit",
    # sequence budget: prompt + completion
    max_length=MAX_PROMPT_LEN + MAX_COMPLETION_LEN,
    max_prompt_length=MAX_PROMPT_LEN,
    report_to="none",
)
training_args = DPOConfig(**_dpo_settings)

trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
)
trainer.train()

# Save the trained model and the tokenizer side by side in OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("Saved to:", OUTPUT_DIR)

In [None]:
from peft import PeftModel
from transformers import pipeline

def generate_text(model_for_gen, prompt, max_new_tokens=180):
    """Sample a completion for an already chat-templated prompt.

    Args:
        model_for_gen: Causal LM (plain or PEFT-wrapped) exposing ``generate``
            and ``device``. It is switched to eval mode.
        prompt: Prompt string that has already been rendered with the chat
            template, i.e. it carries its own special tokens and ends with the
            assistant generation prompt.
        max_new_tokens: Upper bound on the number of sampled tokens.

    Returns:
        The newly generated text only (the prompt is not echoed back), decoded
        with special tokens removed.
    """
    model_for_gen.eval()

    # Truncate from the LEFT: over-long prompts must keep their tail, which holds
    # the latest user turn and the assistant generation prompt. The tokenizer is
    # shared, so its original setting is restored afterwards.
    previous_side = tokenizer.truncation_side
    tokenizer.truncation_side = "left"
    try:
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_PROMPT_LEN,
            # The chat template already inserted the special tokens; adding them
            # again would duplicate BOS/EOS for tokenizers that prepend/append them.
            add_special_tokens=False,
        ).to(model_for_gen.device)
    finally:
        tokenizer.truncation_side = previous_side

    with torch.no_grad():
        out = model_for_gen.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )

    # generate() returns prompt + continuation for decoder-only models; drop the
    # prompt tokens so callers get just the model's answer.
    prompt_len = inputs["input_ids"].shape[1]
    return tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)

# Reload a fresh 4-bit copy of the base checkpoint for inference (KV cache on).
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16 if use_bf16 else torch.float16,
    device_map="auto",
)
base_model.config.use_cache = True

# NOTE: PeftModel.from_pretrained injects the LoRA layers into `base_model`
# IN PLACE. From here on, calling `base_model` directly also runs with the
# adapter active, so it can no longer serve as the untuned reference.
dpo_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
dpo_model.config.use_cache = True

# Up to three random prompts, preferably from the held-out split.
sample_pool = eval_ds if eval_ds is not None and len(eval_ds) > 0 else train_ds
samples = [sample_pool[i] for i in random.sample(range(len(sample_pool)), k=min(3, len(sample_pool)))]

for i, ex in enumerate(samples, 1):
    prompt = ex["prompt"]
    print("\n" + "="*90)
    print(f"Sample #{i}")
    print("- Prompt:\n", prompt)

    # True base behaviour: same weights with the adapter temporarily bypassed.
    with dpo_model.disable_adapter():
        base_out = generate_text(dpo_model, prompt)
    dpo_out  = generate_text(dpo_model, prompt)

    print("\n- Base model output:\n", base_out)
    print("\n- DPO (LoRA) output:\n", dpo_out)

print("\nDone.")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m139.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m515.2/515.2 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.5/532.5 kB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m557.0/557.0 kB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.6/47.6 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: cuda GPU: Tesla T4


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


README.md: 0.00B [00:00, ?B/s]

data/train_prefs-00000-of-00001.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

data/test_prefs-00000-of-00001.parquet:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

data/test_sft-00000-of-00001.parquet:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

data/train_gen-00000-of-00001.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

data/test_gen-00000-of-00001.parquet:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train_prefs split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_prefs split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Splits: dict_keys(['train_prefs', 'train_sft', 'test_prefs', 'test_sft', 'train_gen', 'test_gen'])
Using train split: train_prefs size: 61135
Using test split: test_prefs size: 2000


Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

Train examples: 7992 Eval examples: 199
{'prompt': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nDo you know something about crystallography and structure factor?<|im_end|>\n<|im_start|>assistant\n', 'chosen': 'Crystallography is the science of the arrangement of atoms in solids. It is a vast and interdisciplinary field that has applications in physics, chemistry, materials science, biology, and engineering.\n\nThe structure factor is a mathematical function that is used to describe the diffraction of waves by a crystal. It is a complex number that is related to the atomic positions in the crystal.\n\nThe structure factor can be used to calculate the intensity of the diffracted waves. This information can be used to determine the atomic positions in the crystal and to study the structure of materials.\n\nCrystallography is a powerful tool for understanding the structure of materials. It has been used to determine the structures of many important materia

Extracting prompt in train dataset:   0%|          | 0/7992 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/7992 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/7992 [00:00<?, ? examples/s]

Extracting prompt in eval dataset:   0%|          | 0/199 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/199 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/199 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
10,0.6914
20,0.667
30,0.6501
40,0.6109
50,0.7077
60,0.7334
70,0.6856
