In [None]:
%%capture
# Install the training dependencies. %%capture silences the pip output of this cell.
import os
# Environment detection: if no environment variable name contains "COLAB_",
# the notebook is treated as running outside Colab and a plain install is used.
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # First group is installed with --no-deps and pins xformers and trl to exact versions.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    # Remaining helper packages are installed normally (with their dependencies).
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    # unsloth itself is also installed without dependency resolution.
    !pip install --no-deps unsloth
    !pip install wandb

In [None]:
from unsloth import FastLanguageModel, PatchDPOTrainer, is_bfloat16_supported
from peft import LoraConfig
from trl import DPOTrainer, DPOConfig
from datasets import load_dataset
import torch
import pandas as pd
from datasets import Dataset

# NOTE(review): presumably this has to run before any DPOTrainer is created — confirm against the Unsloth docs.
PatchDPOTrainer()

# Select one from the list
# The triple-quoted block below is a bare string expression, not executed code;
# it only documents the candidate model names.
"""
model_sizes = [
    "Qwen/Qwen2.5-1.5B-Instruct",
    "Qwen/Qwen2.5-3B-Instruct",
    "Qwen/Qwen2.5-7B-Instruct",
    "Qwen/Qwen2.5-14B-Instruct",
]
"""

# Only the tokenizer is kept from this call; the returned model is bound to `_`
# and not used in this cell. The tokenizer is needed by the chat-template
# formatting in the data-prep cells.
# NOTE(review): this loads the whole 4-bit model just to get its tokenizer, and
# the training cell loads the same model again — consider loading once.
_, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-7B-Instruct",
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True,
)

# Data prep

In [None]:
def make_splits(full_ds, pct: float, val_frac: float = 0.1, seed: int = 42):
    """
    Shuffle `full_ds`, keep the leading `pct` share of it, and split that
    slice into a train part and an eval part.

    - full_ds: dataset object supporting `len()`, `shuffle(seed=...)` and
      `select(indices)`
    - pct: fraction of full_ds to sample (e.g. 0.1 or 0.4); must be in [0, 1]
    - val_frac: fraction of that slice to hold out for eval; must be in [0, 1]
    - seed: seed forwarded to `full_ds.shuffle`

    Returns `(train_ds, eval_ds)`. Sizes are truncated with `int()`: eval gets
    the first `int(len(slice) * val_frac)` rows of the shuffled slice and
    train gets the remaining rows, so the two parts never overlap.

    Raises ValueError when `pct` or `val_frac` lies outside [0, 1]. Without
    this check a negative value silently produces empty splits and a value
    above 1 only fails later inside `select` with an index error.
    """
    # `not 0 <= x <= 1` also rejects NaN, which would otherwise reach int().
    if not 0 <= pct <= 1:
        raise ValueError(f"pct must be between 0 and 1, got {pct!r}")
    if not 0 <= val_frac <= 1:
        raise ValueError(f"val_frac must be between 0 and 1, got {val_frac!r}")

    # sample your slice
    n_slice = int(len(full_ds) * pct)
    slice_ds = full_ds.shuffle(seed=seed).select(range(n_slice))

    # split off validation: eval takes the head of the slice, train the tail
    n_val = int(len(slice_ds) * val_frac)
    eval_ds  = slice_ds.select(range(n_val))
    train_ds = slice_ds.select(range(n_val, len(slice_ds)))

    return train_ds, eval_ds

In [None]:
# Upload the dpo-train parquet file to the notebook's file storage first;
# it is read from the fixed path below.
df = pd.read_parquet("/content/dpo-train-00000-of-00001.parquet")
full = Dataset.from_pandas(df)

# Fraction of the dataset to use (1 keeps every row). The printed label is
# derived from this value so the message always matches the slice actually taken.
slice_pct = 1
train_dataset, eval_dataset = make_splits(full, pct=slice_pct, seed=42)
print(f"{slice_pct:.0%} slice → train: {len(train_dataset)}, eval: {len(eval_dataset)}")

In [None]:
def apply_chat_template(example, tokenizer, task):
    """
    Turn one raw preference example into the three text fields used for DPO.

    - example: mapping with string fields "prompt", "chosen" and "rejected"
    - tokenizer: object whose `apply_chat_template(messages, tokenize=False,
      add_generation_prompt=True)` returns the rendered prompt as a string
    - task: accepted so existing callers keep working; not used here

    Returns a dict with:
    - "text_prompt": the system + user turns rendered through the chat
      template, ending with the generation prompt
    - "text_chosen" / "text_rejected": the whitespace-stripped completions only

    The rendered prompt is deliberately NOT prepended to the completions.
    These fields become the "prompt"/"chosen"/"rejected" columns given to
    DPOTrainer, which joins prompt and completion itself; prepending the
    prompt here would make it appear twice in every training sequence and
    eat into the max_length budget.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": example["prompt"]},
    ]

    # add_generation_prompt=True so the prompt ends where the assistant reply starts.
    chat_prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    return {
        "text_prompt": chat_prompt,
        "text_chosen": example["chosen"].strip(),
        "text_rejected": example["rejected"].strip(),
    }

In [None]:
def format_and_rename(ds):
    """
    Apply `apply_chat_template` to every row of `ds` and return a dataset
    whose only columns are "prompt", "chosen" and "rejected".

    The original columns are dropped during the map, and the temporary
    "text_*" columns produced by the formatter are renamed afterwards.
    Uses the module-level `tokenizer`.
    """
    source_columns = ds.column_names

    # Row-wise (non-batched) map across 4 worker processes.
    map_options = dict(
        fn_kwargs      = {"tokenizer": tokenizer, "task": "dpo"},
        num_proc       = 4,
        batched        = False,
        remove_columns = source_columns,
        desc           = "Formatting for DPO",
    )
    formatted = ds.map(apply_chat_template, **map_options)

    final_names = {
        "text_prompt":   "prompt",
        "text_chosen":   "chosen",
        "text_rejected": "rejected",
    }
    return formatted.rename_columns(final_names)


# Format both splits the same way: train first, then eval.
formatted_train, formatted_eval = (
    format_and_rename(split) for split in (train_dataset, eval_dataset)
)

In [None]:
# Show the first formatted training row, then the column names of each split.
print(formatted_train[0])
for label, split in (("Train columns:", formatted_train), ("Eval columns:", formatted_eval)):
    print(label, split.column_names)

In [None]:
# Print each field of the first formatted training example on its own block.
first_example = formatted_train[0]
print("Prompt: \n" + first_example['prompt'] + "\n")
print("Chosen: \n" + first_example['chosen'] + "\n")
print("Rejected: \n" + first_example['rejected'] + "\n")

# Model training

In [None]:
from unsloth import FastLanguageModel, PatchDPOTrainer, is_bfloat16_supported
from trl import DPOTrainer, DPOConfig
import torch
import wandb

# NOTE(review): called a second time here (also called in the first code cell);
# presumably it must precede DPOTrainer construction — confirm it is safe to repeat.
PatchDPOTrainer()

# Same model as in the beginning
# model_name is also used further down to derive the training output directory.
model_name = "Qwen/Qwen2.5-7B-Instruct"

# Load the model in 4-bit together with its tokenizer. This rebinds the
# module-level `tokenizer` that the data-prep cells used.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = 1024,
    dtype = None,
    load_in_4bit = True
)

# Wrap the loaded model with LoRA adapters; `model` is rebound to the wrapped
# model, so re-running this cell without reloading would wrap it again.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Adapters are attached to the attention projections (q/k/v/o) and the
    # MLP projections (gate/up/down).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    # lora_alpha is set to twice the rank chosen above (64 vs r = 32).
    lora_alpha = 64,
    lora_dropout = 0, # Currently only supports dropout = 0
    bias = "none",    # Currently only supports bias = "none"
    use_gradient_checkpointing = True, # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

# Training hyper-parameters for the DPO run.
config = DPOConfig(
    # 2 examples per device x 4 accumulation steps = 8 examples per optimizer step per device.
    per_device_train_batch_size   = 2,
    gradient_accumulation_steps   = 4,
    warmup_ratio                  = 0.1,
    num_train_epochs              = 3,
    learning_rate                 = 5e-6,
    # Exactly one of fp16 / bf16 is enabled, depending on is_bfloat16_supported().
    fp16                          = not is_bfloat16_supported(),
    bf16                          = is_bfloat16_supported(),
    logging_steps                 = 50,
    optim                         = "adamw_8bit",
    weight_decay                  = 0.0,
    lr_scheduler_type             = "linear",
    seed                          = 42,
    # For "Qwen/Qwen2.5-7B-Instruct", split('-')[1] is "7B", giving "outputs_qwen_7B".
    # A model name without at least one '-' would raise IndexError here.
    output_dir                    = f"outputs_qwen_{model_name.split('-')[1]}",
    report_to                     = "none", # Can use Weights & Biases
    # Evaluate and save once per epoch; checkpoints land under output_dir.
    eval_strategy                 = "epoch",
    save_strategy                 = "epoch",
)

# Build the DPO trainer on the formatted splits and run training.
# NOTE(review): `tokenizer`, `beta`, `max_length` and `max_prompt_length` are
# passed to the trainer rather than to DPOConfig — confirm the pinned trl
# version (with the Unsloth patch applied) still accepts them here.
trainer = DPOTrainer(
    model               = model,
    # No separate reference model is passed.
    # NOTE(review): presumably the trainer derives the reference from the base
    # weights of the PEFT model — verify.
    ref_model           = None,
    args                = config,
    train_dataset       = formatted_train,
    eval_dataset        = formatted_eval,
    tokenizer           = tokenizer,
    beta                = 0.1,
    # Same 1024 limit as max_seq_length used when loading the model.
    max_length          = 1024,
    max_prompt_length   = 512,
)

trainer.train()

# Download best checkpoint

In [None]:
import shutil
import os
from google.colab import files
from zipfile import ZipFile

def download_lora_checkpoint(checkpoint_folder):
    """
    Zip the LoRA adapter and tokenizer files of one checkpoint and hand the
    archive to `files.download` (from `google.colab`).

    - checkpoint_folder: path to a checkpoint directory; a trailing path
      separator is tolerated

    The archive is written next to the folder as "<checkpoint_folder>_dpo.zip"
    and stores the files under the checkpoint's own directory name. Files
    from the keep-list that are absent from the folder are skipped.

    Raises FileNotFoundError if the folder does not exist or holds none of
    the expected files, so a wrong path fails loudly instead of producing and
    downloading an empty archive.
    """
    # Normalise first: with a trailing "/" the archive name would become
    # ".../_dpo.zip" and os.path.basename() would return "".
    checkpoint_folder = os.path.normpath(checkpoint_folder)
    if not os.path.isdir(checkpoint_folder):
        raise FileNotFoundError(f"Checkpoint folder not found: {checkpoint_folder}")

    zip_filename = checkpoint_folder + "_dpo.zip"
    archive_root = os.path.basename(checkpoint_folder)

    # Files to include
    files_to_keep = [
        "adapter_model.safetensors", "adapter_config.json",
        "tokenizer.json", "tokenizer_config.json",
        "vocab.json", "merges.txt", "special_tokens_map.json",
        "added_tokens.json"
    ]

    # Resolve which files exist before opening the archive, so nothing is
    # written to disk when there is nothing to package.
    present = [
        file_name for file_name in files_to_keep
        if os.path.exists(os.path.join(checkpoint_folder, file_name))
    ]
    if not present:
        raise FileNotFoundError(
            f"No adapter or tokenizer files found in: {checkpoint_folder}"
        )

    # Create zip with only the essential files
    with ZipFile(zip_filename, 'w') as zipf:
        for file_name in present:
            zipf.write(
                os.path.join(checkpoint_folder, file_name),
                os.path.join(archive_root, file_name),
            )

    files.download(zip_filename)


# The path must point at a checkpoint folder written under the training
# output_dir ("outputs_qwen_7B" for the 7B model). The step number in
# "checkpoint-411" is hard-coded — adjust it to a checkpoint that exists for your run.
download_lora_checkpoint("/content/outputs_qwen_7B/checkpoint-411") # Select the best checkpoint