In [1]:
#%pip install peft
#%pip install trl
#%pip install datasets -U

In [2]:
import os
import random
import torch
from pathlib import Path
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from trl import CPOConfig, CPOTrainer

2025-05-09 15:26:10.102998: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-09 15:26:10.116290: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746804370.133763    1737 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746804370.139150    1737 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-09 15:26:10.156017: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [3]:
# --- Run configuration ------------------------------------------------------
# JSONL input file read by load_jsonl_pydantic (one entry per line).
DATA_PATH = Path("./personal_chat_sessions_train_hellaswag.jsonl")
# Minimum number of whitespace-separated words a context needs to be kept.
MIN_WORDS = 3
# Model identifier of the policy that gets fine-tuned.
BASE_MODEL_NAME = "Qwen/Qwen2.5-0.5B"
# Checkpoint directory the reward tokenizer and reward model are loaded from.
REWARD_MODEL_PATH = "./reward_model_ckpts_fix/checkpoint-7506"
# Output directory handed to the CPO trainer.
OUTPUT_MODEL_PATH = "./rlhf_cpo_ckpts"

In [4]:
def load_jsonl_pydantic(path: Path):
    """Lazily parse a JSONL file into ``HellaSwagEntry`` objects.

    Args:
        path: Location of a UTF-8 encoded file with one JSON document per line.

    Yields:
        One object per non-blank line, in file order, as returned by
        ``HellaSwagEntry.model_validate_json``.

    Raises:
        Whatever ``HellaSwagEntry.model_validate_json`` raises for a line that
        is not blank but fails validation.
    """
    # Project-local schema, imported here so it is only required when data is read.
    from shared_models import HellaSwagEntry

    with path.open("r", encoding="utf-8") as f:
        for line in f:
            # Whitespace-only lines (e.g. stray empty lines at the end of the
            # file) are not JSON documents; skip them instead of aborting.
            if not line.strip():
                continue
            yield HellaSwagEntry.model_validate_json(line)

In [5]:
# Collect one {"context", "human_resp"} record per entry whose context has at
# least MIN_WORDS whitespace-separated words.
data_pairs = []
for entry in load_jsonl_pydantic(DATA_PATH):
    candidates = (entry.ending0, entry.ending1, entry.ending2, entry.ending3, entry.ending4)
    # The entry's label indexes the ending used as the human response.
    human_resp = candidates[entry.label].strip()
    if len(entry.context.split()) < MIN_WORDS:
        continue
    data_pairs.append(dict(context=entry.context.strip(), human_resp=human_resp))

# 90/10 train/test split with a fixed seed.
raw_dataset = Dataset.from_list(data_pairs)
train_test = raw_dataset.train_test_split(test_size=0.1, seed=42)
train_raw = train_test["train"]
test_raw = train_test["test"]

In [6]:
def create_pref_pairs(dataset: Dataset, seed: int = 42) -> Dataset:
    """Build (prompt, chosen, rejected) triples by random negative sampling.

    For every row the row's own response becomes ``chosen`` and a different
    response, drawn uniformly from all responses in ``dataset``, becomes
    ``rejected``.

    Args:
        dataset: Iterable of rows with string fields ``'context'`` and
            ``'human_resp'``.
        seed: Seed of the sampler; identical rows and seed give identical pairs.

    Returns:
        A ``Dataset`` with the columns ``'prompt'`` (context without trailing
        whitespace), ``'chosen'`` and ``'rejected'`` (each a newline followed
        by the response without leading whitespace).

    Raises:
        ValueError: If ``dataset`` is non-empty but contains fewer than two
            distinct responses, so no differing negative exists.
    """
    # A private generator produces the same draws as random.seed(seed) followed
    # by random.choice, but leaves the global RNG state untouched.
    rng = random.Random(seed)

    rows = list(dataset)
    all_resps = [row['human_resp'] for row in rows]

    # Without at least two distinct responses the rejection loop below could
    # never find a negative and would spin forever.
    if all_resps and len(set(all_resps)) < 2:
        raise ValueError(
            "create_pref_pairs needs at least two distinct 'human_resp' values "
            "to sample a rejected response"
        )

    samples = []
    for row in rows:
        neg = rng.choice(all_resps)
        while neg == row['human_resp']:
            neg = rng.choice(all_resps)
        samples.append({
            'prompt': row['context'].rstrip(),
            'chosen': '\n' + row['human_resp'].lstrip(),
            'rejected': '\n' + neg.lstrip()
        })
    return Dataset.from_list(samples)

# Build preference pairs per split; each split samples negatives from itself.
train_cpo_ds = create_pref_pairs(dataset=train_raw)
test_cpo_ds = create_pref_pairs(dataset=test_raw)

In [7]:
# Tokenizer that belongs to the reward checkpoint; batches are padded on the left.
reward_tokenizer = AutoTokenizer.from_pretrained(REWARD_MODEL_PATH, padding_side="left")

if reward_tokenizer.pad_token is None:
    reward_tokenizer.add_special_tokens({"pad_token": "[PAD]"})
# `model_max_length` is the limit the tokenizer applies when truncation is
# requested; a plain `max_length` attribute is never read by the tokenizer.
reward_tokenizer.model_max_length = 128
# Guarantees the attribute exists (None when the tokenizer defines no template).
reward_tokenizer.chat_template = getattr(reward_tokenizer, "chat_template", None)

# Single-logit sequence classifier used as the reward model.
reward_model = AutoModelForSequenceClassification.from_pretrained(
    REWARD_MODEL_PATH,
    num_labels=1
)
reward_model.config.pad_token_id = reward_tokenizer.pad_token_id

# NOTE(review): the load log recorded with this cell reports `score.weight` as
# newly initialized, i.e. the scoring head may not have been restored from the
# checkpoint — confirm the checkpoint contains the trained head before trusting
# the scores.
reward_model.eval()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Some weights of Qwen2ForSequenceClassification were not initialized from the model checkpoint at Qwen/Qwen2.5-0.5B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Qwen2ForSequenceClassification(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=896, out_features=896, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=896, out_features=4, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=4, out_features=896, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): lora.Linear(
            (base_layer): Linear(in_features=896, out_features=12

In [8]:
def _score_texts(texts):
    """Return one reward logit per text as a 1-D CPU tensor."""
    encoded = reward_tokenizer(
        texts,
        padding=True,
        truncation=True,  # bounded by the tokenizer's model_max_length
        return_tensors='pt'
    )
    # Run on whatever device the reward model lives on (CPU or GPU).
    device = next(reward_model.parameters()).device
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = reward_model(**encoded).logits
    return logits.squeeze(-1).cpu()


def compute_scores(batch):
    """Score the chosen and rejected completions of a batch with the reward model.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with the keys ``"chosen"`` and ``"rejected"``, each a
            list of strings of equal length.

    Returns:
        Dict with ``"score_chosen"`` and ``"score_rejected"``: lists of Python
        floats (raw reward-model logits), one per example.

    NOTE(review): only the completion text is scored, not prompt + completion —
    confirm this matches how the reward model was trained.
    """
    # Chosen and rejected are tokenized and scored as two separate batches.
    score_chosen = _score_texts(batch["chosen"])
    score_rejected = _score_texts(batch["rejected"])

    return {
        "score_chosen": score_chosen.tolist(),
        "score_rejected": score_rejected.tolist()
    }

In [9]:
def filter_problematic_examples(dataset, tokenizer, max_length: int = None, max_len_diff: int = 1):
    """
    Keep only examples whose chosen and rejected completions tokenize to
    nearly the same number of tokens.

    Both completions are tokenized with truncation to ``max_length`` and the
    example is kept when the absolute difference of the two token counts is at
    most ``max_len_diff``. The comparison happens after truncation, so pairs of
    clearly different length are removed whether or not truncation occurred.

    Args:
        dataset: Object with a ``filter(fn)`` method whose rows expose
            ``"chosen"`` and ``"rejected"`` strings.
        tokenizer: Callable tokenizer returning an object with ``input_ids``.
        max_length: Truncation limit in tokens; defaults to
            ``tokenizer.model_max_length``.
        max_len_diff: Largest tolerated token-count difference (default 1).

    Returns:
        The result of ``dataset.filter`` containing the retained examples.
    """
    limit = tokenizer.model_max_length if max_length is None else max_length

    def is_valid(example):
        n_chosen = len(tokenizer(example["chosen"], truncation=True, max_length=limit).input_ids)
        n_rejected = len(tokenizer(example["rejected"], truncation=True, max_length=limit).input_ids)
        return abs(n_chosen - n_rejected) <= max_len_diff

    return dataset.filter(is_valid)

In [10]:
# Score both splits with the reward model in batches of 8; this adds the
# `score_chosen` and `score_rejected` columns.
train_cpo_ds_bulk = train_cpo_ds.map(
    compute_scores,
    batch_size=8,
    batched=True,
)
test_cpo_ds_bulk = test_cpo_ds.map(
    compute_scores,
    batch_size=8,
    batched=True,
)

Map:   0%|          | 0/19955 [00:00<?, ? examples/s]

Map:   0%|          | 0/2218 [00:00<?, ? examples/s]

In [11]:
# Tokenizer of the policy model; used below to measure completion lengths.
policy_tok = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, padding_side="left")
if policy_tok.pad_token is None:
    policy_tok.add_special_tokens({"pad_token": "[PAD]"})
# `model_max_length` is the attribute read by filter_problematic_examples and
# by the tokenizer's own truncation; a plain `max_length` attribute is ignored.
policy_tok.model_max_length = 128
# Guarantees the attribute exists (None when the tokenizer defines no template).
policy_tok.chat_template = getattr(policy_tok, "chat_template", None)

In [12]:
# Drop pairs whose chosen/rejected token counts differ by more than one token.
# Note: this rebinds train_cpo_ds / test_cpo_ds to the scored, filtered splits.
train_cpo_ds = filter_problematic_examples(dataset=train_cpo_ds_bulk, tokenizer=policy_tok)
test_cpo_ds = filter_problematic_examples(dataset=test_cpo_ds_bulk, tokenizer=policy_tok)

Filter:   0%|          | 0/19955 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2218 [00:00<?, ? examples/s]

In [13]:
# Preview the first three scored training pairs as a DataFrame.
# NOTE(review): the rows come from the personal chat file in DATA_PATH — clear
# or redact this cell's output before sharing the notebook.
train_cpo_ds.take(3).to_pandas()

Unnamed: 0,prompt,chosen,rejected,score_chosen,score_rejected
0,Sparsha: How are you feeling\nSparsha: I love ...,\nI love u,\nBrain fried,-3.356873,-2.452819
1,Sparsha: null\nAadharsh Kannan: Love you\nSpar...,\nI miss u already,\nAnd trying to finish assignment,-3.161788,-1.905694
2,Aadharsh Kannan: I love ❤️ you\nSparsha: null\...,\nYour shrink is here,\nSpoke to them,-4.044352,-0.517105


In [17]:
class CPOTrainerWithLoRA:
    """
    Convenience wrapper that fine-tunes a causal LM with TRL's CPOTrainer and
    a LoRA (PEFT) adapter.

    Construction loads the tokenizer and base model, builds the LoRA and CPO
    configurations and creates the underlying ``CPOTrainer``. After
    construction ``policy`` is the model instance owned by the trainer, so
    ``save`` always writes the weights that were actually trained.

    Attributes:
        model_name: Identifier/path the tokenizer and model were loaded from.
        train_dataset / eval_dataset: Datasets handed to the trainer.
        output_dir: Default directory for checkpoints and ``save``.
        seed: Seed passed to ``CPOConfig``.
        tokenizer: Left-padding tokenizer used as the processing class.
        policy: Model being trained (the trainer's model).
        peft_config: ``LoraConfig`` applied to the policy.
        cpo_config: ``CPOConfig`` used by the trainer.
        trainer: The wrapped ``CPOTrainer``.
    """
    def __init__(
        self,
        model_name: str,
        train_dataset: Dataset,
        eval_dataset: Dataset,
        output_dir: str,
        lora_r: int = 4,
        lora_alpha: int = 32,
        lora_dropout: float = 0.05,
        target_modules=None,
        per_device_train_batch_size: int = 1,
        per_device_eval_batch_size: int = 4,
        num_train_epochs: int = 3,
        seed: int = 42,
        report_to: str = "none",
        max_length: int = None,
        max_prompt_length: int = None
    ):
        """
        Args:
            model_name: Hub identifier or local path of the base causal LM.
            train_dataset: Preference dataset used for training.
            eval_dataset: Preference dataset used for evaluation.
            output_dir: Directory for trainer checkpoints; default for ``save``.
            lora_r: LoRA rank.
            lora_alpha: LoRA scaling factor.
            lora_dropout: Dropout applied inside the LoRA layers.
            target_modules: Module names to adapt; ``["q_proj"]`` when None.
            per_device_train_batch_size: Training batch size per device.
            per_device_eval_batch_size: Evaluation batch size per device.
            num_train_epochs: Number of training epochs.
            seed: Seed passed to ``CPOConfig``.
            report_to: Logging integration name passed to ``CPOConfig``.
            max_length: Optional total sequence limit forwarded to
                ``CPOConfig``; the library default is used when None.
            max_prompt_length: Optional prompt limit forwarded to
                ``CPOConfig``; the library default is used when None.
        """
        self.model_name     = model_name
        self.train_dataset  = train_dataset
        self.eval_dataset   = eval_dataset
        self.output_dir     = output_dir
        self.seed           = seed

        # Load tokenizer. Sequence limits are controlled through CPOConfig
        # (see max_length / max_prompt_length); assigning a `max_length`
        # attribute on the tokenizer has no effect, so none is set here.
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left")
        pad_token_added = self.tokenizer.pad_token is None
        if pad_token_added:
            self.tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        # Guarantees the attribute exists (None when no template is defined).
        self.tokenizer.chat_template = getattr(self.tokenizer, "chat_template", None)

        # Load policy model
        self.policy = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        if pad_token_added:
            # A freshly added pad token may lie outside the embedding table;
            # grow the table only when that is actually the case.
            vocab_size = len(self.tokenizer)
            if vocab_size > self.policy.get_input_embeddings().num_embeddings:
                self.policy.resize_token_embeddings(vocab_size)
        self.policy.config.pad_token_id = self.tokenizer.pad_token_id

        # Configure LoRA via PEFT
        self.peft_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            target_modules=target_modules or ["q_proj"],
            bias="none",
            task_type="CAUSAL_LM"
        )

        # Only override the library's length defaults when asked to.
        length_limits = {}
        if max_length is not None:
            length_limits["max_length"] = max_length
        if max_prompt_length is not None:
            length_limits["max_prompt_length"] = max_prompt_length

        # Set up CPOConfig
        self.cpo_config = CPOConfig(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            per_device_eval_batch_size=per_device_eval_batch_size,
            num_train_epochs=num_train_epochs,
            remove_unused_columns=False,
            seed=seed,
            report_to=report_to,
            **length_limits
        )

        # Initialize TRL CPO trainer. The adapter is applied exactly once, by
        # the trainer via `peft_config`; wrapping the model beforehand as well
        # would configure LoRA twice on the same model.
        self.trainer = CPOTrainer(
            model=self.policy,
            args=self.cpo_config,
            processing_class=self.tokenizer,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            peft_config=self.peft_config
        )
        # Keep a handle on the adapter-wrapped model the trainer optimizes.
        self.policy = self.trainer.model

    def train(self):
        """Run the CPO training loop of the wrapped trainer."""
        self.trainer.train()

    def save(self, save_directory: str = None):
        """Save the fine-tuned policy model and tokenizer.

        Args:
            save_directory: Target directory; falls back to ``output_dir``
                when None or empty. The directory is created if missing.
        """
        target_dir = save_directory or self.output_dir
        os.makedirs(target_dir, exist_ok=True)
        self.policy.save_pretrained(target_dir)
        self.tokenizer.save_pretrained(target_dir)

In [19]:
# Free cached GPU memory held by PyTorch before the trainer is built.
torch.cuda.empty_cache()

In [20]:
# Build the LoRA + CPO trainer on the scored, filtered splits; every other
# hyperparameter keeps the class default.
trainer = CPOTrainerWithLoRA(
    output_dir=OUTPUT_MODEL_PATH,
    model_name=BASE_MODEL_NAME,
    eval_dataset=test_cpo_ds,
    train_dataset=train_cpo_ds,
)



Map:   0%|          | 0/5886 [00:00<?, ? examples/s]

Map:   0%|          | 0/5886 [00:00<?, ? examples/s]

Map:   0%|          | 0/630 [00:00<?, ? examples/s]

Map:   0%|          | 0/630 [00:00<?, ? examples/s]

Map:   0%|          | 0/5886 [00:00<?, ? examples/s]

Map:   0%|          | 0/630 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [21]:
# Free cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [22]:
# Run CPO fine-tuning; delegates to the wrapped TRL trainer's train().
trainer.train()

Step,Training Loss
500,6.5304
1000,6.4997
1500,6.1784
2000,6.0552
2500,6.0772
3000,5.6486
3500,5.514
4000,5.3242
4500,5.1528
5000,5.0852


In [23]:
# Write the policy model and tokenizer to the trainer's output_dir (OUTPUT_MODEL_PATH).
trainer.save()