In [1]:
!pip install -q "transformers>=4.35.0" accelerate bitsandbytes peft trl datasets sentencepiece torch torchvision torchaudio

In [3]:
import torch, os
from transformers import BitsAndBytesConfig

BASE_MODEL = "mistralai/Mistral-7B-v0.3"
SFT_ADAPTER_DIR = "coqa_chatbot_lora"

REWARD_MODEL_DIR = "reward_model"
PPO_OUTPUT_DIR = "ppo_output"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cpu


In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from pathlib import Path

def load_sft_model(adapter_dir=SFT_ADAPTER_DIR, base_model=BASE_MODEL):
    """
    Loads the 4-bit base model and applies the LoRA adapters saved from SFT.
    Mirrors exactly what your SFT notebook did.
    """
    print(f"Loading base model {base_model} in 4-bit and applying adapters from {adapter_dir}")
    tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False, trust_remote_code=True)
    base = AutoModelForCausalLM.from_pretrained(
        base_model,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        use_cache=False,
    )
    model = PeftModel.from_pretrained(base, adapter_dir, device_map="auto", torch_dtype=torch.float16)
    model.to(device)
    model.eval()
    return tokenizer, model

# Load SFT model check
tok, _ = load_sft_model()
print("✅ Loaded SFT model + tokenizer")



Loading base model mistralai/Mistral-7B-v0.3 in 4-bit and applying adapters from coqa_chatbot_lora


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

✅ Loaded SFT model + tokenizer


In [5]:
from datasets import load_dataset
import random, torch, os
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset

ds = load_dataset("Anthropic/hh-rlhf", data_dir="helpful-base")

SAMPLE_SIZE = 1000
if len(ds["train"]) > SAMPLE_SIZE:
    ds_small = ds["train"].shuffle(seed=42).select(range(SAMPLE_SIZE))
else:
    ds_small = ds["train"]

print(f"Using {len(ds_small)} examples for reward-model training")

class PairwisePreferenceDataset(Dataset):
    def __init__(self, hf_dataset, tokenizer, max_length=256):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        row = self.dataset[idx]
        prompt = row.get("prompt") or row.get("input", "")
        chosen = row["chosen"]
        rejected = row["rejected"]
        text_chosen = (prompt + "\n" + chosen).strip()
        text_rejected = (prompt + "\n" + rejected).strip()
        enc_ch = self.tokenizer(text_chosen, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        enc_re = self.tokenizer(text_rejected, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        return {
            "input_ids_chosen": enc_ch["input_ids"].squeeze(0),
            "attention_mask_chosen": enc_ch["attention_mask"].squeeze(0),
            "input_ids_rejected": enc_re["input_ids"].squeeze(0),
            "attention_mask_rejected": enc_re["attention_mask"].squeeze(0),
        }

REWARD_MODEL_DIR = "reward_model"
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=1).to(device)

dataset = PairwisePreferenceDataset(ds_small, tokenizer, max_length=256)
loader = DataLoader(dataset, batch_size=8, shuffle=True)

optim = torch.optim.AdamW(model.parameters(), lr=1e-5)

EPOCHS = 3
for epoch in range(EPOCHS):
    total_loss = 0.0
    for batch in loader:
        input_ids_ch = batch["input_ids_chosen"].to(device)
        attn_ch = batch["attention_mask_chosen"].to(device)
        input_ids_re = batch["input_ids_rejected"].to(device)
        attn_re = batch["attention_mask_rejected"].to(device)

        out_ch = model(input_ids=input_ids_ch, attention_mask=attn_ch).logits.view(-1)
        out_re = model(input_ids=input_ids_re, attention_mask=attn_re).logits.view(-1)

        logits = torch.stack([out_ch, out_re], dim=1)
        log_probs = torch.log_softmax(logits, dim=1)
        # label=0 means chosen is “better” (index=0) by convention in pairwise preference
        labels = torch.zeros(len(out_ch), dtype=torch.long, device=device)

        loss = - log_probs[torch.arange(len(labels)), labels].mean()
        optim.zero_grad()
        loss.backward()
        optim.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(loader)
    print(f"Epoch {epoch+1}/{EPOCHS} — avg_loss={avg_loss:.4f}")

os.makedirs(REWARD_MODEL_DIR, exist_ok=True)
tokenizer.save_pretrained(REWARD_MODEL_DIR)
model.save_pretrained(REWARD_MODEL_DIR)
print("✅ Reward model saved to", REWARD_MODEL_DIR)

README.md: 0.00B [00:00, ?B/s]

helpful-base/train.jsonl.gz:   0%|          | 0.00/16.2M [00:00<?, ?B/s]

helpful-base/test.jsonl.gz:   0%|          | 0.00/875k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Using 1000 examples for reward-model training


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 — avg_loss=0.6702
Epoch 2/3 — avg_loss=0.5968
Epoch 3/3 — avg_loss=0.4289
✅ Reward model saved to reward_model
