# Evaluate Base vs. CPO‐Trained Model with ROUGE

This notebook loads the saved test set, generates outputs from both models, and computes ROUGE-1, ROUGE-2, and ROUGE-L.

---

## 1. Install & Import Dependencies

```python
# if you haven't already installed these
!pip install transformers datasets evaluate accelerate

In [1]:
import os
import torch
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer
import evaluate
from tqdm.auto import tqdm

2025-05-09 18:21:11.015626: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-09 18:21:11.028732: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746814871.046177    5315 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746814871.051527    5315 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-05-09 18:21:11.068590: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [2]:
torch.cuda.empty_cache()

#### Configuration & Paths

In [3]:
# adjust as needed
BASE_MODEL_NAME   = "Qwen/Qwen2.5-0.5B"
CPO_MODEL_PATH    = "./rlhf_cpo_ckpts/checkpoint-17658"      # where you saved the fine-tuned model
TEST_DS_CSV       = "./test_cpo_ds.csv"     # or .pkl if you prefer

# Generation hyperparameters
MAX_NEW_TOKENS    = 50
TEMPERATURE       = 1.0
TOP_P             = 0.9

#### Load the Test Set

In [4]:
# assumes you saved via df_test.to_csv(...)
df = pd.read_csv(TEST_DS_CSV)

# our “reference” is the human response, strip leading newline if present
df["reference"] = df["chosen"].str.lstrip("\n").str.strip()
df = df[["prompt", "reference"]].dropna().reset_index(drop=True)
print(f"Loaded {len(df)} examples")

Loaded 630 examples


#### Load Models & Tokenizers

In [5]:
torch.cuda.empty_cache()

In [6]:
# Base model
base_tok = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, padding_side="left")
if base_tok.pad_token is None:
    base_tok.add_special_tokens({"pad_token": "[PAD]"})
base_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_NAME)
base_model.eval()

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [7]:
# CPO‐trained model
cpo_tok = AutoTokenizer.from_pretrained(CPO_MODEL_PATH, padding_side="left")
if cpo_tok.pad_token is None:
    cpo_tok.add_special_tokens({"pad_token": "[PAD]"})
cpo_model = AutoModelForCausalLM.from_pretrained(CPO_MODEL_PATH)
cpo_model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): lora.Linear(
            (base_layer): Linear(in_features=896, out_features=896, bias=True)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=896, out_features=4, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=4, out_features=896, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=

#### Define Generation Function

In [8]:
def generate_responses(model, tokenizer, prompts):
    """Batch-generate continuations for a list of prompts."""
    responses = []
    for prompt in tqdm(prompts, desc="Generating"):
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)
        outs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id
        )
        # skip the prompt tokens
        text = tokenizer.decode(outs[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
        responses.append(text.strip())
    return responses

#### Generate for Both Models one by one

In [9]:
prompts = df["prompt"].tolist()

# Base
base_preds = generate_responses(base_model, base_tok, prompts)

Generating:   0%|          | 0/630 [00:00<?, ?it/s]



In [10]:
torch.cuda.empty_cache()

In [11]:
cpo_preds  = generate_responses(cpo_model,  cpo_tok,  prompts)

Generating:   0%|          | 0/630 [00:00<?, ?it/s]

### Compute Rouge

In [14]:
%pip uninstall evaluate -f

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



Usage:   
  /opt/conda/bin/python -m pip uninstall [options] <package> ...
  /opt/conda/bin/python -m pip uninstall [options] -r <requirements file> ...

no such option: -f
Note: you may need to restart the kernel to use updated packages.


In [17]:
%pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24987 sha256=56eb9478a25fa03f7d0c2dd7941d8b2d677c6e4386b9bf2a66fe6920ba8e9943
  Stored in directory: /home/sagemaker-user/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Note: you may need to restart the kernel to use updated packages.


In [18]:
from rouge_score import rouge_scorer
from statistics import mean

# Initialize the scorer
scorer = rouge_scorer.RougeScorer(
    ["rouge1", "rouge2", "rougeL"],
    use_stemmer=True
)

def compute_rouge_scores(predictions, references):
    """
    Compute average precision, recall, and F1 for each ROUGE type.
    Returns a dict like {'rouge1_fmeasure': 0.25, ...}
    """
    # score each example
    all_scores = [scorer.score(ref, pred) 
                  for ref, pred in zip(references, predictions)]
    
    # aggregate
    results = {}
    for metric in ["rouge1", "rouge2", "rougeL"]:
        prec = mean(score[metric].precision for score in all_scores)
        rec  = mean(score[metric].recall    for score in all_scores)
        f1   = mean(score[metric].fmeasure  for score in all_scores)
        results[f"{metric}_precision"] = prec
        results[f"{metric}_recall"]    = rec
        results[f"{metric}_fmeasure"]  = f1
    return results

# Compute for base model
base_scores = compute_rouge_scores(base_preds, df["reference"].tolist())
# Compute for CPO‐trained model
cpo_scores  = compute_rouge_scores(cpo_preds,  df["reference"].tolist())

# Display
print("Base Model ROUGE:")
for k, v in base_scores.items():
    print(f"  {k}: {v:.4f}")

print("\nCPO-Trained Model ROUGE:")
for k, v in cpo_scores.items():
    print(f"  {k}: {v:.4f}")

Base Model ROUGE:
  rouge1_precision: 0.0092
  rouge1_recall: 0.0615
  rouge1_fmeasure: 0.0152
  rouge2_precision: 0.0012
  rouge2_recall: 0.0088
  rouge2_fmeasure: 0.0021
  rougeL_precision: 0.0091
  rougeL_recall: 0.0612
  rougeL_fmeasure: 0.0151

CPO-Trained Model ROUGE:
  rouge1_precision: 0.0161
  rouge1_recall: 0.0587
  rouge1_fmeasure: 0.0202
  rouge2_precision: 0.0010
  rouge2_recall: 0.0071
  rouge2_fmeasure: 0.0015
  rougeL_precision: 0.0161
  rougeL_recall: 0.0583
  rougeL_fmeasure: 0.0201
