### Inspecting the reward model checkpoints 

Loading reward model checkpoints didn't fully work as the `merge_peft_adapter` script cannot load checkpoints in its original form. This explores how checkpoints can potentially be loaded

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from dataclasses import dataclass, field
from typing import Optional
import huggingface_hub
import functools as ft
import torch
import pandas as pd
import torch
import os
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline, AutoConfig, GPTNeoXForCausalLM, AutoModelForCausalLM
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from transformers import pipeline, TextGenerationPipeline, AutoConfig, AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoModelForSequenceClassification, GPTNeoXForCausalLM, LlamaForSequenceClassification
from redditqa.dataset import load_reddit_dataset
from transformers.utils.hub import convert_file_size_to_int, get_checkpoint_shard_files
from peft import (
    LoraConfig,
    PeftConfig,
    PeftModel,
    TaskType,
    get_peft_model,
    set_peft_model_state_dict,
)
from peft.utils import _get_submodules
import peft
import torch

  from .autonotebook import tqdm as notebook_tqdm
2023-08-02 14:30:25.930002: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


#### Load and inspect the checkpoint

First, let's inspect and the checkpoint

In [3]:
model_checkpoint = '/scratch1/jhoff/checkpoints/reward_llama-2-7b-chat-hf/checkpoint-3000'

In [4]:
print('\n'.join(os.listdir(model_checkpoint)))

optimizer.pt
adapter_model.bin
scheduler.pt
training_args.bin
adapter_config.json
README.md
trainer_state.json
rng_state.pth


Now, let's inspect the adapter weights

In [5]:
adapter_weights = torch.load(f'{model_checkpoint}/adapter_model.bin', map_location='cpu')
print('\n'.join(list(adapter_weights.keys())))

base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight
base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight
base_model.model.model.layers.0.self_attn.v_proj.lora_A.weight
base_model.model.model.layers.0.self_attn.v_proj.lora_B.weight
base_model.model.model.layers.1.self_attn.q_proj.lora_A.weight
base_model.model.model.layers.1.self_attn.q_proj.lora_B.weight
base_model.model.model.layers.1.self_attn.v_proj.lora_A.weight
base_model.model.model.layers.1.self_attn.v_proj.lora_B.weight
base_model.model.model.layers.2.self_attn.q_proj.lora_A.weight
base_model.model.model.layers.2.self_attn.q_proj.lora_B.weight
base_model.model.model.layers.2.self_attn.v_proj.lora_A.weight
base_model.model.model.layers.2.self_attn.v_proj.lora_B.weight
base_model.model.model.layers.3.self_attn.q_proj.lora_A.weight
base_model.model.model.layers.3.self_attn.q_proj.lora_B.weight
base_model.model.model.layers.3.self_attn.v_proj.lora_A.weight
base_model.model.model.layers.3.self_attn.v_proj.lora_B

The adapter config. (When creating an adapter to load the weights, we should use the exact same config)

In [6]:
print(open(os.path.join(model_checkpoint, 'adapter_config.json')).read())

{
  "auto_mapping": null,
  "base_model_name_or_path": "meta-llama/Llama-2-7b-chat-hf",
  "bias": "none",
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layers_pattern": null,
  "layers_to_transform": null,
  "lora_alpha": 32,
  "lora_dropout": 0.1,
  "modules_to_save": null,
  "peft_type": "LORA",
  "r": 8,
  "revision": null,
  "target_modules": [
    "q_proj",
    "v_proj"
  ],
  "task_type": "SEQ_CLS"
}


We have the model weights and weights for the scoring head. Inspect the scoring head

In [7]:
adapter_weights['base_model.model.score.weight'].shape, adapter_weights['base_model.model.score.weight'][0][0]

(torch.Size([1, 4096]), tensor(0.0381, dtype=torch.bfloat16))

It seems like the scoring head is not an adapter but an actual weight

Now, let's load the actual LLAMA 2 model that is the base for our adapter

In [8]:
base_model_name = "meta-llama/Llama-2-7b-chat-hf"

# Use initializer_range = 0 to make sure initialized weights are 0
model = AutoModelForSequenceClassification.from_pretrained(base_model_name, num_labels=1, initializer_range=0)

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.30it/s]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNor

In [10]:
model.score.weight

Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)

In [11]:
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
model_peft = get_peft_model(model, peft_config)

In [12]:
model_peft

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
            

In [13]:
model_peft.score.modules_to_save.default.weight

Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.]], requires_grad=True)

Now, we bring in the adapter weights from the checkpoint

In [14]:
set_peft_model_state_dict(model_peft, adapter_weights)

In [15]:
model_peft.score.modules_to_save.default.weight[0][0]

tensor(0.0381, grad_fn=<SelectBackward0>)

And we see that the score weight is correct as it matches the value from above! ✅

#### Merge the peft model into a normal model 

In [16]:
model_out = model_peft.merge_and_unload()

In [17]:
model_out.score.weight.data[0][0]

tensor(0.0381)

In [18]:
output_name = "/scratch1/jhoff/tmp/reward_model"
model_out.save_pretrained(output_name)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.save_pretrained(output_name)

('/scratch1/jhoff/tmp/reward_model/tokenizer_config.json',
 '/scratch1/jhoff/tmp/reward_model/special_tokens_map.json',
 '/scratch1/jhoff/tmp/reward_model/tokenizer.model',
 '/scratch1/jhoff/tmp/reward_model/added_tokens.json',
 '/scratch1/jhoff/tmp/reward_model/tokenizer.json')

This looks about right! ✅

#### Evaluate the reward model

In [19]:
from redditqa.dataset import load_reddit_dataset

eval_dataset = load_reddit_dataset("eval", pairs=True)
eval_dataset = eval_dataset.shuffle(seed=42).select(range(1000))

tokenizer = AutoTokenizer.from_pretrained(base_model_name)

reward_model = AutoModelForSequenceClassification.from_pretrained(
    output_name, 
    num_labels=1, 
    torch_dtype=torch.bfloat16
)

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-cba55e4212677d14.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-d8898fc7c787d1eb.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-e35089f0b695ca2b.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-ea21b592f4358562.arrow
Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.81s/it]


In [20]:
reward_pipe = pipeline(
    "sentiment-analysis",
    model=reward_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device=0,
)

In [21]:
template = "<|ELIF|> Question: %question\nAnswer: %answer"

def apply_reward_model(row):

    question_title = row["question_title"]
    response_j = row["response_j"]
    response_k = row["response_k"]

    qa_j = template.replace("%question", question_title).replace("%answer", response_j)
    reward_j = reward_pipe(qa_j)[0]["score"]

    qa_k = template.replace("%question", question_title).replace("%answer", response_k)
    reward_k = reward_pipe(qa_k)[0]["score"]

    return {
        'reward_j': float(reward_j),
        'reward_k': float(reward_k),
    }

eval_dataset = eval_dataset.map(apply_reward_model)

eval_dataset[0]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

                                                              

{'answer_link_id': '2y0dxt',
 'question_title': 'Why do employers ask "where do you see yourself in 5-10 years?" How do personal goals matter at all?',
 'response_j': 'After you respond, be sure to ask, "How do you see the company changing over that timespan?"',
 'response_k': "Whatever job you're applying for, think of the logical career path and where you should be in fifteen years. Like all basic interview questions it's more about whether you can have an adult conversation than the actual answers. ",
 'score_j': 13,
 'score_k': 2,
 'reward_j': 0.8872045874595642,
 'reward_k': 0.8568122982978821}

Let's measure the accuracy of the reward model

In [23]:
correct = 0
for row in eval_dataset:
    if row['reward_j'] >= row['reward_k']:
        correct += 1

print(f'Accuracy: {correct / len(eval_dataset)}')

Accuracy: 0.644


That looks about right!