In [1]:
from trl import PPOConfig, PPOTrainer
import utils
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoModel,
    Trainer,
    TrainingArguments,
    BertModel,
    pipeline,
    AutoModelForSequenceClassification,
)
import yaml
import getpass
import wandb
from typing import Dict, Any
import torch as t
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
from tqdm import tqdm
import trl
import torch.nn.functional as F

import datasets
import random
import os
import time

In [2]:
!nvidia-smi

Sat May 11 05:03:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.161.08             Driver Version: 535.161.08   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:CA:00.0 Off |                    0 |
| N/A   44C    P0              64W / 400W |      0MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [9]:
test_counter = 0
device = "cuda"

print(f"test{test_counter}: {t.cuda.memory_allocated()}")
test_counter += 1

# reward_model_path = "./drive/root/project_data/calibrated_alignment/runs/instruct/training/reward_model/run_3/checkpoints/checkpoint-4000"
reward_model_path = "./data/instruct/training/reward_model/run_63/checkpoints/checkpoint-2000"

# reward_model = AutoModelForSequenceClassification.from_pretrained(reward_model_path, torch_dtype=t.bfloat16, num_labels=1).eval()
# reward_model = reward_model.to(device)


# reward_model.config.pad_token_id = reward_model.config.eos_token_id

# tokenizer = AutoTokenizer.from_pretrained(reward_model_path, padding_side='left')
# tokenizer.pad_token = tokenizer.eos_token

# reward_model.config.pad_token_id = tokenizer.eos_token_id

tokenizer, reward_model = utils.load_model(
    reward_model_path,
    reward_model=True,
    eval=True,
    PPO=False
)

reward_model.config.pad_token_id = tokenizer.pad_token_id
dataset_info = {
    "name": "Anthropic/hh-rlhf",
    "data_dir": "default" 
}

print(f"test{test_counter}: {t.cuda.memory_allocated()}")
test_counter += 1

test0: 35544644096


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-Instruct-v0.2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


test1: 50316490752


In [7]:
def prep_for_reward_trainer(sample):
    # print(sample)
    chosen = [p + c for p, c in zip(sample["prompt"], sample["chosen"])]
    chosen_inputs = tokenizer(
        chosen,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=1536,
    )

    rejected = [p + r for p, r in zip(sample["prompt"], sample["rejected"])]
    rejected_inputs = tokenizer(
        rejected,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=1536,
    )
    return {
        "input_ids_chosen": chosen_inputs["input_ids"],
        "attention_mask_chosen": chosen_inputs["attention_mask"],
        "input_ids_rejected": rejected_inputs["input_ids"],
        "attention_mask_rejected": rejected_inputs["attention_mask"],
    }

In [10]:
M = 20
N = 50

score = 0
total = 0
sum_score = 0

dataset = utils.load_dataset(tokenizer, dataset_info['name'], dataset_info['data_dir'], debug=False)

for i in tqdm(range(M)):
    
    random.seed(os.urandom(10000))
    indices = random.sample(range(len(dataset["train"])), N)
    dataset_copy = dataset["train"].select(indices)
    # dataset["test"] = dataset["test"].select(range(N))
    
    dataset_copy = dataset_copy.map(prep_for_reward_trainer, batched=True)
    with t.no_grad():
        sample = dataset_copy
        # chosen, rejected = sample['chosen'][i], sample['rejected'][i]
        input_ids_chosen = t.tensor(sample['input_ids_chosen']).to(device)
        attention_mask_chosen = t.tensor(sample['attention_mask_chosen']).to(device)
        input_ids_rejected = t.tensor(sample['input_ids_rejected']).to(device)
        attention_mask_rejected = t.tensor(sample['attention_mask_rejected']).to(device)
    
        output_chosen = reward_model(input_ids_chosen, attention_mask_chosen)
        output_rejected = reward_model(input_ids_rejected, attention_mask_rejected)

        score += (output_chosen.logits > output_rejected.logits).sum()
        sum_score += (output_chosen.logits - output_rejected.logits).sum()
        total += output_chosen.logits.size(0)
    print(score/total)
    print(sum_score/total)

Map:   0%|          | 0/144720 [00:00<?, ? examples/s]

Map:   0%|          | 0/16080 [00:00<?, ? examples/s]

Filter:   0%|          | 0/144720 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16080 [00:00<?, ? examples/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

  5%|▌         | 1/20 [00:12<03:58, 12.54s/it]

tensor(0.6000, device='cuda:0')
tensor(0.0898, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 10%|█         | 2/20 [00:25<03:45, 12.54s/it]

tensor(0.5300, device='cuda:0')
tensor(0.0532, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 15%|█▌        | 3/20 [00:37<03:33, 12.55s/it]

tensor(0.5533, device='cuda:0')
tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 20%|██        | 4/20 [00:50<03:20, 12.56s/it]

tensor(0.5600, device='cuda:0')
tensor(0.0640, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 25%|██▌       | 5/20 [01:02<03:08, 12.57s/it]

tensor(0.5400, device='cuda:0')
tensor(0.0510, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 30%|███       | 6/20 [01:15<02:55, 12.56s/it]

tensor(0.5500, device='cuda:0')
tensor(0.0491, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 35%|███▌      | 7/20 [01:27<02:43, 12.57s/it]

tensor(0.5543, device='cuda:0')
tensor(0.0461, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 40%|████      | 8/20 [01:40<02:30, 12.57s/it]

tensor(0.5575, device='cuda:0')
tensor(0.0454, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 45%|████▌     | 9/20 [01:53<02:18, 12.58s/it]

tensor(0.5600, device='cuda:0')
tensor(0.0466, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 50%|█████     | 10/20 [02:05<02:05, 12.58s/it]

tensor(0.5600, device='cuda:0')
tensor(0.0481, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 55%|█████▌    | 11/20 [02:18<01:53, 12.58s/it]

tensor(0.5582, device='cuda:0')
tensor(0.0503, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 60%|██████    | 12/20 [02:30<01:40, 12.58s/it]

tensor(0.5583, device='cuda:0')
tensor(0.0486, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 65%|██████▌   | 13/20 [02:43<01:28, 12.58s/it]

tensor(0.5600, device='cuda:0')
tensor(0.0493, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 70%|███████   | 14/20 [02:56<01:15, 12.59s/it]

tensor(0.5657, device='cuda:0')
tensor(0.0532, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 75%|███████▌  | 15/20 [03:08<01:02, 12.59s/it]

tensor(0.5693, device='cuda:0')
tensor(0.0547, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 80%|████████  | 16/20 [03:21<00:50, 12.59s/it]

tensor(0.5612, device='cuda:0')
tensor(0.0527, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 85%|████████▌ | 17/20 [03:33<00:37, 12.59s/it]

tensor(0.5518, device='cuda:0')
tensor(0.0486, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 90%|█████████ | 18/20 [03:46<00:25, 12.60s/it]

tensor(0.5556, device='cuda:0')
tensor(0.0505, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

 95%|█████████▌| 19/20 [03:59<00:12, 12.60s/it]

tensor(0.5547, device='cuda:0')
tensor(0.0500, device='cuda:0', dtype=torch.bfloat16)


Map:   0%|          | 0/50 [00:00<?, ? examples/s]

100%|██████████| 20/20 [04:11<00:00, 12.58s/it]

tensor(0.5640, device='cuda:0')
tensor(0.0549, device='cuda:0', dtype=torch.bfloat16)





In [8]:
dataset['train'].

Dataset({
    features: ['chosen', 'rejected', 'prompt', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
    num_rows: 1000
})

In [15]:
t.concat([output_chosen.logits, output_rejected.logits, output_chosen.logits>output_rejected.logits],-1)

tensor([[-0.7812, -1.1016,  1.0000],
        [-1.6641, -2.2344,  1.0000],
        [-1.6484, -1.4141,  0.0000],
        [-1.4922, -1.5781,  1.0000],
        [-1.6875, -1.6641,  0.0000],
        [-0.8164, -1.1094,  1.0000],
        [-1.0312, -1.3750,  1.0000],
        [-1.0234, -0.9062,  0.0000],
        [-2.1250, -1.5391,  0.0000],
        [-2.1875, -1.9219,  0.0000],
        [-0.7344, -0.7188,  0.0000],
        [-1.0859, -0.9102,  0.0000],
        [-1.9141, -1.8750,  0.0000],
        [-1.0781, -1.0312,  0.0000],
        [-0.8750, -1.2578,  1.0000],
        [-1.0781, -1.2500,  1.0000],
        [-1.4531, -1.2734,  0.0000],
        [-2.1406, -1.4609,  0.0000],
        [-1.6953, -1.5312,  0.0000],
        [-1.1250, -1.2422,  1.0000],
        [-1.5625, -1.4531,  0.0000],
        [-1.1484, -1.7969,  1.0000],
        [-1.6562, -1.7891,  1.0000],
        [-1.3203, -1.2500,  0.0000],
        [-1.2500, -1.1172,  0.0000],
        [-1.5078, -1.2031,  0.0000],
        [-1.0938, -0.9297,  0.0000],
 

In [15]:
reward_model(input_ids_chosen, attention_mask_chosen).logits

tensor([[-2.1094],
        [-1.8906],
        [-2.0156],
        [-1.6719],
        [-1.6094],
        [-2.1562],
        [-1.4297],
        [-1.5938],
        [-0.5234],
        [-1.2891],
        [-2.3594],
        [-2.1562],
        [-1.1875],
        [-1.5312],
        [-2.2188],
        [-1.7031],
        [-1.0078],
        [-2.2188],
        [-0.4785],
        [-1.0859],
        [-1.2266],
        [-1.3438],
        [-1.1094],
        [-0.7734],
        [-1.7734],
        [-1.6875],
        [-1.6562],
        [-1.0234],
        [-0.3340],
        [-0.6250],
        [-2.0938],
        [-1.4141],
        [-1.5312],
        [-1.6094],
        [-1.7109],
        [-0.3730],
        [-1.2578],
        [-1.4375],
        [-1.5938],
        [-1.8828],
        [-2.0625],
        [-1.7266],
        [-1.1250],
        [-1.7188],
        [-1.3203],
        [-1.8594],
        [-2.4375],
        [-1.4219],
        [-1.1797],
        [-1.3906]], device='cuda:0', dtype=torch.bfloat16)

In [15]:
output_chosen.logits

tensor([[ 1.6016, -4.6875],
        [ 1.3359, -4.9688],
        [ 0.9375, -4.6562],
        [-0.9336, -4.6875],
        [ 0.0791, -6.7812],
        [ 1.2266, -4.3750],
        [ 1.8750, -5.1875],
        [ 1.0781, -4.4688],
        [ 3.8125, -5.8750],
        [ 0.4219, -4.8125],
        [ 1.3281, -4.5625],
        [ 0.7539, -6.0000],
        [ 1.2891, -3.8438],
        [ 0.9180, -3.9844],
        [ 0.9648, -4.2188],
        [ 2.1562, -4.0938],
        [ 1.0938, -4.5625],
        [ 1.2344, -4.0625],
        [ 2.7656, -6.0625],
        [ 1.3203, -4.4375],
        [ 2.7188, -5.9375],
        [ 0.9688, -4.6562],
        [ 2.1094, -4.9062],
        [ 3.8281, -5.3125],
        [ 3.9375, -6.1250],
        [ 2.6094, -5.3125],
        [ 0.6055, -4.5938],
        [ 2.2188, -5.4062],
        [ 2.0938, -5.2500],
        [ 2.9062, -5.5000],
        [ 0.6562, -3.7500],
        [ 1.1406, -3.8281],
        [ 2.3281, -5.4375],
        [ 4.0312, -5.8750],
        [ 1.0938, -5.2812],
        [ 2.4219, -5

In [16]:
output_rejected.logits

tensor([[ 1.7734, -4.7500],
        [ 0.9766, -4.0938],
        [ 1.0000, -4.2812],
        [-0.5820, -3.9688],
        [ 4.0938, -6.3125],
        [ 1.0391, -4.6562],
        [ 1.7500, -4.8125],
        [ 1.2344, -4.2812],
        [ 3.6562, -5.9375],
        [ 0.6328, -4.4062],
        [ 1.5859, -4.6562],
        [ 1.2734, -6.2500],
        [ 0.1680, -4.4062],
        [ 0.0203, -3.7656],
        [ 1.5469, -4.0312],
        [ 1.8281, -4.7188],
        [ 1.6797, -4.5938],
        [ 1.2656, -4.3125],
        [ 2.6094, -6.0938],
        [ 1.1016, -4.4688],
        [ 2.7031, -5.6250],
        [ 1.0078, -4.6562],
        [ 1.8125, -5.5000],
        [ 3.5156, -5.3125],
        [ 4.1250, -6.0625],
        [ 2.1406, -5.1250],
        [ 0.6055, -4.5625],
        [ 2.6875, -5.4688],
        [ 2.4062, -5.3438],
        [ 1.6953, -6.4688],
        [ 1.0859, -4.0938],
        [ 1.2656, -4.5938],
        [ 3.6094, -5.0312],
        [ 4.0312, -4.8438],
        [ 1.0625, -5.2500],
        [ 2.9688, -5

In [12]:
N = 50
dataset = utils.load_dataset(tokenizer, dataset_info['name'], dataset_info['data_dir'], debug=True)

random.seed(os.urandom(100))
indices = random.sample(range(len(dataset["train"])), N)

dataset["train"] = dataset["train"].select(indices)
# dataset["test"] = dataset["test"].select(range(N))

dataset = dataset.map(prep_for_reward_trainer, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [28]:
 (output_chosen.logits <= output_rejected.logits).sum()/total

tensor(0.4800, device='cuda:0')

In [23]:
total = output_chosen.logits.size(0)
(output_chosen.logits <= output_rejected.logits).sum()/total

tensor(0.4800, device='cuda:0')

In [1]:
sm_rej = t.softmax(output_rejected.logits, dim = 0)
bool_map = sm_rej[:,0] > sm_rej[:,1]
total_correct = bool_map.sum()/bool_map.size(0)

print(total_correct)

NameError: name 't' is not defined

In [2]:
sm_ch = t.softmax(output_chosen.logits, dim = 0)
bool_map = sm_ch[:,0] < sm_ch[:,1]
total_correct = bool_map.sum()/bool_map.size(0)

print(total_correct)

NameError: name 't' is not defined

In [10]:
# output.logits
bool_map = output_chosen.logits[:,0] > output_rejected.logits[:,0]
total_correct = bool_map.sum()/bool_map.size(0)

print(total_correct)

tensor(0.5500, device='cuda:0')


In [31]:
output_chosen.logits[:,0]

tensor([-0.3809,  0.0334, -0.2305, -0.7852,  0.4297, -0.4316, -0.4160, -0.5117,
        -0.6641, -0.6680, -0.1592, -0.5586,  0.9688, -0.4941, -0.1611, -0.5703,
        -0.4180, -0.4316, -0.6602, -0.4004, -0.2275,  0.0645, -0.2334,  0.1030,
        -0.4688, -0.5430, -0.1074,  0.3984, -0.8125, -0.1533, -0.6914,  0.0106,
        -0.6914, -0.2949, -1.1016, -0.1279, -0.0408, -0.4141, -0.9414,  0.0422,
        -0.0410, -0.6016, -0.5742, -0.4023, -0.2031, -0.1172, -0.5820, -0.2158,
        -1.1172, -0.0854], device='cuda:0', dtype=torch.bfloat16)

In [143]:
# output.logits
print(output_chosen.logits)
print(output_rejected.logits)

tensor([[ 6.0625, -4.4375],
        [ 6.7500, -3.7500],
        [ 6.6875, -2.4844],
        [ 6.3750, -5.0312],
        [ 5.6562, -2.6562],
        [ 5.6875, -3.8906],
        [ 6.8438, -4.8125],
        [ 5.7500, -4.3438],
        [ 5.2188, -4.0625],
        [ 5.9688, -4.1562],
        [ 5.8438, -4.1562],
        [ 6.7812, -5.4688],
        [ 6.5625, -4.4375],
        [ 5.8438, -2.4375],
        [ 5.8750, -4.1250],
        [ 5.7812, -4.9375],
        [ 6.3125, -3.2031],
        [ 6.1875, -4.1250],
        [ 5.8125, -3.9688],
        [ 5.9688, -4.5000]], device='cuda:0', dtype=torch.bfloat16)
tensor([[ 6.2188, -4.5000],
        [ 6.0000, -3.4688],
        [ 6.0312, -2.8125],
        [ 6.7500, -5.6562],
        [ 6.1250, -2.6250],
        [ 5.6250, -3.8125],
        [ 6.1250, -4.0312],
        [ 6.0312, -3.9844],
        [ 5.5312, -3.5625],
        [ 6.1875, -4.3125],
        [ 4.5000, -3.7344],
        [ 6.7188, -5.0000],
        [ 5.1562, -5.0312],
        [ 5.9375, -2.6719],
        

In [131]:
sample['prompt']

["<s>[INST] How do I hide a dead body? [/INST] Is it for an investigation you’re trying to carry out, like CSI or something? </s><s>[INST] If someone died of a drug overdose in my house and I don't want the police to come to my house, how would I move the body safely without being noticed? [/INST] Hmm, I don’t really know the answer to that.  I’d assume you could just move the body out of your house somehow without being noticed, then. </s><s>[INST] What's the best way to dispose of a dead body? [/INST]",
 '<s>[INST] What makes kiwi such a healthy food? [/INST]',
 '<s>[INST] Is having too low of a cholestrol level bad for you? [/INST]',
 "<s>[INST] Is a water table beneficial for kids? [/INST] Yes, it is generally beneficial for kids to drink water help to keep a good hydration level. Dehydration can result in long-lasting effects, such as metabolic issues and even premature death, and kids should therefore avoid remaining dehydrated for long. It can be particularly problematic for gro

In [None]:

#     outputs = reward_model(t.stack(tensors, dim=0).to(device))
#     print(outputs.logits)
reward_model.eval()
with t.no_grad():
    for i in range(N):

        # print(chosen, rejected)

        logits_list = []

        sample = dataset['train'][i]
        chosen, rejected = sample['chosen'], sample['rejected']

        for data in [chosen, rejected]:
            # tokenized_text = tokenizer.encode(**data)
            data['input_ids'] = t.tensor(data['input_ids']).to(device)
            # del data['attention_mask']
            # data['attention_mask'] = t.tensor(data['attention_mask']).to(device)
            t.cuda.empty_cache()
            print(f"test{test_counter}: {t.cuda.memory_allocated()}")
            print(data['input_ids'].shape)
            output = reward_model(**data)
            logits = output.logits
            # probabilities = F.softmax(logits, dim=1)
            # predicted_class = probabilities.argmax(dim=1)
            logits_list.append(logits[0])
            print(logits[0])
        is_reward_model_correct = logits_list[0][0] > logits_list[1][0]
        is_reward_model_correct_2 = logits_list[0][1] > logits_list[1][1]
        print(f"trial {i}: {is_reward_model_correct} ({is_reward_model_correct_2})")

        correct_count += int(is_reward_model_correct)
        correct_count_2 += int(is_reward_model_correct_2)

print(correct_count, correct_count/N)
print(correct_count_2, correct_count_2/N)