In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
import os
import torch
from dataclasses import dataclass, field
from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    set_seed,
    AutoModelForSequenceClassification,
)
from redditqa.dataset import load_reddit_dataset

  from .autonotebook import tqdm as notebook_tqdm
2023-08-31 16:00:25.298758: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Login to the HuggingFace Hub
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN", None)
if HUGGINGFACE_TOKEN is not None:
    login(token=HUGGINGFACE_TOKEN)


Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
model_name = 'psmathur/orca_mini_3b'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=True`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
train_dataset = load_reddit_dataset("train", pairs=True).select(range(1000))
eval_dataset = load_reddit_dataset("eval", pairs=True).select(range(1000))

Saving the dataset (1/1 shards): 100%|██████████| 13125/13125 [00:01<00:00, 9811.41 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 5626/5626 [00:00<00:00, 10491.57 examples/s]


In [6]:
train_dataset[0].keys()

dict_keys(['answer_link_id', 'question_title', 'response_j', 'response_k', 'score_j', 'score_k'])

In [7]:
# Turn the dataset into pairs of post + summaries, where text_j is the preferred question + answer and text_k is the other.
# Then tokenize the dataset.
def preprocess_function(examples):
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "score_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
        "score_rejected": [],
        "margin": [],
    }
    for question_title, response_j, response_k, score_j, score_k in zip(
        examples["question_title"], examples["response_j"], examples["response_k"], examples["score_j"], examples["score_k"]
    ):
        template = "<|ELIF|> Question: %question\nAnswer: %answer"

        text_j = template.replace("%question", question_title).replace("%answer", response_j)
        text_k = template.replace("%question", question_title).replace("%answer", response_k)
        tokenized_j = tokenizer(text_j, truncation=True)
        tokenized_k = tokenizer(text_k, truncation=True)

        new_examples["input_ids_chosen"].append(tokenized_j["input_ids"])
        new_examples["attention_mask_chosen"].append(tokenized_j["attention_mask"])
        new_examples["score_chosen"].append(score_j)
        new_examples["input_ids_rejected"].append(tokenized_k["input_ids"])
        new_examples["attention_mask_rejected"].append(tokenized_k["attention_mask"])
        new_examples["score_rejected"].append(score_k)
        new_examples["margin"].append(score_j - score_k)

    return new_examples


original_columns = train_dataset.column_names

# Preprocess the dataset
train_dataset_preproc = train_dataset.map(preprocess_function, num_proc=1, remove_columns=original_columns, batched=True)
eval_dataset_preproc = eval_dataset.map(preprocess_function, num_proc=1, remove_columns=original_columns, batched=True)

# # Filter 
# train_dataset = train_dataset.filter(
#     lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length
# )
# eval_dataset = eval_dataset.filter(
#     lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length
# )

# # Shuffle and select a subset of the dataset
# eval_dataset_preproc = eval_dataset_preproc.shuffle(seed=SEED).select(range(script_args.eval_subsample))

print("Finished preprocessing dataset.")
print("Number of training examples: ", len(train_dataset))
print("Number of eval examples: ", len(eval_dataset))


Finished preprocessing dataset.
Number of training examples:  1000
Number of eval examples:  1000


In [8]:
train_dataset_preproc[0].keys()

dict_keys(['input_ids_chosen', 'attention_mask_chosen', 'score_chosen', 'input_ids_rejected', 'attention_mask_rejected', 'score_rejected', 'margin'])

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    num_labels=1,
)

Loading checkpoint shards: 100%|██████████| 3/3 [00:07<00:00,  2.53s/it]
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at psmathur/orca_mini_3b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model = model.cuda()

In [11]:
import importlib

from trl import trainer as trl_trainer
from trl.trainer import utils as trl_utils

# from trl.trainer import RewardTrainer
# from trl.trainer.utils import RewardDataCollatorWithPadding

importlib.reload(trl_trainer)
importlib.reload(trl_utils)

RewardTrainer = trl_trainer.RewardTrainer
RewardDataCollatorWithPadding = trl_utils.RewardDataCollatorWithPadding

In [12]:
reward_collator = RewardDataCollatorWithPadding(tokenizer=tokenizer, **{'padding': True, 'max_length': 256, 'pad_to_multiple_of': None, 'return_tensors': 'pt'})
y = reward_collator([train_dataset_preproc[i] for i in range(1)])

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [13]:
y = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in y.items()}

In [14]:
reward_trainer = RewardTrainer(model=model, data_collator=reward_collator)
reward_trainer.compute_loss(model, y)



tensor(171.3750, device='cuda:0', grad_fn=<NegBackward0>)

In [2]:
import datasets
from datasets import Dataset
import torch
# fmt: off
dummy_dataset_dict = {
    "input_ids_chosen": [
        torch.LongTensor([0, 1, 2,]),
    ],
    "attention_mask_chosen": [
        torch.LongTensor([1, 1, 1]),
    ],
    "input_ids_rejected": [
        torch.LongTensor([0, 2,]),
    ],
    "attention_mask_rejected": [
        torch.LongTensor([1, 1]),
    ],
}
# fmt: on
dummy_dataset = Dataset.from_dict(dummy_dataset_dict)

In [4]:
def add_margin(row): 
    return {'margin': row['score_chosen'] - row['score_rejected']}
dummy_dataset = dummy_dataset.map(add_margin)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]


KeyError: 'score_chosen'