In [1]:
# Switch the kernel's working directory to the project checkout; presumably this
# is what makes the local `redditqa` package importable below (confirm).
# NOTE(review): the path is user-specific, adjust it when running elsewhere.
%cd ~/reddit_qa

/home/jhoffbauer/reddit_qa


In [2]:
import os

# Restrict this process to the GPU with index 1. This is set before the cell
# that imports torch so the restriction is in place when CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [3]:
import argparse
import os
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import evaluate
import numpy as np
import torch.nn as nn
from accelerate import Accelerator, infer_auto_device_map, init_empty_weights
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, TaskType, get_peft_model
from tqdm import tqdm
import torch
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    HfArgumentParser,
    LlamaForCausalLM,
    LlamaTokenizer,
    PreTrainedTokenizerBase,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
)
from transformers.utils import PaddingStrategy
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset

from redditqa.dataset import load_reddit_dataset

  from .autonotebook import tqdm as notebook_tqdm
2023-07-25 08:48:45.226883: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Single seed for every stochastic step in this notebook.
SEED = 42

# Actually apply the seed (it was previously defined but never used), so that
# randomly initialised weights and dropout are reproducible across runs.
# `set_seed` comes from the `transformers` import in the imports cell.
set_seed(SEED)

In [4]:
# Hub identifier of the checkpoint; the tokenizer cell below loads from this name.
model_name = "meta-llama/Llama-2-7b-chat-hf"

In [5]:
# Load the tokenizer that belongs to the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Llama-2 checkpoints get an explicit "<pad>" token registered here
# (presumably because the stock tokenizer defines none; confirm).
uses_llama2_tokenizer = "llama-2" in model_name.lower()
if uses_llama2_tokenizer:
    print("Setting pad token for LLama-2 tokenizer")
    tokenizer.add_special_tokens({"pad_token": "<pad>"})

Setting pad token for LLama-2 tokenizer


In [6]:
# LoRA adapter settings for a sequence-classification head.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Load the checkpoint named by `model_name`, the same one the tokenizer was
# loaded from, instead of repeating the hub id, so the two cannot drift apart.
# num_labels=1 gives a single-output head, used below as a scalar reward.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    torch_dtype=torch.bfloat16,
)

# Size the embedding matrix from the tokenizer itself. "vocab_size + 1" is only
# correct when exactly one token was added; len(tokenizer) is right for any
# number of added tokens, including none.
model.resize_token_embeddings(len(tokenizer))
model.config.update(dict(pad_token_id=tokenizer.pad_token_id))

# Wrap the base model with the LoRA adapters defined above.
model = get_peft_model(model, peft_config)

# Move to the GPU; as the last expression this also displays the module tree.
model.cuda()

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.31s/it]
Some weights of the model checkpoint at meta-llama/Llama-2-7b-chat-hf were not used when initializing LlamaForSequenceClassification: ['lm_head.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inferenc

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32001, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
            

In [29]:
# Load the train and eval splits through the project's own loader.
# NOTE(review): `pairs=True` presumably yields one row per question with two
# responses (`response_j`, `response_k`), as read by the preprocessing cell; confirm.
train_dataset, eval_dataset = (
    load_reddit_dataset(split, pairs=True) for split in ("train", "eval")
)

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/train/cache-757f0ee80690267b.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/train/cache-ec6a7572749a5bef.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/train/cache-fcce8e9f7c96b1b8.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-cba55e4212677d14.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-d8898fc7c787d1eb.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-e35089f0b695ca2b.arrow


In [30]:
def _format_qa(question, answer):
    """Render one question/answer pair with the ELIF prompt template."""
    # Direct formatting instead of chained str.replace: with replace, a question
    # containing the literal "%answer" had that text overwritten by the response.
    return f"<|ELIF|> Question: {question}\nAnswer: {answer}"


def preprocess_function(examples, tokenize_fn=None):
    """Tokenize a batch of preference pairs for reward-model training.

    For every row, two prompts are built from the same question title: one
    with ``response_j`` (the preferred answer, per the original author's note)
    and one with ``response_k`` (the other answer). Each prompt is tokenized
    separately and no padding or truncation is applied here.

    Args:
        examples: Batch in columnar form (one list per column), holding
            equally long lists under "question_title", "response_j" and
            "response_k".
        tokenize_fn: Optional callable mapping a string to a mapping with
            "input_ids" and "attention_mask". Defaults to the notebook-level
            ``tokenizer``; mainly useful for testing.

    Returns:
        Dict with the list-valued columns "input_ids_j", "attention_mask_j",
        "input_ids_k" and "attention_mask_k", one entry per input row.
    """
    if tokenize_fn is None:
        tokenize_fn = tokenizer

    new_examples = {
        "input_ids_j": [],
        "attention_mask_j": [],
        "input_ids_k": [],
        "attention_mask_k": [],
    }
    for question_title, response_j, response_k in zip(
        examples["question_title"], examples["response_j"], examples["response_k"]
    ):
        tokenized_j = tokenize_fn(_format_qa(question_title, response_j))
        tokenized_k = tokenize_fn(_format_qa(question_title, response_k))

        new_examples["input_ids_j"].append(tokenized_j["input_ids"])
        new_examples["attention_mask_j"].append(tokenized_j["attention_mask"])
        new_examples["input_ids_k"].append(tokenized_k["input_ids"])
        new_examples["attention_mask_k"].append(tokenized_k["attention_mask"])

    return new_examples

# Tokenize both splits (dropping the raw text columns), then keep only the
# pairs whose two token sequences each contain at most `max_length` tokens.
max_length = 256
num_proc = 1  # Raise this to spread tokenization over more worker processes.
original_columns = train_dataset.column_names


def _fits_max_length(example):
    """Return True when both sequences of the pair are at most `max_length` tokens long."""
    longest = max(len(example["input_ids_j"]), len(example["input_ids_k"]))
    return longest <= max_length


map_kwargs = dict(num_proc=num_proc, remove_columns=original_columns, batched=True)
train_dataset = train_dataset.map(preprocess_function, **map_kwargs)
eval_dataset = eval_dataset.map(preprocess_function, **map_kwargs)
train_dataset = train_dataset.filter(_fits_max_length)
eval_dataset = eval_dataset.filter(_fits_max_length)
# Disabled: optional eval subsampling. It needs a `script_args` object that is
# not defined in the cells shown here.
#eval_dataset = eval_dataset.shuffle(seed=SEED).select(range(script_args.eval_subsample))

                                                                         

In [31]:
# Take the first tokenized pair and report the length of each of its fields.
x = next(iter(train_dataset))
{field: len(values) for field, values in x.items()}

{'input_ids_j': 52,
 'attention_mask_j': 52,
 'input_ids_k': 94,
 'attention_mask_k': 94}

In [32]:
# Decode the "j" sequence back to text to eyeball the rendered prompt template.
tokenizer.decode(x["input_ids_j"])

"<s> <|ELIF|> Question: Why do so many Americans blame Obama for practically everything?\nAnswer: Isn't this true everywhere? I know David Cameron gets blamed for most stuff that goes wrong in the UK."

In [33]:
# Same check for the "k" sequence: same question, the other response.
tokenizer.decode(x["input_ids_k"])

"<s> <|ELIF|> Question: Why do so many Americans blame Obama for practically everything?\nAnswer: The president single handedly holds more power than any other individual in the US. However, he still has a proportionately low amount of control over what's going on. Because of the former, he is the easiest scapegoat. Because of the latter, those who subscribe to the former are imbeciles. "

In [34]:
@dataclass
class RewardDataCollatorWithPadding:
    """Collate tokenized preference pairs into two separately padded batches.

    Each incoming feature carries a "j" and a "k" sequence. The two sides are
    padded independently via ``tokenizer.pad`` and returned side by side in
    one dict.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded to ``tokenizer.pad``; a falsy value is replaced by "max_length".
    padding: Union[bool, str, PaddingStrategy] = True
    # Forwarded unchanged to ``tokenizer.pad``.
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def _pad(self, encoded: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Pad one side of the batch using the collator's configured options."""
        return self.tokenizer.pad(
            encoded,
            padding=self.padding or "max_length",
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Split the features into their j and k sides, pad each, and merge.

        Args:
            features: One dict per example with the keys "input_ids_j",
                "attention_mask_j", "input_ids_k" and "attention_mask_k".

        Returns:
            Dict with the padded "input_ids_j", "attention_mask_j",
            "input_ids_k" and "attention_mask_k", plus "return_loss": True.
        """
        side_j = [
            {"input_ids": item["input_ids_j"], "attention_mask": item["attention_mask_j"]}
            for item in features
        ]
        side_k = [
            {"input_ids": item["input_ids_k"], "attention_mask": item["attention_mask_k"]}
            for item in features
        ]

        padded_j = self._pad(side_j)
        padded_k = self._pad(side_k)

        return {
            "input_ids_j": padded_j["input_ids"],
            "attention_mask_j": padded_j["attention_mask"],
            "input_ids_k": padded_k["input_ids"],
            "attention_mask_k": padded_k["attention_mask"],
            "return_loss": True,
        }

In [35]:
# Build the collator and run it on a one-example batch as a smoke test.
reward_collator_args = dict(
    padding=True,
    max_length=256,
    pad_to_multiple_of=None,
    return_tensors="pt",
)
reward_collator = RewardDataCollatorWithPadding(tokenizer, **reward_collator_args)

# `x` is the single tokenized pair taken from the training set earlier.
y = reward_collator([x])
y



{'input_ids_j': tensor([[    1,   529, 29989, 29923,  5265, 29943, 29989, 29958,   894, 29901,
           3750,   437,   577,  1784, 23035,  1999,   420,  4250,  3304,   363,
           4120,  1711,  4129, 29973,    13, 22550, 29901, 29489, 29915, 29873,
            445,  1565, 16978, 29973,   306,  1073,  4699, 20939,   265,  4947,
           1999,  2795,   363,  1556,  6433,   393,  5771,  2743,   297,   278,
          10261, 29889]]),
 'attention_mask_j': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1]]),
 'input_ids_k': tensor([[    1,   529, 29989, 29923,  5265, 29943, 29989, 29958,   894, 29901,
           3750,   437,   577,  1784, 23035,  1999,   420,  4250,  3304,   363,
           4120,  1711,  4129, 29973,    13, 22550, 29901,   450,  6673,  2323,
           1361, 23244,  8640,   901,  3081,  1135,   738,   916,  5375,   297,
            27

In [36]:
# With a single example in the batch, the collated ids should equal the
# original token ids; compare as plain Python lists.
y["input_ids_k"][0].tolist() == x["input_ids_k"]

True

In [39]:
inputs = y

# Move each side of the pair to the GPU.
ids_j, mask_j = inputs["input_ids_j"].cuda(), inputs["attention_mask_j"].cuda()
ids_k, mask_k = inputs["input_ids_k"].cuda(), inputs["attention_mask_k"].cuda()

# Element 0 of the model output is taken as the reward for each sequence.
rewards_j = model(input_ids=ids_j, attention_mask=mask_j)[0]
rewards_k = model(input_ids=ids_k, attention_mask=mask_k)[0]

# Pairwise ranking loss: small when the "j" reward exceeds the "k" reward.
loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()

In [40]:
# The score head is freshly initialised, so the two rewards should be nearly
# equal and -logsigmoid(~0) is expected to sit close to ln(2) ≈ 0.693.
loss

tensor(0.6719, device='cuda:0', dtype=torch.bfloat16, grad_fn=<NegBackward0>)

In [41]:
# Smoke-test the backward pass. No optimizer step follows in the cells shown
# here, so this only computes gradients; nothing is updated.
loss.backward()

In [44]:
# Display the wrapped module tree again (the output below is the model's repr).
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(32001, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear(in_features=4096, out_features=4096, bias=False)
            