In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [2]:
from dataclasses import dataclass, field
from typing import Optional
import huggingface_hub
import functools as ft
import torch
import pandas as pd
import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline, AutoConfig, GPTNeoXForCausalLM, AutoModelForCausalLM
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from transformers import pipeline, TextGenerationPipeline, AutoConfig, AutoTokenizer, AutoModelForCausalLM, AutoModel, AutoModelForSequenceClassification, GPTNeoXForCausalLM, LlamaForSequenceClassification
from redditqa.dataset import load_reddit_dataset
from textblob import TextBlob

  from .autonotebook import tqdm as notebook_tqdm
2023-07-30 15:54:39.039729: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Load the dataset 

eval_dataset = load_reddit_dataset("eval", pairs=True)
eval_dataset = eval_dataset.shuffle(seed=42).select(range(1000))

eval_dataset, eval_dataset[0]

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-cba55e4212677d14.arrow
Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-d8898fc7c787d1eb.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-e35089f0b695ca2b.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-ea21b592f4358562.arrow


(Dataset({
     features: ['answer_link_id', 'question_title', 'response_j', 'response_k', 'score_j', 'score_k'],
     num_rows: 1000
 }),
 {'answer_link_id': '2y0dxt',
  'question_title': 'Why do employers ask "where do you see yourself in 5-10 years?" How do personal goals matter at all?',
  'response_j': 'After you respond, be sure to ask, "How do you see the company changing over that timespan?"',
  'response_k': "Whatever job you're applying for, think of the logical career path and where you should be in fifteen years. Like all basic interview questions it's more about whether you can have an adult conversation than the actual answers. ",
  'score_j': 13,
  'score_k': 2})

In [33]:
# Add sentiment analysis and question detection to both answers 

sentiment_pipe = pipeline("text-classification", model="michellejieli/emotion_text_classifier", device='cuda:0')
question_detection_pipe = pipeline("text-classification", model="huaen/question_detection", device='cuda:0')

def apply_get_emotions(row): 
    row['emotion_j'] = sentiment_pipe(row['response_j'])[0]['label']
    row['emotion_k'] = sentiment_pipe(row['response_k'])[0]['label']
    return row

def apply_question_detection(row): 
    row['is_question_j'] = question_detection_pipe(row['response_j'])[0]['label']
    row['is_question_k'] = question_detection_pipe(row['response_k'])[0]['label']
    return row
    
eval_dataset = eval_dataset.map(apply_get_emotions).map(apply_question_detection)

                                                               

In [9]:
# Let's free up some cuda memory

del sentiment_pipe
del question_detection_pipe

import gc
gc.collect()
torch.cuda.empty_cache()

In [10]:
def percent(x): 
    return "{:.2f}%".format(x * 100)

def apply_textblob(row):
    textblob_j = TextBlob(row["response_j"]).sentiment
    textblob_k = TextBlob(row["response_k"]).sentiment
    row["polarity_j"] = percent(textblob_j.polarity)
    row["subjectivity_j"] = percent(textblob_j.subjectivity)
    row["polarity_k"] = percent(textblob_k.polarity)
    row["subjectivity_k"] = percent(textblob_k.subjectivity)
    return row

eval_dataset = eval_dataset.map(apply_textblob)

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/eval/cache-55f17661bd97c21a.arrow


In [5]:
model_checkpoint = '/scratch1/jhoff/checkpoints/reward_llama-2-7b-chat-hf/checkpoint-3000_merged'

model_config = AutoConfig.from_pretrained(model_checkpoint)
model_config.architectures  = ['LlamaForSequenceClassification']
model_config.num_labels = 1


In [6]:
model_config

LlamaConfig {
  "_name_or_path": "/scratch1/jhoff/checkpoints/reward_llama-2-7b-chat-hf/checkpoint-3000_merged",
  "architectures": [
    "LlamaForSequenceClassification"
  ],
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 4096,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 32,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.32.0.dev0",
  "use_cache": true,
  "vocab_size": 32000
}

In [7]:
reward_model = LlamaForSequenceClassification.from_pretrained(model_checkpoint, torch_dtype=torch.bfloat16, config=model_config)
#reward_model = LlamaForSequenceClassification.from_pretrained(model_checkpoint, torch_dtype=torch.bfloat16, num_labels=1)
#reward_model.cuda()

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.85s/it]
Some weights of the model checkpoint at /scratch1/jhoff/checkpoints/reward_llama-2-7b-chat-hf/checkpoint-3000_merged were not used when initializing LlamaForSequenceClassification: ['score.modules_to_save.default.weight', 'score.original_module.weight']
- This IS expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LlamaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /scratch1/jhoff/checkpoints/reward_llama-2-7b-chat-hf/checkpoint-3000_merged and 

In [8]:
#model_checkpoint_unmerged = model_checkpoint.replace("_merged", "")
#adapter_weights = torch.load(f'{model_checkpoint_unmerged}/adapter_model.bin', map_location='cpu')
#reward_model.score.weight.data = adapter_weights['base_model.model.score.weight']

In [9]:
reward_pipe = pipeline(
    "sentiment-analysis",
    model=reward_model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    device=0,
)

In [10]:
reward_pipe.model.score.weight

Parameter containing:
tensor([[-0.0085,  0.0123,  0.0075,  ...,  0.0145,  0.0075, -0.0142]],
       device='cuda:0', dtype=torch.bfloat16, requires_grad=True)

In [11]:
template = "<|ELIF|> Question: %question\nAnswer: %answer"

def apply_reward_model(row):

    question_title = row["question_title"]
    response_j = row["response_j"]
    response_k = row["response_k"]

    qa_j = template.replace("%question", question_title).replace("%answer", response_j)
    reward_j = reward_pipe(qa_j)[0]["score"]

    qa_k = template.replace("%question", question_title).replace("%answer", response_k)
    reward_k = reward_pipe(qa_k)[0]["score"]

    return {
        'reward_j': float(reward_j),
        'reward_k': float(reward_k),
    }

eval_dataset = eval_dataset.map(apply_reward_model)

                                                              

In [12]:
eval_dataset[0]

{'answer_link_id': '2y0dxt',
 'question_title': 'Why do employers ask "where do you see yourself in 5-10 years?" How do personal goals matter at all?',
 'response_j': 'After you respond, be sure to ask, "How do you see the company changing over that timespan?"',
 'response_k': "Whatever job you're applying for, think of the logical career path and where you should be in fifteen years. Like all basic interview questions it's more about whether you can have an adult conversation than the actual answers. ",
 'score_j': 13,
 'score_k': 2,
 'reward_j': 0.9744347929954529,
 'reward_k': 0.5580862164497375}

In [14]:
correct = 0
for row in eval_dataset:
    if row['reward_j'] >= row['reward_k']:
        correct += 1

print(f'Accuracy: {correct / len(eval_dataset)}')

Accuracy: 0.503
