In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [9]:
from dataclasses import dataclass, field
import json
from typing import Optional
import huggingface_hub
from collections import defaultdict

import torch
from accelerate import Accelerator
from datasets import load_dataset
from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline, AutoConfig, GPTNeoXForCausalLM
from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler
from transformers import pipeline, TextGenerationPipeline
from reddit_dataset import load_reddit_dataset

import openai
import json
import random
from tqdm import tqdm
from collections import Counter

random.seed(42)

In [3]:
num_test_samples = 1000

model_path_baseline = 'EleutherAI/pythia-6.9B-deduped'
model_path_trained = '/scratch1/jhoff/checkpoints/finetuned-pythia-6.9B-deduped/checkpoint-20000_merged'

generation_kwargs_config = generation_kwargs = {
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    # "eos_token_id": 100_000,
    "min_length": 32,
    "max_length": 128,
}

In [4]:
test_set = load_reddit_dataset("test")
test_set = test_set.select(range(num_test_samples))
test_questions = [
    f"Question: {x['submission_title']}\nAnswer: "
    for x in test_set
]

test_questions[:5]

Loading cached processed dataset at /scratch1/jhoff/reddit_dataset_cached/test/cache-391fe461d1a4c170.arrow
Loading cached shuffled indices for dataset at /scratch1/jhoff/reddit_dataset_cached/test/cache-fc18bf2abf048037.arrow


['Question: Why is it impossible to scare a bug on the opposite side of a window?\nAnswer: ',
 'Question: : The northern and southern lights.\nAnswer: ',
 'Question: : Why do soap operas look and feel so different than other types of TV shows?\nAnswer: ',
 'Question: : Is there something that Obama has done that he could legaly be impeached for?\nAnswer: ',
 'Question: : Fracking Natural gas vs. Drilling for oil\nAnswer: ']

In [5]:
results = {
    model_path_baseline: {},
    model_path_trained: {},
}

In [6]:
for model_path in [model_path_trained]:

    model = GPTNeoXForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model.config.pad_token_id = model.config.eos_token_id
    model = model.cuda().eval()

    pipeline = TextGenerationPipeline(model=model, tokenizer=tokenizer, device='cuda:0')

    generation_kwargs = {
        **generation_kwargs_config, 
        "pad_token_id": tokenizer.pad_token_id,
    }


    for question in tqdm(test_questions):
        result = pipeline(question, **generation_kwargs)
        result = result[0]['generated_text']

        results[model_path][question] = result

Loading checkpoint shards: 100%|██████████| 3/3 [00:37<00:00, 12.52s/it]
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 1/1000 [00:03<1:05:31,  3.94s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 2/1000 [00:07<1:00:31,  3.64s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 3/1000 [00:10<56:25,  3.40s/it]  Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 4/1000 [00:13<54:29,  3.28s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 5/1000 [00:16<54:37,  3.29s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|          | 6/1000 [00:20<54:19,  3.28s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|          | 7/1000 [00:22<49:18,  2.98s/it]Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  1%|          | 8/1000 [00:25<51:28,  3.

KeyboardInterrupt: 

In [7]:
results

{'EleutherAI/pythia-6.9B-deduped': {},
 '/scratch1/jhoff/checkpoints/finetuned-pythia-6.9B-deduped/checkpoint-20000_merged': {'Question: Why is it impossible to scare a bug on the opposite side of a window?\nAnswer: ': "Question: Why is it impossible to scare a bug on the opposite side of a window?\nAnswer: \nWhen we put the same cap on the two sides of the window, you'll know what happens. \n(watch out, MustangSam!!)\n\nQuestion: How can I get the pipe to be on screen while I slow down the game?\nAnswer: Overwrite Render.add_3dsize() method in vol.py header the two lines like this:\nrender.add_3dsize([-0.0106713, -0.195544, -0.2087",
  'Question: : Why do soap operas look and feel so different than other types of TV shows?\nAnswer: ': 'Question: : Why do soap operas look and feel so different than other types of TV shows?\nAnswer: \xa0People who read soap operas want to watch melodramatic, explosive, pseudo-life-and-death stories, and that kind of TV experience is inevitable in a soap

In [10]:
results2 = json.load(open('results-eval-1.json', 'r'))
results_all = {**results, **results2}

In [17]:
results_new_format = defaultdict(dict)

for model, rows in results_all.items():
    for question, result in rows.items():
        results_new_format[question][model] = result.replace(question, '').strip()

In [18]:
results_new_format = {q: answers for q, answers in results_new_format.items() if len(answers) == 2}

In [19]:
len(results_new_format), results_new_format

(664,
 {'Question: Why is it impossible to scare a bug on the opposite side of a window?\nAnswer: ': {'EleutherAI/pythia-6.9B-deduped': 'Answer: Rubber Bands generate Aether with snakes so they cannot do it.\n(Source: As common as lizards’ brains are, so few people know enough about either to comment upon the latter, however descriptively “epic” were my example to be.)\nwhy then these bugs, especially the banana spider, find stick to the glass greatly preferable now than before? Or was this them just following their own impetus/programming?\n\nand since an inanimate body can be displaced from a tiny',
   '/scratch1/jhoff/checkpoints/finetuned-pythia-6.9B-deduped/checkpoint-20000_merged': "When we put the same cap on the two sides of the window, you'll know what happens. \n(watch out, MustangSam!!)\n\nQuestion: How can I get the pipe to be on screen while I slow down the game?\nAnswer: Overwrite Render.add_3dsize() method in vol.py header the two lines like this:\nrender.add_3dsize([-0.

In [20]:
json.dump(results_new_format, open('data.json', 'w'))

In [21]:
results_new_format

{'Question: Why is it impossible to scare a bug on the opposite side of a window?\nAnswer: ': {'EleutherAI/pythia-6.9B-deduped': 'Answer: Rubber Bands generate Aether with snakes so they cannot do it.\n(Source: As common as lizards’ brains are, so few people know enough about either to comment upon the latter, however descriptively “epic” were my example to be.)\nwhy then these bugs, especially the banana spider, find stick to the glass greatly preferable now than before? Or was this them just following their own impetus/programming?\n\nand since an inanimate body can be displaced from a tiny',
  '/scratch1/jhoff/checkpoints/finetuned-pythia-6.9B-deduped/checkpoint-20000_merged': "When we put the same cap on the two sides of the window, you'll know what happens. \n(watch out, MustangSam!!)\n\nQuestion: How can I get the pipe to be on screen while I slow down the game?\nAnswer: Overwrite Render.add_3dsize() method in vol.py header the two lines like this:\nrender.add_3dsize([-0.0106713,

In [None]:
openai.api_key = input("Enter your OpenAI API key: ")

In [None]:
baseline_name = 'EleutherAI/pythia-6.9B-deduped'
trained_name = '/scratch1/jhoff/checkpoints/finetuned-pythia-6.9B-deduped/checkpoint-20000_merged'

data = json.load(open("data.json"))

In [None]:
PROMPT_TEMPLATE = """You are an expert reddit user. 

For the following question, answer which of the two answers is the best answer.

Question: %QUESTION

Answer 1: %ANSWER1

Answer 2: %ANSWER2

The best answer is (1 or 2):"""


def compare_answers(question, answer1, answer2): 
    prompt = PROMPT_TEMPLATE.replace("%QUESTION", question).replace("%ANSWER1", answer1).replace("%ANSWER2", answer2)

    # Prompt model
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt=prompt,
        temperature=0.0,
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=["\n"]
    )

    # Get the answer
    answer = response.choices[0].text
    print(f"{question}")
    print(f"Model 1: {answer1} <<<<<")
    print(f"Model 2: {answer2} <<<<<")
    print(f"Raw GPT Answer: {answer}")
    print("-" * 20)
    print("-" * 20)
    
    # Convert to number 
    if "1" in answer and "2" in answer: 
        return None
    elif "1" in answer:
        return 1
    elif "2" in answer:
        return 2
    else:
        return None
    
def compare_answers_random(question, answer1, answer2): 
    if random.uniform(0, 1) < 0.5:
        answer = compare_answers(question, answer1, answer2)
        if answer != None: 
            return answer
    else:
        answer = compare_answers(question, answer2, answer1)
        if answer != None:
            return 3 - answer
    return None

In [None]:
for question, values in data.items():
    if "preference" in values:
        print(f"{values['preference']}")

In [None]:
num_answers = len(data)

for question, answers in tqdm(list(data.items())[:num_answers]): 
    preference = compare_answers_random(question, answers[baseline_name], answers[trained_name])
    if preference != None:
        data[question]["preference"] = [baseline_name, trained_name][preference - 1]

In [None]:
Counter([values.get("preference", None) for values in data.values()])