In [1]:
import getpass
import json
import random
import re
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from typing import Optional

import datasets as ds
import huggingface_hub
import torch
from accelerate import Accelerator
from datasets import load_dataset
from openai import OpenAI

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Read the API key with getpass so the typed secret is masked and is not
# echoed into the notebook's saved cell output (plain input() shows it).
client = OpenAI(api_key=getpass.getpass("OpenAI API key: "))

In [4]:
# Load the cached AskHistorians test set from disk and display its summary.
test_set_path = "/scratch1/redditqa/cached_datasets/AskHistorians_test_set_100_model_generated"
dataset = ds.load_from_disk(test_set_path)
dataset

Dataset({
    features: ['question_created_utc', 'question_retrieved_on', 'question_deleted', 'question_title', 'question_selftext', 'question_score', 'question_char_length', 'question_selftext_char_length', 'answers', 'graded_output', 'Zephyr-7B-beta', 'Zephyr-History-7600'],
    num_rows: 100
})

In [5]:
# Names of the two dataset columns holding the answers that will be compared.
model_a, model_b = 'Zephyr-7B-beta', 'Zephyr-History-7600'

In [6]:
def get_model_order(models):
    """Return two distinct entries of ``models`` in a random order.

    Draws from the module-level ``random`` generator, so the result is only
    reproducible if ``random.seed`` was called beforehand.
    """
    shuffled_pair = random.sample(models, k=2)
    return shuffled_pair

In [11]:
# Judge prompt for the pairwise comparison. The placeholders %QUESTION,
# %ANSWER1 and %ANSWER2 are filled in by gpt4_preference before the prompt is
# sent; the judge is asked to end with "[[A]]", "[[B]]" or "[[C]]" (tie).
PROMPT_TEMPLATE = """
You are a seasoned historian tasked with evaluating responses to historical questions. 
Consider the following question and assess which of the two provided 
answers presents the most accurate and comprehensive information. Your evaluation
should consider factors such as the helpfulness, relevance, accuracy, depth, creativity,
and level of detail of their responses. Begin your evaluation by comparing the two
responses and provide a short explanation. Avoid any position biases and ensure that the
order in which the responses were presented does not influence your decision. Do not allow
the length of the responses to influence your evaluation. Do not favor certain names of
the assistants. Be as objective as possible. After providing your explanation, output your
final verdict by strictly following this format: "[[A]]" if assistant A is better, "[[B]]"
if assistant B is better, and "[[C]]" for a tie.

[User Question]
Question: %QUESTION

[The Start of Assistant A's Answer]
%ANSWER1
[The End of Assistant A's Answer]

[The Start of Assistant B's Answer]
%ANSWER2
[The End of Assistant B's Answer]
""".strip()

In [16]:
def _final_verdict(answer):
    """Return the letter of the last [[A]]/[[B]]/[[C]] marker in ``answer``, or None."""
    verdicts = re.findall(r"\[\[([ABC])\]\]", answer)
    return verdicts[-1] if verdicts else None


def gpt4_preference(ds_item):
    """Ask GPT-4 which of the two models' answers to a question is better.

    The answers stored under the ``model_a`` and ``model_b`` columns are shown
    to the judge as Assistant A / Assistant B in a random order (see
    ``get_model_order``) to counter position bias.

    Args:
        ds_item: Mapping with a 'question_title' string and one answer string
            under each of the ``model_a`` / ``model_b`` keys.

    Returns:
        The same ``ds_item`` with three keys added:
        'model-order' (which model was shown as Answer1 / Answer2),
        'raw-gpt4-answer' (the judge's full reply, "" if the reply was empty),
        'gpt4-preference' (name of the preferred model, or "" for a tie or
        when no verdict marker was found).
    """
    model1, model2 = get_model_order([model_a, model_b])

    # Fill all placeholders in a single pass so that placeholder-like text
    # inside the question or an answer is never substituted a second time.
    fields = {
        "QUESTION": ds_item['question_title'],
        "ANSWER1": ds_item[model1],
        "ANSWER2": ds_item[model2],
    }
    user_prompt = re.sub(
        r"%(QUESTION|ANSWER1|ANSWER2)",
        lambda match: fields[match.group(1)],
        PROMPT_TEMPLATE,
    )

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[{"role": "user", "content": user_prompt}],
        temperature=0.0,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    # The API declares message.content as optional; normalise a missing reply
    # to "" so the verdict parsing below cannot fail on None.
    answer = response.choices[0].message.content or ""

    ds_item['model-order'] = f"Answer1:{model1};Answer2:{model2}"
    ds_item['raw-gpt4-answer'] = answer

    # Convert to preference. The prompt asks for the verdict after the
    # explanation, so the LAST marker is the final verdict; markers mentioned
    # earlier in the explanation must not decide the outcome.
    verdict = _final_verdict(answer)
    ds_item['gpt4-preference'] = {"A": model1, "B": model2}.get(verdict, "")

    return ds_item

In [17]:
# Run the judge on the first example and display the annotated record.
first_item = dataset[0]
gpt4_preference(first_item)

{'question_created_utc': 1664471496000,
 'question_retrieved_on': 1665428342000,
 'question_deleted': False,
 'question_title': 'Who were some early/Renaissance philosophers who believed that all human beings were essentially the same, with all concepts of race being pseudoscience?',
 'question_selftext': 'The historical record is filled with examples of race science and people employing racial categorizations to describe people. But underlying all of that has been a scientific truth: from a scientific perspective, every human being is essentially equal. Races do not inherently exist in the human body. Cultures do not inherently exist. Those things are all constructed by whatever society a person happens to live in. The march of science would ultimately provide no more specificity than “Homo sapiens”.\n\nI realized that I don’t really have any examples of early philosophers who believed this. Outside of the American abolition movement, I don’t know of anyone who taught against racial c

In [21]:
# Keep only the first 50 examples.
dataset = dataset.select(range(50))
dataset

Dataset({
    features: ['question_created_utc', 'question_retrieved_on', 'question_deleted', 'question_title', 'question_selftext', 'question_score', 'question_char_length', 'question_selftext_char_length', 'answers', 'graded_output', 'Zephyr-7B-beta', 'Zephyr-History-7600'],
    num_rows: 50
})

In [22]:
# Annotate every example with the GPT-4 judgement (one API call per row).
dataset = dataset.map(function=gpt4_preference)

Map: 100%|██████████| 50/50 [19:53<00:00, 23.88s/ examples]


In [32]:
# Persist the annotated examples, including the judge's raw replies.
preference_set_path = "/scratch1/redditqa/cached_datasets/AskHistorians_test_set_50_model_generated_gpt4_preference_reasoning"
dataset.save_to_disk(preference_set_path)

Saving the dataset (1/1 shards): 100%|██████████| 50/50 [00:00<00:00, 8924.43 examples/s]


In [33]:
# Tally how often each model was preferred ('' means no preference was recorded).
preference_counts = Counter(dataset['gpt4-preference'])
preference_counts

Counter({'Zephyr-7B-beta': 46, '': 2, 'Zephyr-History-7600': 2})