In [1]:
import json
import random
from collections import Counter, defaultdict

# %%
from dataclasses import dataclass, field
from typing import Optional

import huggingface_hub
from openai import OpenAI
import torch
from accelerate import Accelerator
from datasets import load_dataset
import datasets as ds

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the cached AskHistorians test set from disk; the trailing bare
# expression displays its summary (features and row count) as the cell output.
dataset = ds.load_from_disk(
    dataset_path="/scratch1/redditqa/cached_datasets/AskHistorians_test_set_100_model_generated"
)
dataset

Dataset({
    features: ['question_created_utc', 'question_retrieved_on', 'question_deleted', 'question_title', 'question_selftext', 'question_score', 'question_char_length', 'question_selftext_char_length', 'answers', 'graded_output', 'Zephyr-7B-beta', 'Zephyr-History-7600'],
    num_rows: 100
})

In [3]:
# Names of the two models being compared. gpt4_preference uses them both as
# dataset column keys (to fetch each answer) and as labels in its output.
model_a, model_b = 'Zephyr-7B-beta', 'Zephyr-History-7600'

In [4]:
def get_model_order(models):
    """Pick two distinct entries of `models` and return them as a list in random order.

    Uses the module-level `random` generator, so the result is only
    reproducible if the caller seeds `random` beforehand.
    """
    n_slots = 2  # the caller unpacks the result into exactly two names
    shuffled_pair = random.sample(models, n_slots)
    return shuffled_pair

In [5]:
# System message sent with every judging request (see gpt4_preference).
# The trailing space is part of the original prompt and is kept as-is.
system_prompt = (
    "You are a seasoned historian tasked with evaluating responses to historical questions. "
    "Consider the following question and assess which of the two provided answers presents "
    "the most accurate and comprehensive information. "
)

In [6]:
PROMPT_TEMPLATE = """Question: %QUESTION

Answer 1: %ANSWER1

Answer 2: %ANSWER2

Output your final verdict by strictly following this format: "[[1]]" if answer 1 is better, "[[2]]"
if answer 2 is better."""

In [7]:
# Do not write the API key into the notebook. With no `api_key` argument the
# client reads it from the OPENAI_API_KEY environment variable and raises at
# construction time if it is missing, so a misconfigured run fails fast here
# rather than on the first request.
client = OpenAI()

In [8]:
def gpt4_preference(ds_item):
    """Ask GPT-4 which of the two models' answers to one question is better.

    The answers stored under `model_a` and `model_b` are placed into the
    "Answer 1" / "Answer 2" slots of PROMPT_TEMPLATE in random order (via
    `get_model_order`) -- presumably to counter position bias in the judge.

    Args:
        ds_item: A dict-like dataset row with a 'question_title' entry and one
            entry per model name containing that model's answer text.

    Returns:
        The same `ds_item`, extended with:
        - 'model-order': which model was shown as Answer 1 and as Answer 2,
        - 'raw-gpt4-answer': the judge's reply text ("" if the reply had no text),
        - 'gpt4-preference': the preferred model's name, or "" when the reply
          mentions both "1" and "2" or neither of them.
    """
    model1, model2 = get_model_order([model_a, model_b])
    user_prompt = PROMPT_TEMPLATE.replace("%QUESTION", ds_item['question_title'])
    user_prompt = user_prompt.replace("%ANSWER1", ds_item[model1])
    user_prompt = user_prompt.replace("%ANSWER2", ds_item[model2])

    response = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.0,
        # The requested verdict ("[[1]]" / "[[2]]") is short, so the reply is
        # capped at a few tokens and cut at the first newline.
        max_tokens=5,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        stop=["\n"],
    )
    # `message.content` is nullable in the API response; normalise a missing
    # reply to "" so the membership tests below cannot raise TypeError and
    # abort the whole `map` run.
    answer = response.choices[0].message.content or ""

    ds_item['model-order'] = f"Answer1:{model1};Answer2:{model2}"
    ds_item['raw-gpt4-answer'] = answer

    # Convert to preference: only an unambiguous reply (exactly one of the two
    # digits present) counts as a vote; anything else is recorded as "".
    mentions_1 = "1" in answer
    mentions_2 = "2" in answer
    if mentions_1 and not mentions_2:
        ds_item['gpt4-preference'] = model1
    elif mentions_2 and not mentions_1:
        ds_item['gpt4-preference'] = model2
    else:
        ds_item['gpt4-preference'] = ""

    return ds_item

In [9]:
# Judge every row with GPT-4 (one API request per example). `dataset` is
# rebound to the annotated copy that the following cells save and summarise.
dataset = dataset.map(function=gpt4_preference)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map: 100%|██████████| 100/100 [01:30<00:00,  1.10 examples/s]


In [10]:
# Persist the annotated dataset so the GPT-4 verdicts can be reused without
# re-running the judging cell.
dataset.save_to_disk(
    dataset_path="/scratch1/redditqa/cached_datasets/AskHistorians_test_set_100_model_generated_gpt4_preference"
)

Saving the dataset (1/1 shards): 100%|██████████| 100/100 [00:00<00:00, 14270.71 examples/s]


In [11]:
# Tally how often each model was preferred; the "" key counts replies that
# could not be mapped to a single answer.
Counter(dataset["gpt4-preference"])

Counter({'Zephyr-7B-beta': 94, 'Zephyr-History-7600': 5, '': 1})