In [1]:
from factuality_evaluator_rs import UnilateralFactualityEvaluator, BilateralFactualityEvaluator
from tqdm import tqdm
import json, os, pandas as pd

In [3]:
# Build a balanced true/false evaluation set from the GPQA main CSV:
# the first half of the rows is paired with its correct answer (label "t"),
# the second half with its first incorrect answer (label "f").
RANDOM_SEED = 9931

gpqa_raw = pd.read_csv("data/gpqa/gpqa_main.csv")
half_size = len(gpqa_raw) // 2

# Each frame is produced by a non-mutating chain (no inplace rename, no column
# assignment on a slice), so the cell raises no SettingWithCopyWarning and can
# be re-run safely because `gpqa_raw` is never modified.
df_pos = (
    gpqa_raw[["Question", "Correct Answer", "High-level domain", "Subdomain"]]
    .rename(columns={"Question": "problem", "Correct Answer": "answer", "High-level domain": "domain", "Subdomain": "subdomain"})
    .iloc[:half_size]
    .assign(label="t")
)
df_neg = (
    gpqa_raw[["Question", "Incorrect Answer 1", "High-level domain", "Subdomain"]]
    .rename(columns={"Question": "problem", "Incorrect Answer 1": "answer", "High-level domain": "domain", "Subdomain": "subdomain"})
    .iloc[half_size:]
    .assign(label="f")
)

# Shuffle deterministically so "t" and "f" rows are interleaved, then renumber.
df = (
    pd.concat([df_pos, df_neg])
    .sample(frac=1, random_state=RANDOM_SEED)
    .reset_index(drop=True)
)

In [4]:
# Run-level settings.
EXPERIMENTAL_RUN_VERSION = "v14"  # names the experiments/<version>/ output folder
N_SAMPLES = 3                     # forwarded as `samples=` to every evaluator call

# One dict per shuffled row, in the order the evaluators will consume them.
GPQA_DATASET = df.to_dict(orient="records")
DATASET_SIZE = len(GPQA_DATASET)

In [5]:
# (model_name, batch_size) for every model included in this run.
# Commented-out entries are currently disabled.
_MODEL_SPECS = [
    ("nf-gpt-4o-mini", 100),
    ("nf-gpt-4o", 100),
    ("google/gemma-2-27b-it", 1),
    ("microsoft/phi-4", 1),
    ("google/gemini-2.0-flash-001", 1),
    ("nf-Llama-3.1-8b-instruct", 1),
    ("nf-Llama-3.1-70b-instruct", 1),
    ("claude-3-5-haiku-20241022", 1),
    ("claude-3-5-sonnet-20241022", 1),
    ("o3-mini", 1),
    # ("mistralai/Mistral-7B-Instruct-v0.3", 1),
    # ("mistralai/Mixtral-8x7B-Instruct-v0.1", 1),
    # ("deepseek/deepseek-r1-distill-llama-8b", 1),
]

# Expand to the dict form the driver loop reads; temperature is None for all models.
MODELS = [
    {"model_name": name, "temperature": None, "batch_size": batch_size}
    for name, batch_size in _MODEL_SPECS
]

In [6]:
# Ensure both output directories exist for this run version.
# exist_ok=True makes this idempotent and also repairs a partially created tree
# (e.g. the version folder exists but one of the subfolders is missing).
for _mode_dir in ("unilateral", "bilateral"):
    os.makedirs(f'experiments/{EXPERIMENTAL_RUN_VERSION}/{_mode_dir}', exist_ok=True)

In [7]:
def generate_results(model, mode, filename, dataset, samples):
    """Evaluate `dataset` with `model`, checkpointing results to `filename`.

    If `filename` already exists, its JSON list is loaded and the run resumes
    at the first datapoint that has no stored result. After every datapoint
    the full result list is written back to disk.

    Args:
        model: evaluator exposing `model_name` and `invoke(datapoint, samples=...)`.
        mode: short tag shown next to the model name in the progress bar.
        filename: path of the JSON checkpoint file.
        dataset: list of datapoints; results are stored in the same order.
        samples: forwarded to `model.invoke` as `samples`.

    Raises:
        json.JSONDecodeError: if an existing checkpoint file is not valid JSON.
    """
    results = []
    if os.path.isfile(filename):
        with open(filename, "r") as checkpoint:
            results = json.load(checkpoint)

    done = len(results)
    tmp_filename = f"{filename}.tmp"
    progress = tqdm(dataset[done:], desc=f'{model.model_name:36} {mode}', initial=done, total=len(dataset))
    for datapoint in progress:
        results.append(model.invoke(datapoint, samples=samples))
        # Write to a sibling temp file and swap it in, so an interrupt during
        # the dump cannot leave a truncated checkpoint behind.
        with open(tmp_filename, "w") as out:
            json.dump(results, out)
        os.replace(tmp_filename, filename)

In [8]:
# For every configured model, run the unilateral evaluator and then the
# bilateral one; each writes to its own checkpoint file under the run folder.
# NOTE(review): files are suffixed "-simpleqa" although the dataset passed in is
# GPQA_DATASET — confirm whether the suffix is intentional before renaming.
for model in MODELS:
    full_name = model["model_name"]
    short_name = full_name.split("/")[-1]  # drop any "org/" prefix for the file name
    for evaluator_cls, tag, subdir in (
        (UnilateralFactualityEvaluator, "(UNI)", "unilateral"),
        (BilateralFactualityEvaluator, "(BIL)", "bilateral"),
    ):
        evaluator = evaluator_cls(
            full_name,
            temperature=model["temperature"],
            batch_size=model["batch_size"],
        )
        output_path = f'experiments/{EXPERIMENTAL_RUN_VERSION}/{subdir}/{short_name}-simpleqa.json'
        generate_results(evaluator, tag, output_path, GPQA_DATASET, N_SAMPLES)

nf-gpt-4o-mini                       (UNI):  11%|█         | 49/448 [19:44<2:40:48, 24.18s/it]


KeyboardInterrupt: 