In [1]:
from factuality_evaluator_rs import UnilateralFactualityEvaluator, BilateralFactualityEvaluator
import json, os, pandas as pd

In [2]:
# Assemble the labelled evaluation frame `df`:
#   * rows [0, half_size) of the SimpleQA test set, labelled "t"
#   * rows [half_size, end) of the wrong-answer file, with `wrong_answer_1`
#     exposed as `answer`, labelled "f"
# The two parts are concatenated, shuffled with a fixed seed, and re-indexed.
# NOTE(review): taking disjoint row ranges from the two files presumably relies
# on both CSVs listing the same problems in the same order - confirm.
RANDOM_SEED = 9931  # fixed so the shuffle below is reproducible

df_pos = pd.read_csv("data/simple_qa_test_set.csv")
df_neg = (
    pd.read_csv("data/synthetic_dataset_with_wrong_answers.csv")
    [["metadata", "problem", "wrong_answer_1"]]
    .rename(columns={"wrong_answer_1": "answer"})
)

half_size = len(df_pos) // 2

# `.assign` returns a new frame, so no column is ever written onto an
# `.iloc` slice (which raises SettingWithCopyWarning without copy-on-write).
df_pos = df_pos.iloc[:half_size].assign(label="t")
df_neg = df_neg.iloc[half_size:].assign(label="f")

df = (
    pd.concat([df_pos, df_neg])
    .sample(frac=1, random_state=RANDOM_SEED)  # frac=1 -> full shuffle
    .reset_index(drop=True)
)

In [3]:
# Run configuration.
# EXPERIMENTAL_RUN_VERSION names the results folder used under `experiments/`.
EXPERIMENTAL_RUN_VERSION = "v9"

# Evaluate at most the first 1000 rows of the shuffled frame, as a list of
# per-row dicts keyed by column name.
SIMPLE_QA_DATASET = df.iloc[:1000].to_dict(orient="records")
DATASET_SIZE = len(SIMPLE_QA_DATASET)

In [4]:
# Models to evaluate. Each row is (model_name, temperature, batch_size) and is
# expanded into a dict with exactly those keys, in that order.
# NOTE(review): o3-mini carries temperature=None, presumably because no
# explicit temperature should be sent for it - confirm against the evaluator.
MODELS = [
    dict(model_name=name, temperature=temperature, batch_size=batch_size)
    for name, temperature, batch_size in (
        ("gpt-4o-mini", 0.1, 100),
        ("gpt-4o-2024-11-20", 0.1, 100),
        ("o3-mini", None, 100),
        ("mistralai/Mistral-7B-Instruct-v0.3", 0.1, 50),
        ("mistralai/Mixtral-8x7B-Instruct-v0.1", 0.1, 50),
        ("claude-3-5-haiku-20241022", 0.1, 1),
        # Currently disabled:
        # ("claude-3-5-sonnet-20241022", 0.1, 1),
        # ("meta-llama/Llama-3.3-70B-Instruct", 0.1, 50),
    )
]

In [5]:
# Make sure both result directories for this run exist.
# `exist_ok=True` makes each call a no-op when the directory is already there,
# and, unlike a single check on the parent folder, still creates a missing
# subdirectory when `experiments/<version>` itself already exists.
os.makedirs(f'experiments/{EXPERIMENTAL_RUN_VERSION}/unilateral', exist_ok=True)
os.makedirs(f'experiments/{EXPERIMENTAL_RUN_VERSION}/bilateral', exist_ok=True)

In [6]:
# Run the unilateral evaluator once per configured model and store its results
# as JSON. A model whose results file already exists is reported and skipped,
# so re-running the cell only computes what is missing.
for model in MODELS:
    # For names containing "/" (e.g. "mistralai/..."), only the part after the
    # last "/" is used in the file name.
    filename = f'experiments/{EXPERIMENTAL_RUN_VERSION}/unilateral/{model["model_name"].split("/")[-1]}-simpleqa.json'
    if os.path.isfile(filename):
        print(f'{model["model_name"]:36}: EXISTS')
        continue

    # Create the target folder before the evaluation starts, so a missing
    # directory cannot cost a finished run at write time.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    evaluator = UnilateralFactualityEvaluator(model["model_name"], temperature=model["temperature"], batch_size=model["batch_size"])
    results = evaluator.batch(SIMPLE_QA_DATASET)

    # The context manager closes (and therefore flushes) the file; the bare
    # `open(...)` passed to json.dump previously was never closed.
    with open(filename, "w") as results_file:
        json.dump(results, results_file)

gpt-4o-mini                         : EXISTS
gpt-4o-2024-11-20                   : EXISTS
o3-mini                             : EXISTS
mistralai/Mistral-7B-Instruct-v0.3  : EXISTS
mistralai/Mixtral-8x7B-Instruct-v0.1: EXISTS
claude-3-5-haiku-20241022           : EXISTS


In [7]:
# Run the bilateral evaluator once per configured model and store its results
# as JSON. A model whose results file already exists is reported and skipped,
# so re-running the cell only computes what is missing.
for model in MODELS:
    # For names containing "/" (e.g. "mistralai/..."), only the part after the
    # last "/" is used in the file name.
    filename = f'experiments/{EXPERIMENTAL_RUN_VERSION}/bilateral/{model["model_name"].split("/")[-1]}-simpleqa.json'
    if os.path.isfile(filename):
        print(f'{model["model_name"]:36}: EXISTS')
        continue

    # Create the target folder before the evaluation starts, so a missing
    # directory cannot cost a finished run at write time.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    evaluator = BilateralFactualityEvaluator(model["model_name"], temperature=model["temperature"], batch_size=model["batch_size"])
    results = evaluator.batch(SIMPLE_QA_DATASET)

    # The context manager closes (and therefore flushes) the file; the bare
    # `open(...)` passed to json.dump previously was never closed.
    with open(filename, "w") as results_file:
        json.dump(results, results_file)

gpt-4o-mini                         : EXISTS
gpt-4o-2024-11-20                   : EXISTS


o3-mini                             : 100%|██████████| 10/10 [28:06<00:00, 168.68s/it]

mistralai/Mistral-7B-Instruct-v0.3  : EXISTS
mistralai/Mixtral-8x7B-Instruct-v0.1: EXISTS
claude-3-5-haiku-20241022           : EXISTS



