In [1]:
from factuality_evaluator import UnilateralFactualityEvaluator, BilateralFactualityEvaluator
import pandas, json, os

In [2]:
# Fetch the SimpleQA test-set CSV from the public simple-evals blob storage
# and parse it into a DataFrame.
df = pandas.read_csv(
    filepath_or_buffer="https://openaipublic.blob.core.windows.net/simple-evals/simple_qa_test_set.csv",
)

In [3]:
# Keep only the first 1000 rows, each converted to a {column: value} dict.
SIMPLE_QA_DATASET = df.head(1000).to_dict(orient="records")

In [4]:
# Models to evaluate, listed as (model_name, batch_size) pairs and expanded into
# the {"model_name": ..., "batch_size": ...} dicts the evaluation loops read.
# Commented-out pairs are models that are currently disabled.
MODELS = [
    {"model_name": name, "batch_size": size}
    for name, size in (
        ("gpt-4o-mini", 100),
        ("gpt-4o-2024-11-20", 100),
        # ("gpt-4-0125-preview", 50),
        ("mistralai/Mistral-7B-Instruct-v0.3", 50),
        # ("claude-3-5-sonnet-20240620", 1),
        ("mistralai/Mixtral-8x7B-Instruct-v0.1", 50),
        # ("claude-3-opus-20240229", 1),
        # ("meta-llama/Meta-Llama-3-70B-Instruct", 50),
        # ("claude-3-haiku-20240307", 1),
    )
]

In [6]:
# Run the unilateral evaluator for every configured model and cache each
# model's results as JSON. A model whose results file already exists is
# skipped, so the cell can be re-run without repeating finished evaluations.
for model in MODELS:
    # Only the part after the last "/" of the model name is used in the file name.
    filename = f'experiments/unilateral/{model["model_name"].split("/")[-1]}-simpleqa.json'
    if os.path.isfile(filename):
        print(f'{model["model_name"]:36}: EXISTS')
        continue

    # Create the output directory before the long-running evaluation, so a
    # missing folder cannot discard finished results at write time.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    evaluator = UnilateralFactualityEvaluator(model["model_name"], batch_size=model["batch_size"])
    results = evaluator.batch(SIMPLE_QA_DATASET)

    # Write to a temporary file and rename it into place: a dump that fails
    # midway then never leaves a partial file that the isfile() check above
    # would mistake for a completed run. The `with` block closes the handle.
    partial_filename = filename + ".tmp"
    with open(partial_filename, "w") as results_file:
        json.dump(results, results_file)
    os.replace(partial_filename, filename)

gpt-4o-mini                         : 100%|██████████| 10/10 [09:45<00:00, 58.59s/it]
gpt-4o-2024-11-20                   : 100%|██████████| 10/10 [06:50<00:00, 41.10s/it]
mistralai/Mistral-7B-Instruct-v0.3  : 100%|██████████| 1000/1000 [38:17<00:00,  2.30s/it]
mistralai/Mixtral-8x7B-Instruct-v0.1: 100%|██████████| 20/20 [52:52<00:00, 158.62s/it]


In [7]:
# Run the bilateral evaluator for every configured model and cache each
# model's results as JSON. A model whose results file already exists is
# skipped, so the cell can be re-run without repeating finished evaluations.
for model in MODELS:
    # Only the part after the last "/" of the model name is used in the file name.
    filename = f'experiments/bilateral/{model["model_name"].split("/")[-1]}-simpleqa.json'
    if os.path.isfile(filename):
        print(f'{model["model_name"]:36}: EXISTS')
        continue

    # Create the output directory before the long-running evaluation, so a
    # missing folder cannot discard finished results at write time.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    evaluator = BilateralFactualityEvaluator(model["model_name"], batch_size=model["batch_size"])
    results = evaluator.batch(SIMPLE_QA_DATASET)

    # Write to a temporary file and rename it into place: a dump that fails
    # midway then never leaves a partial file that the isfile() check above
    # would mistake for a completed run. The `with` block closes the handle.
    partial_filename = filename + ".tmp"
    with open(partial_filename, "w") as results_file:
        json.dump(results, results_file)
    os.replace(partial_filename, filename)

gpt-4o-mini                         : 100%|██████████| 10/10 [13:02<00:00, 78.25s/it]
gpt-4o-2024-11-20                   : 100%|██████████| 10/10 [14:18<00:00, 85.90s/it]
mistralai/Mistral-7B-Instruct-v0.3  : 100%|██████████| 1000/1000 [1:21:57<00:00,  4.92s/it]
mistralai/Mixtral-8x7B-Instruct-v0.1: 100%|██████████| 20/20 [2:00:26<00:00, 361.32s/it]
