In [1]:
from pathlib import Path
from datetime import datetime

import typer
import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import AsyncOpenAI


# Load variables from the .env file before any client is created.
# An explicit check is used instead of `assert` because assert statements are
# stripped when Python runs with -O / PYTHONOPTIMIZE, which would silently skip
# the load_dotenv() call itself.
if not load_dotenv():
    raise RuntimeError("Failed to load .env file")

In [2]:
openai_client = AsyncOpenAI()

models_response = await openai_client.models.list()
available_models = [item.id for item in models_response.data if item.object == "model"]
print("Available models:", available_models)

model = available_models[0]
print("Using model:", model)


Available models: ['Qwen/Qwen2.5-3B-Instruct']
Using model: Qwen/Qwen2.5-3B-Instruct


In [3]:
# Sampling settings forwarded to the model for every rollout.
temperature = 0.5
max_new_tokens = 1024

# Load the GSM8K environment, requesting 100 evaluation examples.
vf_env = vf.load_environment(
    env_id="gsm8k",
    num_eval_examples=100,
    use_think=False,
)
# NOTE(review): the recorded run printed 7473 here while 100 rollouts were
# evaluated, so `vf_env.dataset` is presumably not the evaluation split — confirm.
print(f"✅ Environment loaded with {len(vf_env.dataset)} examples")

# Run evaluation using the environment: one rollout per example.
print("🔄 Running evaluation...")
results = vf_env.evaluate(
    openai_client,
    model,
    rollouts_per_example=1,
    sampling_args=dict(temperature=temperature, max_tokens=max_new_tokens),
)


✅ Environment loaded with 7473 examples
🔄 Running evaluation...


Running 100 rollouts: 100%|██████████| 100/100 [00:14<00:00,  6.84it/s]
Evaluating 100 rollouts: 100%|██████████| 100/100 [00:00<00:00, 3500.53it/s]


In [4]:
# Show the names of the metrics recorded in the evaluation results.
results.metrics.keys()

dict_keys(['correct_answer_reward_func', 'format_reward_func'])

In [5]:
# Convert the evaluation results into a dataset object via the environment,
# then display its summary (features and row count).
ds = vf_env.make_dataset(results)
ds

Dataset({
    features: ['prompt', 'completion', 'answer', 'task', 'reward', 'correct_answer_reward_func', 'format_reward_func'],
    num_rows: 100
})

In [6]:
# Convert the dataset to a pandas DataFrame for inspection and display it.
df = ds.to_pandas()
df

Unnamed: 0,prompt,completion,answer,task,reward,correct_answer_reward_func,format_reward_func
0,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how much Janet make...,18,default,1.0,1.0,1.0
1,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine the total number of...,3,default,1.0,1.0,1.0
2,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine Josh's profit from ...,70000,default,1.0,1.0,1.0
3,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine the total number of...,540,default,1.0,1.0,1.0
4,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how much feed Wendi...,20,default,1.0,1.0,1.0
...,...,...,...,...,...,...,...
95,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how many girls are ...,40,default,1.0,1.0,1.0
96,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how many more hours...,3,default,1.0,1.0,1.0
97,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how many tomatoes F...,12,default,1.0,1.0,1.0
98,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how many cars drove...,5,default,0.0,0.0,1.0


In [7]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns,
# i.e. the overall reward and the per-function reward scores.
df.describe()

Unnamed: 0,reward,correct_answer_reward_func,format_reward_func
count,100.0,100.0,100.0
mean,0.87,0.87,1.0
std,0.337998,0.337998,0.0
min,0.0,0.0,1.0
25%,1.0,1.0,1.0
50%,1.0,1.0,1.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [8]:
# Persist the per-example results and the summary statistics, tagging both
# files with the same timestamp so they can be matched up later.
output_dir = Path("../outputs")
# Create the target directory if needed so the export does not fail on a
# fresh checkout where ../outputs does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
ds.to_json(f"../outputs/gsm8k-eval-results-{timestamp}.jsonl", orient="records", lines=True)
df.describe().to_json(f"../outputs/gsm8k-eval-metrics-{timestamp}.json", indent=2)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [9]:
# Optional debugging aid (disabled): uncomment to print the content of every
# prompt message and every completion message of the first row in `df`.
# row = df.iloc[0]
# for msg in row['prompt']:
#     print(msg['content'])
# for msg in row['completion']:
#     print(msg['content'])