In [21]:
from pathlib import Path

import typer
import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import AsyncOpenAI


# Load environment variables from the .env file before any client is created.
# The call is deliberately kept out of an `assert`: assert statements are
# stripped when Python runs with -O, which would silently skip the load itself.
if not load_dotenv():
    raise RuntimeError("Failed to load .env file")

In [22]:
openai_client = AsyncOpenAI()

models_response = await openai_client.models.list()
available_models = [item.id for item in models_response.data if item.object == "model"]
print("Available models:", available_models)

model = available_models[0]
print("Using model:", model)


Available models: ['Qwen/Qwen2.5-3B-Instruct']
Using model: Qwen/Qwen2.5-3B-Instruct


In [23]:
# Sampling settings forwarded to the model for every rollout.
temperature = 0.5
max_new_tokens = 1024

# Load the "gsm8k" environment with 10 evaluation examples and thinking disabled.
vf_env = vf.load_environment(
    env_id="gsm8k",
    num_eval_examples=10,
    use_think=False,
)
# NOTE(review): this prints len(vf_env.dataset); the recorded run shows 7473 here
# while only 10 rollouts were executed, so this is presumably not the eval split
# size — confirm which dataset the message is meant to describe.
print(f"✅ Environment loaded with {len(vf_env.dataset)} examples")

# Evaluate the selected model: a single rollout per example.
print("🔄 Running evaluation...")
results = vf_env.evaluate(
    openai_client,
    model,
    rollouts_per_example=1,
    sampling_args=dict(temperature=temperature, max_tokens=max_new_tokens),
)


✅ Environment loaded with 7473 examples
🔄 Running evaluation...


Running 10 rollouts: 100%|██████████| 10/10 [00:07<00:00,  1.34it/s]
Evaluating 10 rollouts: 100%|██████████| 10/10 [00:00<00:00, 3682.77it/s]


In [24]:
# Show which metric names the evaluation results expose (the recorded run lists
# the two reward functions: correct_answer_reward_func and format_reward_func).
results.metrics.keys()

dict_keys(['correct_answer_reward_func', 'format_reward_func'])

In [25]:
# Convert the evaluation results into a dataset via the environment's helper.
ds = vf_env.make_dataset(results)
# Bare expression so the notebook displays the dataset's repr (features and row count).
ds

Dataset({
    features: ['prompt', 'completion', 'answer', 'task', 'reward', 'correct_answer_reward_func', 'format_reward_func'],
    num_rows: 10
})

In [26]:
# Persist the per-rollout results as JSON Lines (orient="records", lines=True).
# Create the output directory first so the export does not fail on a fresh
# checkout where ../outputs does not exist yet.
output_path = Path("../outputs/gsm8k-eval-results.jsonl")
output_path.parent.mkdir(parents=True, exist_ok=True)
# Kept as the last expression so the notebook still displays the call's return
# value (an integer in the recorded run; presumably the number of bytes written).
ds.to_json(str(output_path), orient="records", lines=True)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

16422

In [27]:
# Convert the results dataset to a pandas DataFrame for tabular inspection.
df = ds.to_pandas()
# Bare expression so the notebook renders the full table (10 rows in the recorded run).
df

Unnamed: 0,prompt,completion,answer,task,reward,correct_answer_reward_func,format_reward_func
0,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how much Janet make...,18,default,1.2,1.0,1.0
1,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine the total number of...,3,default,1.2,1.0,1.0
2,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine Josh's profit from ...,70000,default,1.2,1.0,1.0
3,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine the total number of...,540,default,1.2,1.0,1.0
4,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how much feed Wendi...,20,default,1.2,1.0,1.0
5,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how much Kylar need...,64,default,1.2,1.0,1.0
6,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine the total number of...,260,default,1.2,1.0,1.0
7,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine the total time it t...,160,default,0.2,0.0,1.0
8,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine how far John is fro...,45,default,0.2,0.0,1.0
9,"[{'content': 'Please reason step by step, and ...",[{'content': 'To determine Eliza's total earni...,460,default,1.2,1.0,1.0


In [28]:
# Summary statistics for the DataFrame; in the recorded run this covers the
# combined reward and the two per-function reward columns.
df.describe()

Unnamed: 0,reward,correct_answer_reward_func,format_reward_func
count,10.0,10.0,10.0
mean,1.0,0.8,1.0
std,0.421637,0.421637,0.0
min,0.2,0.0,1.0
25%,1.2,1.0,1.0
50%,1.2,1.0,1.0
75%,1.2,1.0,1.0
max,1.2,1.0,1.0


In [29]:
# Optional inspection aid (intentionally left disabled): uncomment to print the
# message contents of the first row's prompt followed by its completion.
# row = df.iloc[0]
# for msg in row['prompt']:
#     print(msg['content'])
# for msg in row['completion']:
#     print(msg['content'])