In [1]:
from pathlib import Path
from datetime import datetime

import typer
import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import AsyncOpenAI


# Fail fast when the .env file could not be loaded.
# An explicit check is used instead of `assert`: assert statements are removed
# when Python runs with -O, which would also silently drop the load_dotenv() call.
if not load_dotenv():
    raise RuntimeError("Failed to load .env file")

In [2]:
openai_client = AsyncOpenAI()

models_response = await openai_client.models.list()
available_models = [item.id for item in models_response.data if item.object == "model"]
print("Available models:", available_models)

model = available_models[0]
print("Using model:", model)


Available models: ['Qwen/Qwen2.5-7B-Instruct']
Using model: Qwen/Qwen2.5-7B-Instruct


In [3]:
# Sampling settings forwarded to the model for every rollout
temperature = 0.5
max_new_tokens = 1024

# Build the math-python environment, requesting 100 train and 100 eval examples
vf_env = vf.load_environment(
    env_id="math-python",
    num_train_examples=100,
    num_eval_examples=100,
)
print(f"✅ Math-Python Environment loaded with {len(vf_env.dataset)} examples")
print(f"📊 Dataset info: {vf_env.dataset}")

# Kick off the evaluation: a single rollout per example with the settings above
print("🔄 Running evaluation on math-python...")
results = vf_env.evaluate(
    openai_client,
    model,
    rollouts_per_example=1,
    sampling_args=dict(temperature=temperature, max_tokens=max_new_tokens),
)

2025-09-08 13:06:24 - verifiers.rubrics.RubricGroup - INFO - Initialized RubricGroup with 2 rubrics
2025-09-08 13:06:24 - verifiers.envs.ToolEnv - INFO - eval_dataset is not set, falling back to train dataset


✅ Math-Python Environment loaded with 100 examples
📊 Dataset info: Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 100
})
🔄 Running evaluation on math-python...


Running 100 rollouts: 100%|██████████| 100/100 [00:20<00:00,  4.99it/s]
Evaluating 100 rollouts: 100%|██████████| 100/100 [00:00<00:00, 3507.09it/s]
Evaluating 100 rollouts: 100%|██████████| 100/100 [00:00<00:00, 4371.89it/s]


In [4]:
# Turn the evaluation results into a dataset object using the environment's helper.
ds = vf_env.make_dataset(results)
ds  # bare last expression so the notebook shows the dataset summary

Dataset({
    features: ['prompt', 'completion', 'answer', 'task', 'reward', 'info', 'total_tool_calls', 'python_calls', 'correct_answer_reward_func'],
    num_rows: 100
})

In [5]:
# pandas view of the same results, used below for summary statistics and filtering.
df = ds.to_pandas()
df  # bare last expression so the notebook renders the frame

Unnamed: 0,prompt,completion,answer,task,reward,info,total_tool_calls,python_calls,correct_answer_reward_func
0,[{'content': 'Use python for all calculations ...,[{'content': 'Let's denote the number of blue ...,24,default,1.0,{'oai_tools': [{'function': {'description': 'E...,1.0,1.0,1.0
1,[{'content': 'Use python for all calculations ...,"[{'content': 'To solve this problem, we need t...",50,default,0.0,{'oai_tools': [{'function': {'description': 'E...,0.0,0.0,0.0
2,[{'content': 'Use python for all calculations ...,"[{'content': 'To solve this problem, we need t...",5,default,1.0,{'oai_tools': [{'function': {'description': 'E...,1.0,1.0,1.0
3,[{'content': 'Use python for all calculations ...,"[{'content': 'To solve this problem, we can se...",6,default,0.0,{'oai_tools': [{'function': {'description': 'E...,0.0,0.0,0.0
4,[{'content': 'Use python for all calculations ...,"[{'content': '<tool_call> {""name"": ""python"", ""...",23,default,0.0,{'oai_tools': [{'function': {'description': 'E...,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
95,[{'content': 'Use python for all calculations ...,"[{'content': '', 'role': 'assistant', 'tool_ca...",50000,default,1.0,{'oai_tools': [{'function': {'description': 'E...,3.0,3.0,1.0
96,[{'content': 'Use python for all calculations ...,"[{'content': 'To solve this problem, we can us...",0,default,1.0,{'oai_tools': [{'function': {'description': 'E...,0.0,0.0,1.0
97,[{'content': 'Use python for all calculations ...,[{'content': 'To determine the domain of the f...,"(625, \infty)",default,1.0,{'oai_tools': [{'function': {'description': 'E...,0.0,0.0,1.0
98,[{'content': 'Use python for all calculations ...,"[{'content': 'To solve this problem, we need t...",\frac{27}{4},default,0.0,{'oai_tools': [{'function': {'description': 'E...,0.0,0.0,0.0


In [6]:
# Summary statistics (count/mean/std/quantiles) for the numeric columns of the results.
df.describe()

Unnamed: 0,reward,total_tool_calls,python_calls,correct_answer_reward_func
count,100.0,100.0,100.0,100.0
mean,0.22,0.27,0.27,0.22
std,0.416333,0.633333,0.633333,0.416333
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,1.0,3.0,3.0,1.0


In [7]:
# One timestamp shared by both artifacts so the results and metrics files pair up.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Make sure the output directory exists before writing: on a fresh checkout
# ../outputs may be missing, and the writes below would then fail.
output_dir = Path("../outputs")
output_dir.mkdir(parents=True, exist_ok=True)
results_path = output_dir / f"math-python-results-{timestamp}.jsonl"
metrics_path = output_dir / f"math-python-metrics-{timestamp}.json"

# Save results for math-python: per-rollout records as JSON lines, and the
# describe() summary of the numeric columns as a single JSON document.
ds.to_json(str(results_path), orient="records", lines=True)
df.describe().to_json(str(metrics_path), indent=2)

print("📁 Results saved:")
print(f"   - Results: {results_path.name}")
print(f"   - Metrics: {metrics_path.name}")

print("\n📊 Performance Summary:")
print(f"   - Mean reward: {df['reward'].mean():.3f}")
print(f"   - Reward std: {df['reward'].std():.3f}")
# Success is counted as any strictly positive reward.
print(f"   - Success rate: {(df['reward'] > 0).mean():.1%}")
print(f"   - Available metrics: {list(results.metrics.keys())}")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

📁 Results saved:
   - Results: math-python-results-20250908_130644.jsonl
   - Metrics: math-python-metrics-20250908_130644.json

📊 Performance Summary:
   - Mean reward: 0.220
   - Reward std: 0.416
   - Success rate: 22.0%
   - Available metrics: ['total_tool_calls', 'python_calls', 'correct_answer_reward_func']


In [8]:
import json

def format_message(message):
    """Render one chat message as ``"<role>: <content>"`` text.

    Any tool calls attached to the message are appended directly after the
    content, in order, with no separator.

    Args:
        message: Mapping with a ``'role'`` key and a ``'content'`` key, and
            optionally a ``'tool_calls'`` entry holding a sequence of tool calls.
            Each tool call may be a string (appended verbatim) or any other
            object (serialized with ``json.dumps``).

    Returns:
        The formatted string.

    Raises:
        KeyError: If ``'role'`` or ``'content'`` is missing from ``message``.
    """
    role = message['role']
    content = message['content']
    if content is None:
        # Messages that only carry tool calls may have no text content.
        content = ""
    elif not isinstance(content, str):
        # Same text an f-string would produce, but safe to concatenate onto.
        content = str(content)

    tool_calls = message.get('tool_calls')
    # Explicit None/length checks instead of truthiness: array-like containers
    # can raise on bool() when they hold more than one element.
    if tool_calls is not None and len(tool_calls) > 0:
        content += "".join(
            call if isinstance(call, str) else json.dumps(call, default=str)
            for call in tool_calls
        )
    return f"{role}: {content}"

def format_trajectory(*messages):
    """Render a sequence of chat messages as a single transcript string.

    Each positional argument is formatted with ``format_message``; the rendered
    messages are joined in order with a blank line between them.
    """
    rendered = map(format_message, messages)
    return "\n\n".join(rendered)


In [9]:
# Inspect a few successful rollouts in full
print("🔍 Sample completions:")

# Keep only rows with a strictly positive reward and walk through at most three
_df = df[df['reward'] > 0]
for i in range(len(_df.head(3))):
    row = _df.iloc[i]
    print(f"\n--- Example {i+1} (Reward: {row['reward']}) ---")

    # Full conversation: prompt messages followed by the completion messages
    print("================== Trajectory =======================")
    print(format_trajectory(*row['prompt'], *row['completion']))
    print("=====================================================")

    print(f"✅ Answer: {row['answer']}")
    print(f"📊 Metrics: reward={row['reward']}")

    # Per-metric values, limited to metrics that exist as columns on this row
    for metric in results.metrics.keys():
        if metric not in row:
            continue
        print(f"   - {metric}: {row[metric]}")
    print("-" * 80)

🔍 Sample completions:

--- Example 1 (Reward: 1.0) ---
system: Use python for all calculations (variables do not persist). Give your answer inside \boxed{}.

user: I have a bag with blue marbles and yellow marbles in it. At the moment, the ratio of blue marbles to yellow marbles is 8:5. If I remove 12 blue marbles and add 21 yellow marbles, the ratio will be 1:3. How many blue marbles were in the bag before I removed some?

assistant: Let's denote the number of blue marbles as \(B\) and the number of yellow marbles as \(Y\).

Given that initially the ratio of blue to yellow marbles is 8:5, we can write this as:
\[ \frac{B}{Y} = \frac{8}{5} \]
This implies:
\[ B = \frac{8}{5}Y \]

After removing 12 blue marbles and adding 21 yellow marbles, the new ratio becomes 1:3. So, we have:
\[ \frac{B - 12}{Y + 21} = \frac{1}{3} \]
This implies:
\[ 3(B - 12) = Y + 21 \]

We now have two equations:
1. \( B = \frac{8}{5}Y \)
2. \( 3(B - 12) = Y + 21 \)

Let's solve these equations using Python.
{"id