In [1]:
from pathlib import Path

import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import AsyncOpenAI


# Load variables from the .env file and stop early if that did not work.
# This is an explicit check rather than an `assert`: assert statements are
# removed when Python runs with -O, which would silently skip the
# load_dotenv() call itself and leave the environment unconfigured.
if not load_dotenv():
    raise RuntimeError("Failed to load .env file")

In [2]:
from rlvr.utils.llm import get_default_model

# Evaluation settings. They stay available as individual names because the
# cells below use them directly, and are also gathered into `params` so the
# whole configuration can be displayed here and saved alongside the scores.
env_id = "vf-musique"
datasets_str = "bdsaglam/musique-mini,answerable,validation"
noise_rate = 0.0
retriever = "hybrid"
model = get_default_model()

params = dict(
    env_id=env_id,
    datasets_str=datasets_str,
    noise_rate=noise_rate,
    retriever=retriever,
    model=model,
    sampling_args=dict(
        max_tokens=1024,  # max new tokens to generate
        temperature=0.5,
        top_p=0.95,
    ),
    max_concurrent=16,
)
# Bare expression so the notebook renders the resolved configuration.
params

{'env_id': 'vf-musique',
 'datasets_str': 'bdsaglam/musique-mini,answerable,validation',
 'noise_rate': 0.0,
 'retriever': 'hybrid',
 'model': 'unsloth/Qwen2.5-7B-Instruct',
 'sampling_args': {'max_tokens': 1024, 'temperature': 0.5, 'top_p': 0.95},
 'max_concurrent': 16}

In [3]:
# Optional MLflow tracking -- currently disabled, so running this cell does
# nothing. Uncommenting the lines below would import mlflow, point it at the
# tracking server URI given here, turn on its OpenAI autologging and select
# the "rlvr-eval" experiment.

# import mlflow

# # Tell MLflow about the server URI.
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
# # Enable autologging with all features
# mlflow.openai.autolog()
# # Create a unique name for your experiment.
# mlflow.set_experiment("rlvr-eval")

In [4]:
# Step 1: build the MuSiQue environment from the settings chosen above.
print("🌍 Loading MuSiQue environment...")
vf_env = vf.load_environment(
    env_id=env_id, datasets_str=datasets_str, noise_rate=noise_rate, retriever=retriever
)
print(f"✅ Environment loaded with {len(vf_env.dataset)} examples")

# Step 2: evaluate `model` through an OpenAI-compatible endpoint (e.g. one
# served by vLLM). The client is created without explicit arguments.
print(
    "🤖 Using OpenAI-compatible API client...",
    "🔄 Running evaluation...",
    sep="\n",
)
openai_client = AsyncOpenAI()

# One rollout per example; sampling settings and the concurrency limit come
# from `params` so they match the configuration recorded for this run.
results = vf_env.evaluate(
    openai_client,
    model,
    rollouts_per_example=1,
    sampling_args=params["sampling_args"],
    max_concurrent=params["max_concurrent"],
)


2025-10-11 15:17:26 - verifiers.utils.env_utils - INFO - Loading environment: vf-musique
2025-10-11 15:17:26 - verifiers.utils.env_utils - INFO - Using provided args: noise_rate=0.0, retriever=hybrid, datasets_str=bdsaglam/musique-mini,answerable,validation
2025-10-11 15:17:26 - verifiers.utils.env_utils - INFO - Using default args: eval_datasets_str=None


🌍 Loading MuSiQue environment...


Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

2025-10-11 15:17:31 - verifiers.utils.env_utils - INFO - Successfully loaded environment 'vf-musique'
2025-10-11 15:17:31 - verifiers.envs.MuSiQueEnv - INFO - eval_dataset is not set, falling back to train dataset


✅ Environment loaded with 300 examples
🤖 Using OpenAI-compatible API client...
🔄 Running evaluation...


Running 300 rollouts (interleaved): 100%|██████████| 300/300 [02:19<00:00,  2.15it/s]


In [5]:
# Convert the evaluation results into a dataset with the environment's own
# make_dataset helper, then into a pandas DataFrame for the analysis below.
# Both names are reused later: `ds` is written to disk, `df` is summarised.
ds = vf_env.make_dataset(results)
df = ds.to_pandas()

In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the reward
# columns; left as the cell's last expression so the table is rendered.
df.describe()

Unnamed: 0,reward,exact_match_reward,f1_reward,retrieval_recall_reward,retrieval_precision_reward,citation_reward,format_reward,combined_reward
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,0.840784,0.283333,0.383068,0.923611,0.963333,0.445278,0.776667,0.840784
std,0.416767,0.45137,0.444234,0.207449,0.188256,0.349112,0.417176,0.416767
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.525,0.0,0.0,1.0,1.0,0.0,1.0,0.525
50%,0.775,0.0,0.0,1.0,1.0,0.5,1.0,0.775
75%,1.0,1.0,1.0,1.0,1.0,0.666667,1.0,1.0
max,2.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [7]:
from datetime import datetime
import json

# Each run is written to its own directory, named after the current local
# time down to the second.
exp_id = format(datetime.now(), "%Y%m%d_%H%M%S")
exp_dir = Path(f"../outputs/musique-eval/{exp_id}")
exp_dir.mkdir(parents=True, exist_ok=True)

# Per-rollout records, one JSON object per line.
results_path = exp_dir / "musique-eval-results.jsonl"
ds.to_json(results_path, orient="records", lines=True)
print(results_path)

# The run configuration together with the describe() statistics of the
# scores, stored as a single indented JSON document.
summary_path = exp_dir / "summary.json"
summary = {"params": params, "scores": df.describe().to_dict()}
summary_path.write_text(json.dumps(summary, indent=2))
print(summary_path)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

../outputs/musique-eval/20251011_151951/musique-eval-results.jsonl
../outputs/musique-eval/20251011_151951/summary.json


In [8]:
# Keep only the rollouts whose answer F1 score is below 0.1, then show how
# many there are.
failed_df = df.loc[df["f1_reward"].lt(0.1)]
len(failed_df)

157