In [1]:
import json
from datetime import datetime
from pathlib import Path

import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import AsyncOpenAI


# Load environment variables from the .env file before anything else runs.
# An explicit check is used instead of `assert` because assert statements are
# removed under `python -O`, which would silently skip the load itself.
if not load_dotenv():
    raise RuntimeError("Failed to load .env file")

In [2]:
from rlvr.utils.llm import get_default_model

# Experiment configuration: which environment and dataset to evaluate, and
# which model to query.
env_id = "vf-musique-structured"
datasets_str = "bdsaglam/musique-mini,answerable,validation"
noise_rate = 0.0
retriever = "hybrid"
model = get_default_model()

# Everything that defines this run, gathered into a single mapping.
# Key order is kept stable so the displayed / serialized form does not change.
params = dict(
    env_id=env_id,
    datasets_str=datasets_str,
    noise_rate=noise_rate,
    retriever=retriever,
    model=model,
    sampling_args=dict(
        temperature=0.5,
        max_tokens=1024,  # max new tokens to generate
        # Optional sampling knobs, currently left unset:
        # top_p=0.95,
        # min_p=None,
    ),
    max_concurrent=16,
)
params

{'env_id': 'vf-musique-structured',
 'datasets_str': 'bdsaglam/musique-mini,answerable,validation',
 'noise_rate': 0.0,
 'retriever': 'hybrid',
 'model': 'Qwen/Qwen2.5-7B-Instruct',
 'sampling_args': {'temperature': 0.5, 'max_tokens': 1024},
 'max_concurrent': 16}

In [3]:
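# Optional: MLflow tracing of the OpenAI calls (disabled).
# Uncomment the lines below to enable it; this expects an MLflow tracking
# server to be reachable at the URI configured here.
#
# import mlflow

# # Point MLflow at the tracking server.
# mlflow.set_tracking_uri("http://127.0.0.1:5000")
# # Enable automatic logging of OpenAI API calls.
# mlflow.openai.autolog()
# # Select the experiment that runs are logged under.
# mlflow.set_experiment("rlvr-eval")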

In [4]:
# Load the MuSiQue environment and run one evaluation pass over it.
#
# Every setting is read from `params` rather than from the loose globals it
# was built from, so the configuration that gets saved with the results is
# exactly the configuration that was evaluated, even if a global was
# reassigned in between.
print("🌍 Loading MuSiQue environment...")

vf_env = vf.load_environment(
    env_id=params["env_id"],
    datasets_str=params["datasets_str"],
    noise_rate=params["noise_rate"],
    retriever=params["retriever"],
)
print(f"✅ Environment loaded with {len(vf_env.dataset)} examples")

# Use OpenAI-compatible API client (e.g., for vLLM).
# NOTE(review): the client is created without arguments, so it is presumably
# configured through environment variables loaded from .env; confirm.
print("🤖 Using OpenAI-compatible API client...")
openai_client = AsyncOpenAI()

# Run evaluation using the environment.
# NOTE: rollouts_per_example is fixed here and is not part of `params`.
print("🔄 Running evaluation...")
results = vf_env.evaluate(
    openai_client,
    params["model"],
    rollouts_per_example=1,
    sampling_args=params["sampling_args"],
    max_concurrent=params["max_concurrent"],
)


2025-09-26 19:04:20 - verifiers.utils.env_utils - INFO - Loading environment: vf-musique-structured
2025-09-26 19:04:20 - verifiers.utils.env_utils - INFO - Using provided args: retriever=hybrid, datasets_str=bdsaglam/musique-mini,answerable,validation, noise_rate=0.0
2025-09-26 19:04:20 - verifiers.utils.env_utils - INFO - Using default args: eval_datasets_str=None


🌍 Loading MuSiQue environment...


Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

2025-09-26 19:04:24 - verifiers.utils.env_utils - INFO - Successfully loaded environment 'vf-musique-structured'
2025-09-26 19:04:24 - verifiers.envs.MuSiQueEnv - INFO - eval_dataset is not set, falling back to train dataset


✅ Environment loaded with 100 examples
🤖 Using OpenAI-compatible API client...
🔄 Running evaluation...


Running 100 rollouts (interleaved): 100%|██████████| 100/100 [00:39<00:00,  2.53it/s]


In [5]:
# Convert the raw evaluation results into a dataset object, then into a pandas
# DataFrame for inspection.
# NOTE(review): appears to yield one row per rollout (100 rows for the 100
# rollouts in the recorded run); confirm against make_dataset.
ds = vf_env.make_dataset(results)
df = ds.to_pandas()

In [6]:
# Summary statistics (count, mean, std, quartiles) of the numeric reward columns.
df.describe()

Unnamed: 0,reward,exact_match_reward,f1_reward,retrieval_recall_reward,retrieval_precision_reward,citation_reward,format_reward,combined_reward
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.642152,0.39,0.538454,0.815,1.0,0.635,0.98,0.642152
std,0.252156,0.490207,0.447515,0.242618,0.0,0.244691,0.140705,0.252156
min,0.225,0.0,0.0,0.5,1.0,0.0,0.0,0.225
25%,0.434375,0.0,0.0,0.5,1.0,0.5,1.0,0.434375
50%,0.6,0.0,0.666667,1.0,1.0,0.5,1.0,0.6
75%,0.925,1.0,1.0,1.0,1.0,1.0,1.0,0.925
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Persist this run's artifacts under a timestamped directory so that separate
# runs are kept apart. The timestamp has one-second resolution, and the path
# is relative to the kernel's working directory.
exp_id = datetime.now().strftime("%Y%m%d_%H%M%S")
exp_dir = Path("../outputs/musique-eval") / exp_id
exp_dir.mkdir(parents=True, exist_ok=True)

# Run configuration: written with an explicit encoding so the file does not
# depend on the platform's default text encoding.
(exp_dir / "params.json").write_text(json.dumps(params, indent=2), encoding="utf-8")
# Per-example records, one JSON object per line.
ds.to_json(exp_dir / "musique-eval-results.jsonl", orient="records", lines=True)
# Aggregate statistics of the reward columns.
df.describe().to_csv(exp_dir / "scores.csv")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

In [8]:
# Rows whose answer F1 is below 0.1 are treated as failures; the count is
# displayed as the cell output.
failed_df = df.loc[df["f1_reward"].lt(0.1)]
len(failed_df)

35