In [1]:
from pathlib import Path

import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import AsyncOpenAI


# Load environment variables from the local .env file and fail loudly if that
# does not succeed. An explicit check replaces the former `assert`: asserts are
# stripped under `python -O`, which would have skipped the load_dotenv() call
# itself (the call lived inside the assert expression), not just the check.
if not load_dotenv():
    raise RuntimeError("Failed to load .env file")

In [2]:
from rlvr.utils.llm import get_default_model

# --- Evaluation configuration ------------------------------------------------
# Environment / data selection (passed to vf.load_environment further down).
env_id = "vf-musique-multi"
datasets_str = "bdsaglam/musique-mini,answerable,validation"
noise_rate = 1.0
retriever = "hybrid"

# Model and sampling settings (passed to the evaluation call further down).
model = get_default_model()
temperature = 0.5
max_new_tokens = 1024

# Snapshot of every setting above, keyed by its variable name, so the run
# configuration can be displayed here and serialized to JSON later.
params = dict(
    env_id=env_id,
    datasets_str=datasets_str,
    noise_rate=noise_rate,
    retriever=retriever,
    model=model,
    temperature=temperature,
    max_new_tokens=max_new_tokens,
)
params

{'env_id': 'vf-musique-multi',
 'datasets_str': 'bdsaglam/musique-mini,answerable,validation',
 'noise_rate': 1.0,
 'retriever': 'hybrid',
 'model': 'Qwen/Qwen2.5-7B-Instruct',
 'temperature': 0.5,
 'max_new_tokens': 1024}

In [3]:
import mlflow

# Point the MLflow client at the tracking server.
tracking_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(tracking_uri)

# Turn on MLflow autologging for the OpenAI integration.
mlflow.openai.autolog()

# Select the experiment that runs are recorded under. Kept as the cell's last
# expression so the returned Experiment object is displayed.
experiment_name = "rlvr-eval"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1756636926792, experiment_id='2', last_update_time=1756636926792, lifecycle_stage='active', name='rlvr-eval', tags={}>

In [None]:
# Load the MuSiQue environment using the settings from the configuration cell.
# NOTE: the status messages below previously contained mojibake (the UTF-8
# bytes of the emoji decoded with the wrong codec); they are restored to the
# intended emoji so the printed progress is readable.
print("🌍 Loading MuSiQue environment...")

vf_env = vf.load_environment(
    env_id=env_id,
    datasets_str=datasets_str,
    noise_rate=noise_rate,
    retriever=retriever,
)
print(f"✅ Environment loaded with {len(vf_env.dataset)} examples")

# Use OpenAI-compatible API client (e.g., for vLLM).
# Created without arguments — presumably it picks up its endpoint and key from
# the environment variables loaded from .env; confirm against the deployment.
print("🤖 Using OpenAI-compatible API client...")
openai_client = AsyncOpenAI()

# Run the evaluation through the environment: one rollout per example, sampled
# with the temperature and token budget from the configuration cell.
print("🔄 Running evaluation...")
results = vf_env.evaluate(
    openai_client,
    model,
    rollouts_per_example=1,
    sampling_args={"temperature": temperature, "max_tokens": max_new_tokens},
)


🌍 Loading MuSiQue environment...


Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

2025-09-14 19:11:14 - verifiers.envs.MuSiQueEnv - INFO - eval_dataset is not set, falling back to train dataset


✅ Environment loaded with 100 examples
🤖 Using OpenAI-compatible API client...
🔄 Running evaluation...


Running 100 rollouts:  43%|████▎     | 43/100 [00:53<01:09,  1.21s/it]

In [None]:
from datetime import datetime
import json

# Timestamp-based identifier (YYYYMMDD_HHMMSS) naming this run's output directory.
exp_id = f"{datetime.now():%Y%m%d_%H%M%S}"
exp_dir = Path("../outputs/musique-eval") / exp_id
exp_dir.mkdir(parents=True, exist_ok=True)

# Record the run parameters as pretty-printed JSON. write_text() returns the
# number of characters written, which becomes the cell's displayed output.
params_path = exp_dir / "params.json"
params_path.write_text(json.dumps(params, indent=2))

232

In [None]:
# Turn the evaluation results into a dataset via the environment, then convert
# that dataset to pandas for inspection; both `ds` and `df` are reused by the
# cells below.
ds = vf_env.make_dataset(results)
df = ds.to_pandas()

In [None]:
# Summary statistics for the columns of the results frame.
# NOTE(review): in the recorded run every reward is ~0 (only format_reward is
# ever non-zero, max 1.0 with mean 0.01) — confirm the model endpoint and the
# answer/format parsing before trusting these scores.
df.describe()

Unnamed: 0,reward,exact_match_reward,f1_reward,citation_reward,format_reward,combined_reward
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.00025,0.0,0.0,0.0,0.01,0.00025
std,0.0025,0.0,0.0,0.0,0.1,0.0025
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,0.025,0.0,0.0,0.0,1.0,0.025


In [None]:
# Persist the evaluation artifacts into the run directory created earlier in
# the notebook (`exp_dir`, where params.json has already been written).
# This cell used to repeat the imports, generate a second timestamp and write
# params.json again, so every run left behind an orphan directory containing
# only params.json while the results went to a different one.
exp_dir.mkdir(parents=True, exist_ok=True)  # no-op when the directory exists

# Per-rollout records, one JSON object per line.
ds.to_json(exp_dir / "musique-eval-results.jsonl", orient="records", lines=True)

# Summary statistics of the results frame, as CSV.
df.describe().to_csv(exp_dir / "scores.csv")

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]