In [1]:
from pathlib import Path

import typer
import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import AsyncOpenAI


# Load settings from the .env file and stop immediately, with a clear message,
# if load_dotenv() returns a falsy value.
assert load_dotenv(), "Failed to load .env file"

In [2]:
import mlflow

# Route all MLflow calls to the tracking server running on this machine.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
# Switch on MLflow's automatic logging for the OpenAI integration.
mlflow.openai.autolog()
# Select "rlvr-eval" as the active experiment. This stays the last expression
# of the cell so the returned Experiment object is displayed below it.
mlflow.set_experiment(experiment_name="rlvr-eval")

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1756636926792, experiment_id='2', last_update_time=1756636926792, lifecycle_stage='active', name='rlvr-eval', tags={}>

In [3]:
openai_client = AsyncOpenAI()

models_response = await openai_client.models.list()
available_models = [item.id for item in models_response.data if item.object == "model"]
print("Available models:", available_models)

model = available_models[0]
print("Using model:", model)


Available models: ['Qwen/Qwen2.5-7B-Instruct']
Using model: Qwen/Qwen2.5-7B-Instruct


In [4]:
# --- Evaluation configuration ---
# These values are passed unchanged to the environment loader and to the
# sampling arguments further down in this cell.
datasets_str = "bdsaglam/musique-mini,answerable,validation"
noise_rate = 1.0
retriever = "hybrid"
temperature = 0.5
max_new_tokens = 1024

# Banner summarising the run. The emoji were previously stored mis-encoded
# (UTF-8 bytes decoded with the wrong charset) and are restored here.
print("🔮 Starting MuSiQue evaluation")
print("=" * 50)
print(f"📝 Model: {model}")
print(f"📊 Dataset: {datasets_str} (noise rate: {noise_rate})")
print(f"🔍 Retriever: {retriever}")
print(f"🌡️ Temperature: {temperature}")
print(f"🎯 Max tokens: {max_new_tokens}")
print("=" * 50)

# Load MuSiQue environment
print("🌍 Loading MuSiQue environment...")

vf_env = vf.load_environment(
    env_id="vf-musique",
    datasets_str=datasets_str,
    noise_rate=noise_rate,
    retriever_name=retriever,
)
print(f"✅ Environment loaded with {len(vf_env.dataset)} examples")

# Use OpenAI-compatible API client (e.g., for vLLM).
# No client is created here: the `openai_client` built in an earlier cell is
# reused, and this message is informational only.
print("🤖 Using OpenAI-compatible API client...")

# Run evaluation using the environment: one rollout per example, sampled with
# the temperature and token limit configured at the top of this cell.
print("🔄 Running evaluation...")
results = vf_env.evaluate(
    openai_client,
    model,
    rollouts_per_example=1,
    sampling_args={"temperature": temperature, "max_tokens": max_new_tokens},
)


🔮 Starting MuSiQue evaluation
📝 Model: Qwen/Qwen2.5-7B-Instruct
📊 Dataset: bdsaglam/musique-mini,answerable,validation (noise rate: 1.0)
🔍 Retriever: hybrid
🌡️ Temperature: 0.5
🎯 Max tokens: 1024
🌍 Loading MuSiQue environment...


Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

2025-09-11 11:04:20 - verifiers.envs.MuSiQueEnv - INFO - eval_dataset is not set, falling back to train dataset


✅ Environment loaded with 100 examples
🤖 Using OpenAI-compatible API client...
🔄 Running evaluation...


Running 100 rollouts: 100%|██████████| 100/100 [01:07<00:00,  1.48it/s]
Evaluating 100 rollouts: 100%|██████████| 100/100 [00:00<00:00, 1037.45it/s]


In [5]:
# Show which metric names were recorded for this evaluation run.
results.metrics.keys()

dict_keys(['exact_match_reward', 'f1_reward', 'retrieval_recall_reward', 'retrieval_precision_reward', 'citation_reward', 'format_reward', 'combined_reward'])

In [6]:
# Build a dataset object from the evaluation results; the bare name on the
# last line displays its summary (columns and row count).
ds = vf_env.make_dataset(results)
ds

Dataset({
    features: ['prompt', 'completion', 'answer', 'task', 'reward', 'info', 'exact_match_reward', 'f1_reward', 'retrieval_recall_reward', 'retrieval_precision_reward', 'citation_reward', 'format_reward', 'combined_reward'],
    num_rows: 100
})

In [7]:
# Persist the evaluation results as JSON Lines. The target directory is
# created first so the export also works on a fresh checkout where
# ../outputs does not exist yet.
results_path = Path("../outputs/musique-eval-results.jsonl")
results_path.parent.mkdir(parents=True, exist_ok=True)
# Kept as the cell's last expression so the value returned by to_json is
# still displayed in the cell output.
ds.to_json(str(results_path), orient="records", lines=True)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

2852012

In [8]:
# Convert the results dataset to a pandas DataFrame and display it.
df = ds.to_pandas()
df

Unnamed: 0,prompt,completion,answer,task,reward,info,exact_match_reward,f1_reward,retrieval_recall_reward,retrieval_precision_reward,citation_reward,format_reward,combined_reward
0,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",LeMat revolver,default,0.025000,"{'answers': ['LeMat Revolver', 'lemat revolver...",0.0,0.0,0.0,0.000000,0.0,1.0,0.025000
1,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",Vitor Constâncio,default,0.350000,"{'answers': ['Vitor Constâncio', 'vitor constâ...",0.0,0.0,1.0,1.000000,0.0,0.0,0.350000
2,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",45,default,0.175000,"{'answers': ['45'], 'docs': [{'body': 'Weber C...",0.0,0.0,0.5,0.500000,0.0,0.0,0.175000
3,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",three different relationships he had in the past,default,0.350000,{'answers': ['three different relationships he...,0.0,0.0,1.0,1.000000,0.0,0.0,0.350000
4,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",Michael Bublé,default,0.733333,"{'answers': ['michael bublé', 'Michael Bublé']...",1.0,1.0,0.5,0.333333,0.5,1.0,0.733333
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",two alliances,default,0.175000,"{'answers': ['two alliances'], 'docs': [{'body...",0.0,0.0,0.5,0.500000,0.0,0.0,0.175000
96,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",John Kukuzelis,default,0.316667,"{'answers': ['john kukuzelis', 'John Kukuzelis...",0.0,0.0,1.0,0.666667,0.0,0.0,0.316667
97,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",Vito Corleone,default,0.158333,"{'answers': ['vito andolini', 'vito andolini c...",0.0,0.0,0.5,0.333333,0.0,0.0,0.158333
98,[{'content': 'Answer the question based on the...,"[{'content': '', 'role': 'assistant', 'tool_ca...",The Île de la Cité,default,0.175000,"{'answers': ['The Île de la Cité', 'the île de...",0.0,0.0,0.5,0.500000,0.0,0.0,0.175000


In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the results table.
df.describe()

Unnamed: 0,reward,exact_match_reward,f1_reward,retrieval_recall_reward,retrieval_precision_reward,citation_reward,format_reward,combined_reward
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.311137,0.08,0.137792,0.68,0.613833,0.145,0.28,0.311137
std,0.212992,0.27266,0.314356,0.313984,0.291731,0.286876,0.451261,0.212992
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.175,0.0,0.0,0.5,0.5,0.0,0.0,0.175
50%,0.29,0.0,0.0,0.5,0.5,0.0,0.0,0.29
75%,0.35,0.0,0.0,1.0,1.0,0.0,1.0,0.35
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [10]:
# Select the first row of the results DataFrame by position.
row = df.iloc[0]