In [13]:
# NOTE(review): this cell is entirely commented out and has no effect when run.
# It also hardcodes a machine-specific absolute path; if the import is needed again,
# prefer installing the project package (or a path relative to the notebook) instead.
# import sys
# sys.path.append("/home/baris/repos/rlvr")

# from scripts.musique import evaluate

In [14]:
from pathlib import Path

import typer
import pandas as pd
import verifiers as vf
from dotenv import load_dotenv
from openai import AsyncOpenAI


# Load environment variables from the .env file and stop immediately if that fails.
# An explicit check is used instead of `assert`: assert statements are stripped when
# Python runs with -O, which would silently skip the load_dotenv() call itself.
if not load_dotenv():
    raise RuntimeError("Failed to load .env file")

In [15]:
import mlflow

# MLflow settings used by this notebook.
mlflow_tracking_uri = "http://127.0.0.1:5000"
mlflow_experiment_name = "rlvr-eval"

# Point the MLflow client at the tracking server.
mlflow.set_tracking_uri(mlflow_tracking_uri)
# Turn on MLflow autologging for the OpenAI integration (default options).
mlflow.openai.autolog()
# Select the experiment to log under; kept as the last expression so the
# returned Experiment object is displayed as the cell output.
mlflow.set_experiment(mlflow_experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1756232427415, experiment_id='3', last_update_time=1756232427415, lifecycle_stage='active', name='rlvr-eval', tags={}>

In [16]:
openai_client = AsyncOpenAI()

models_response = await openai_client.models.list()
available_models = [item.id for item in models_response.data if item.object == "model"]
print("Available models:", available_models)

model = available_models[0]
print("Using model:", model)


Available models: ['Qwen/Qwen2.5-7B-Instruct']
Using model: Qwen/Qwen2.5-7B-Instruct


In [17]:
# Evaluation settings; later cells read these names directly.
datasets_str = "bdsaglam/musique-mini,answerable,validation"
noise_rate = 1.0
retriever = "hybrid"
temperature = 0.5
max_new_tokens = 1024

# Assemble the run banner once and emit it in a single print call
# (one line per entry, framed by a 50-character rule).
banner_rule = "=" * 50
banner_lines = [
    "🔮 Starting MuSiQue evaluation",
    banner_rule,
    f"📝 Model: {model}",
    f"📊 Dataset: {datasets_str} (noise rate: {noise_rate})",
    f"🔍 Retriever: {retriever}",
    f"🌡️ Temperature: {temperature}",
    f"🎯 Max tokens: {max_new_tokens}",
    banner_rule,
]
print(*banner_lines, sep="\n")

# Build the MuSiQue environment from the settings chosen above.
print("🌍 Loading MuSiQue environment...")

env_options = {
    "datasets_str": datasets_str,
    "noise_rate": noise_rate,
    "retriever_name": retriever,
}
vf_env = vf.load_environment(env_id="vf-musique", **env_options)

# Report how many examples the environment's dataset holds.
num_examples = len(vf_env.dataset)
print(f"✅ Environment loaded with {num_examples} examples")

# The model is reached through the OpenAI-compatible client (e.g., a vLLM server).
print("🤖 Using OpenAI-compatible API client...")

# Evaluate with one rollout per example, sampling with the configured
# temperature and token budget.
print("🔄 Running evaluation...")
sampling_args = dict(temperature=temperature, max_tokens=max_new_tokens)
results = vf_env.evaluate(openai_client, model, rollouts_per_example=1, sampling_args=sampling_args)


🔮 Starting MuSiQue evaluation
📝 Model: Qwen/Qwen2.5-7B-Instruct
📊 Dataset: bdsaglam/musique-mini,answerable,validation (noise rate: 1.0)
🔍 Retriever: hybrid
🌡️ Temperature: 0.5
🎯 Max tokens: 1024
🌍 Loading MuSiQue environment...


Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

2025-08-30 16:15:19 - verifiers.envs.MuSiQueEnv - INFO - eval_dataset is not set, falling back to train dataset


✅ Environment loaded with 300 examples
🤖 Using OpenAI-compatible API client...
🔄 Running evaluation...


Running 300 rollouts: 100%|██████████| 300/300 [02:56<00:00,  1.70it/s]
Evaluating 300 rollouts: 100%|██████████| 300/300 [00:00<00:00, 1346.16it/s]


In [18]:
# Show the names of the metrics recorded for this evaluation run.
results.metrics.keys()

dict_keys(['exact_match_reward', 'f1_reward', 'retrieval_recall_reward', 'citation_reward', 'format_reward', 'combined_reward'])

In [19]:
from verifiers.types import GenerateOutputs

# Walk over every field declared on GenerateOutputs and show what `results` holds for it.
# `model_fields` is the Pydantic v2 replacement for the deprecated `__fields__` attribute.
for field in GenerateOutputs.model_fields:
    print(field)

    value = getattr(results, field)
    if isinstance(value, list):
        # Print only the first element as a sample; an empty list has nothing to sample.
        print(value[0] if value else "<empty list>")
    else:
        # For non-list fields, the type alone is enough of a summary.
        print(type(value))

prompt
[{'role': 'system', 'content': "Answer the question based on the information provided by tools.\n\nFor each step:\n1. Think through your reasoning inside <think> tags\n2. Use tools to retrieve relevant documents\n3. Continue until you find the answer through multi-hop reasoning. The question is answerable from the docs. \n4. In the **last** step:\n    - Reflect on your previous steps inside <think> tags\n    - Cite the documents you base your answer on inside <cite> tags by their IDs, e.g. `<cite>1, 2, 3</cite>`\n    - Give your final answer inside <answer> tags\nAn example for your final message:\n```\n<think>\n[your thinking and explanation here]\n</think> \n<cite>\n[IDs of the documents that back your answer]\n</cite>\n<answer>\n[your final answer in **a few words**. no explanation here.]\n</answer>\n```\n\n- Do not make up tools or arguments that aren't listed.\n- Make one tool call per step.\n- Questions require multi-hop reasoning across multiple documents.\n- Continue sea

/tmp/ipykernel_1759625/3696243233.py:3: PydanticDeprecatedSince20: The `__fields__` attribute is deprecated, use `model_fields` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  for field in GenerateOutputs.__fields__.keys():


In [20]:
# Turn the evaluation results into a dataset using the environment's helper.
ds = vf_env.make_dataset(results)
# Bare expression: displays the dataset summary (features and row count).
ds

Dataset({
    features: ['prompt', 'completion', 'answer', 'task', 'reward', 'info', 'exact_match_reward', 'f1_reward', 'retrieval_recall_reward', 'citation_reward', 'format_reward', 'combined_reward'],
    num_rows: 300
})

In [21]:
# Persist the evaluation results as JSON Lines (one record per line).
output_path = Path("../outputs/musique-eval-results.jsonl")
# Create the output directory first so the write does not fail on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
# Kept as the last expression so the value returned by to_json is shown as the cell output.
ds.to_json(str(output_path), orient="records", lines=True)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

8930028

In [22]:
# Convert the dataset to a pandas DataFrame for tabular inspection.
df = ds.to_pandas()
# Bare expression: renders the DataFrame preview as the cell output.
df

Unnamed: 0,prompt,completion,answer,task,reward,info,exact_match_reward,f1_reward,retrieval_recall_reward,citation_reward,format_reward,combined_reward
0,[{'content': 'Answer the question based on the...,[{'content': '<think> To find out what gun was...,LeMat revolver,default,0.466667,"{'answers': ['LeMat revolver', 'lemat revolver...",0.0,0.000000,1.000000,1.00,0.666667,0.466667
1,[{'content': 'Answer the question based on the...,[{'content': '<think> To find the Vice-Preside...,Vitor Constâncio,default,0.762500,"{'answers': ['vitor constâncio', 'Vitor Constâ...",1.0,1.000000,1.000000,1.00,1.000000,0.762500
2,[{'content': 'Answer the question based on the...,[{'content': '<think> The question asks about ...,the Politburo,default,0.125000,"{'answers': ['the Politburo', 'the politburo',...",0.0,0.000000,0.333333,0.00,0.333333,0.125000
3,[{'content': 'Answer the question based on the...,"[{'content': '<think> To solve this, we need t...",Cabarrus County,default,0.328125,"{'answers': ['cabarrus county', 'Cabarrus Coun...",0.0,0.500000,0.750000,0.25,1.000000,0.328125
4,[{'content': 'Answer the question based on the...,[{'content': '<think> To find the distance fro...,45,default,0.475000,"{'answers': ['45'], 'docs': [{'body': 'Weber C...",0.0,0.666667,1.000000,0.50,1.000000,0.475000
...,...,...,...,...,...,...,...,...,...,...,...,...
295,[{'content': 'Answer the question based on the...,[{'content': '<think> The question asks about ...,January 2015,default,0.206667,"{'answers': ['january 2015', 'January 2015'], ...",0.0,0.000000,0.666667,0.00,0.000000,0.206667
296,[{'content': 'Answer the question based on the...,"[{'content': '<think> First, I need to identif...",Richland County,default,0.503750,"{'answers': ['Richland County', 'richland coun...",1.0,1.000000,0.750000,0.75,1.000000,0.503750
297,[{'content': 'Answer the question based on the...,"[{'content': '<think> First, I need to find ou...",The Île de la Cité,default,0.412500,"{'answers': ['île de la Cité', 'The Île de la ...",1.0,1.000000,0.500000,0.00,1.000000,0.412500
298,[{'content': 'Answer the question based on the...,[{'content': '<think> I need to find out which...,Thaba Putsoa,default,0.266667,"{'answers': ['thaba putsoa', 'Thaba Putsoa'], ...",0.0,0.000000,0.500000,0.50,0.666667,0.266667


In [23]:
# Summary statistics (count, mean, std, quartiles) for the numeric reward columns.
df.describe()

Unnamed: 0,reward,exact_match_reward,f1_reward,retrieval_recall_reward,citation_reward,format_reward,combined_reward
count,300.0,300.0,300.0,300.0,300.0,300.0,300.0
mean,0.315514,0.2,0.273035,0.591667,0.351389,0.714444,0.315514
std,0.195057,0.400668,0.409642,0.33559,0.329934,0.354845,0.195057
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.183333,0.0,0.0,0.333333,0.0,0.333333,0.183333
50%,0.285417,0.0,0.0,0.583333,0.333333,1.0,0.285417
75%,0.46875,0.0,0.571429,1.0,0.5,1.0,0.46875
max,0.7625,1.0,1.0,1.0,1.0,1.0,0.7625


In [24]:
row = df.iloc[0]