In [14]:
from pathlib import Path
import json
import typer
import pandas as pd
import verifiers as vf
from verifiers.types import SamplingArgs
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load environment variables from .env before building the client.
# The call is kept outside the assert on purpose: `python -O` strips assert
# statements entirely, which would silently skip loading the file.
dotenv_loaded = load_dotenv()
assert dotenv_loaded, "Failed to load .env file"

# Async OpenAI client; called with no arguments, so its configuration is not
# set here (presumably it comes from the environment loaded above — confirm).
openai_client = AsyncOpenAI()


def jprint(obj):
    """Pretty-print ``obj`` to stdout as JSON indented by two spaces.

    Values that ``json`` cannot serialize natively (for example SDK response
    objects nested inside a message dict) are rendered with ``str()`` instead
    of raising ``TypeError``. Output for natively serializable input is
    unchanged.

    Args:
        obj: Any JSON-serializable structure, optionally containing
            arbitrary objects that will be stringified.
    """
    print(json.dumps(obj, indent=2, default=str))


In [15]:
import mlflow

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT = "RLVR"

# Point MLflow at the tracking server.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Turn on MLflow's autologging for the OpenAI integration.
mlflow.openai.autolog()
# Select the experiment by name; the returned Experiment is the cell's output.
mlflow.set_experiment(MLFLOW_EXPERIMENT)

<Experiment: artifact_location='mlflow-artifacts:/3', creation_time=1756053851638, experiment_id='3', last_update_time=1756053851638, lifecycle_stage='active', name='RLVR', tags={}>

In [16]:
model_response = await openai_client.models.list()
available_models = [item.id for item in model_response.data if item.object == 'model']
model = available_models[0]
print(model)

Qwen/Qwen2.5-7B-Instruct


In [17]:
# Run configuration for the MuSiQue environment.
datasets_str = "bdsaglam/musique-mini,answerable,validation[:8]"
noise_rate = 1.0
retriever = "hybrid"

# Echo the configuration, then announce the (potentially slow) load step.
print(
    f"📊 Dataset: {datasets_str} (noise rate: {noise_rate})",
    f"🔍 Retriever: {retriever}",
    "🌍 Loading MuSiQue environment...",
    sep="\n",
)

# Build the environment from the settings above.
vf_env = vf.load_environment(
    env_id="vf-musique",
    datasets_str=datasets_str,
    noise_rate=noise_rate,
    retriever_name=retriever,
)
print(f"✅ Environment loaded with {len(vf_env.dataset)} examples")

📊 Dataset: bdsaglam/musique-mini,answerable,validation[:8] (noise rate: 1.0)
🔍 Retriever: hybrid
🌍 Loading MuSiQue environment...


Map: 100%|##########| 8/8 [00:00<?, ? examples/s]

✅ Environment loaded with 8 examples


In [18]:
# Inspect the function-tool schemas the environment defines.
vf_env.oai_tools

[{'type': 'function',
  'function': {'name': 'retrieve_documents',
   'description': 'Retrieve relevant documents by the query. The results get better with more specific queries.',
   'parameters': {'type': 'object',
    'properties': {'query': {'type': 'string',
      'description': 'Parameter `query` of type string.'},
     'top_n': {'type': 'integer',
      'description': 'Parameter `top_n` of type integer.'}},
    'required': ['query']}}},
 {'type': 'function',
  'function': {'name': 'get_document',
   'description': 'Get a document by its ID.',
   'parameters': {'type': 'object',
    'properties': {'doc_id': {'type': 'string',
      'description': 'Parameter `doc_id` of type string.'}},
    'required': ['doc_id']}}}]

In [19]:
# Environments provide rollout interfaces for RL
example = vf_env.dataset[0]

prompt = example["prompt"]
answer = example["answer"]
info = example["info"]
info['oai_tools'] = vf_env.oai_tools


sampling_args = dict(temperature=1.0)

completion, state = await vf_env.rollout(
    openai_client,
    model=model,
    prompt=prompt,
    answer=answer,
    info=info,
    sampling_args=sampling_args,
)

In [20]:
# Check the container and element types of the rollout output, then display
# its first message.
print(type(completion))
print(type(completion[0]))
completion[0]

<class 'list'>
<class 'dict'>


{'role': 'assistant',
 'content': "<think>\nFirst, I need to identify who won the men's Wimbledon championship in 2016. I will search for information about the 2016 Wimbledon championships.\n</think>\n",
 'tool_calls': [ChatCompletionMessageToolCall(id='chatcmpl-tool-45d6a3f3347443f08a7599de8a363cb2', function=Function(arguments='{"query": "2016 Wimbledon Championships winner", "top_n": 5}', name='retrieve_documents'), type='function')]}

In [21]:
# Display the full rollout state (large output).
state

{'prompt': [{'content': "Answer the question based on the information provided by tools.\n\nFor each step:\n1. Think through your reasoning inside <think> tags\n2. Use tools to retrieve relevant documents\n3. Continue until you have found the answer through multi-hop reasoning\n4. In the **last** step:\n    - Reflect on your previous steps inside <think> tags\n    - Cite the documents you used inside <cite> tags by their IDs, e.g. `<cite>1, 2, 3</cite>`\n    - Give your final answer inside <answer> tags\nAn example for your final message:\n```\n<think>\n[your thinking here]\n</think> \n<cite>\n[IDs of the documents that back your answer]\n</cite>\n<answer>\n[your final answer here without any additional text]\n</answer>\n```\n\n- Do not make up tools or arguments that aren't listed.\n- Questions require multi-hop reasoning across multiple documents.\n- Continue searching until you find all relevant information to answer the question.",
   'role': 'system'},
  {'content': 'Who\'s the si

In [22]:
# Print each raw response object stored in the rollout state, one per line.
for response in state['responses']:
    print(response)

ChatCompletion(id='chatcmpl-0148839f2eab4746a11296581e456320', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content="<think>\nFirst, I need to identify who won the men's Wimbledon championship in 2016. I will search for information about the 2016 Wimbledon championships.\n</think>\n", refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='chatcmpl-tool-45d6a3f3347443f08a7599de8a363cb2', function=Function(arguments='{"query": "2016 Wimbledon Championships winner", "top_n": 5}', name='retrieve_documents'), type='function')], reasoning_content=None), stop_reason=None)], created=1756060342, model='Qwen/Qwen2.5-7B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=74, prompt_tokens=829, total_tokens=903, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, kv_transfer_

In [23]:
# List the reward functions attached to the environment's rubric.
vf_env.rubric.reward_funcs

[<function vf_musique.rewards.exact_match_reward(completion, answer, info, parser, **kwargs)>,
 <function vf_musique.rewards.f1_reward(completion, answer, info, parser, **kwargs)>,
 <function vf_musique.rewards.retrieval_recall_reward(completion, info, **kwargs)>,
 <function vf_musique.rewards.citation_reward(completion, info, parser, **kwargs)>,
 <function vf_musique.rewards.format_reward(completion, parser, **kwargs)>,
 <function vf_musique.rewards.combined_reward(*args, **kwargs)>]

In [24]:
rewards = await vf_env.rubric.score_rollout(prompt=prompt, completion=completion, answer=answer, state=state, info=info)

In [25]:
# Display the score object returned by score_rollout.
rewards

RolloutScore(reward=0.39305555555555555, metrics={'exact_match_reward': 0.0, 'f1_reward': 0.25, 'retrieval_recall_reward': 1.0, 'citation_reward': 1.0, 'format_reward': 0.6666666666666666, 'combined_reward': 0.39305555555555555})

In [26]:
# Display the complete list of messages produced by the rollout.
completion

[{'role': 'assistant',
  'content': "<think>\nFirst, I need to identify who won the men's Wimbledon championship in 2016. I will search for information about the 2016 Wimbledon championships.\n</think>\n",
  'tool_calls': [ChatCompletionMessageToolCall(id='chatcmpl-tool-45d6a3f3347443f08a7599de8a363cb2', function=Function(arguments='{"query": "2016 Wimbledon Championships winner", "top_n": 5}', name='retrieve_documents'), type='function')]},
 {'role': 'tool',
  'content': "Document ID: 16\n# 2016 Wimbledon Championships\nNovak Djokovic was the two - time defending champion in the Gentlemen's Singles, but he lost in the third round to Sam Querrey, ending a 30 - match winning streak in Grand Slams and Djokovic's hopes of becoming the first man to achieve the golden calendar slam. The Gentlemen's Singles title was won by Andy Murray. Serena Williams successfully defended her Ladies' Singles title and equaled Steffi Graf's Open Era record of 22 major singles titles.\n\nDocument ID: 15\n# 2