In [1]:
from pathlib import Path
import json
import typer
import pandas as pd
import verifiers as vf
from verifiers.types import SamplingArgs
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load variables from the .env file before any client is created.
# The call is kept outside the `assert` so that it still runs when Python is
# started with -O, which strips assert statements (and their side effects).
dotenv_loaded = load_dotenv()
assert dotenv_loaded, "Failed to load .env file"

# Async OpenAI client built with no explicit arguments.
# NOTE(review): presumably the API key / base URL come from the variables
# loaded above -- confirm against the .env file.
openai_client = AsyncOpenAI()


def jprint(obj):
    """Pretty-print ``obj`` as JSON with a two-space indent.

    Values the ``json`` module cannot encode natively (for example ``Path``
    objects or client response objects) are rendered with ``str()`` instead
    of raising ``TypeError``, so arbitrary notebook objects can be inspected.

    Args:
        obj: Any object; JSON-native containers and scalars are encoded as
            usual, everything else falls back to its string form.
    """
    print(json.dumps(obj, indent=2, default=str))


In [2]:
import mlflow

# Point MLflow at the tracking server (a local instance on port 5000).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Turn on MLflow's autologging for the OpenAI client library.
mlflow.openai.autolog()
# Select the experiment that subsequent runs are recorded under. As the last
# expression of the cell, the returned Experiment object is displayed.
mlflow.set_experiment("rlvr-explore")

<Experiment: artifact_location='mlflow-artifacts:/4', creation_time=1758530036945, experiment_id='4', last_update_time=1758530036945, lifecycle_stage='active', name='rlvr-explore', tags={}>

In [3]:
model_response = await openai_client.models.list()
available_models = [item.id for item in model_response.data if item.object == 'model']
model = available_models[0]
print(model)

Qwen/Qwen2.5-7B-Instruct


In [4]:
# --- Run configuration -------------------------------------------------------
datasets_str = "bdsaglam/musique-mini,answerable,train"
noise_rate = 1.0
retriever = "hybrid"

print(f"📊 Dataset: {datasets_str} (noise rate: {noise_rate})")
print(f"🔍 Retriever: {retriever}")

# --- Environment -------------------------------------------------------------
print("🌍 Loading MuSiQue environment...")

# Keyword arguments forwarded to the environment loader, in the same order as
# they were originally passed.
env_kwargs = {
    "datasets_str": datasets_str,
    "noise_rate": noise_rate,
    "retriever": retriever,
}
vf_env = vf.load_environment(env_id="vf-musique-structured", **env_kwargs)
print(f"✅ Environment loaded with {len(vf_env.dataset)} examples")

2025-09-26 15:16:29 - verifiers.utils.env_utils - INFO - Loading environment: vf-musique-structured
2025-09-26 15:16:29 - verifiers.utils.env_utils - INFO - Using provided args: retriever=hybrid, noise_rate=1.0, datasets_str=bdsaglam/musique-mini,answerable,train
2025-09-26 15:16:29 - verifiers.utils.env_utils - INFO - Using default args: eval_datasets_str=None


📊 Dataset: bdsaglam/musique-mini,answerable,train (noise rate: 1.0)
🔍 Retriever: hybrid
🌍 Loading MuSiQue environment...


Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

2025-09-26 15:16:33 - verifiers.utils.env_utils - INFO - Successfully loaded environment 'vf-musique-structured'


✅ Environment loaded with 100 examples


In [5]:
# Convert the environment's dataset to a pandas DataFrame for inspection;
# the bare `df` on the last line renders it as the cell output.
df = vf_env.dataset.to_pandas()
df

Unnamed: 0,question,answer,info,id,prompt
0,What's the population of the city where Anthon...,190884,"{'answers': ['190,884'], 'docs': [{'body': 'Ol...",0,[{'content': 'Answer the question based on the...
1,When did the team that the Red Sox played agai...,2011,"{'answers': ['2011'], 'docs': [{'body': 'Exact...",1,[{'content': 'Answer the question based on the...
2,Who is the father of the artist who created Ti...,Alfred Lennon,"{'answers': ['Alfred Lennon', 'alfred lennon']...",2,[{'content': 'Answer the question based on the...
3,"In Doctor Who, who plays the creator of Trees ...",Tony Curran,"{'answers': ['tony curran', 'Tony Curran'], 'd...",3,[{'content': 'Answer the question based on the...
4,Little Annie Rooney's screenwriter has who for...,Jack Pickford,"{'answers': ['Jack Pickford', 'jack pickford']...",4,[{'content': 'Answer the question based on the...
...,...,...,...,...,...
95,What is the size of the continent Marian Cove ...,5400000,"{'answers': ['5,400,000'], 'docs': [{'body': '...",95,[{'content': 'Answer the question based on the...
96,When was the state of emergency declared in th...,20 October 1952,"{'answers': ['20 october 1952', '20 October 19...",96,[{'content': 'Answer the question based on the...
97,Who is the spouse of the performer of Mistleto...,Barbara Marx,"{'answers': ['Barbara Marx', 'barbara marx'], ...",97,[{'content': 'Answer the question based on the...
98,When did the Kuomintang defeat and take over t...,1928,"{'answers': ['1928'], 'docs': [{'body': 'He qu...",98,[{'content': 'Answer the question based on the...


In [6]:
# Flatten the documents of every example into a single list. Iterating the
# 'info' column directly yields the same dicts as DataFrame.iterrows() without
# building a throwaway Series for each row.
docs = [doc for example_info in df['info'] for doc in example_info['docs']]
len(docs)

2000

In [7]:
import textwrap

# Show the text of one document, wrapped at 120 columns for readability.
i = 4
print(textwrap.fill(docs[i]['text'], width=120))

# Anthony Joshua vs. Wladimir Klitschko Anthony Joshua vs. Wladimir Klitschko was a professional boxing match contested
between Anthony Joshua and Wladimir Klitschko. The event took place on 29 April 2017 at Wembley Stadium in London,
England, with Joshua's IBF and the vacant WBA (Super) and IBO heavyweight titles on the line. Joshua won the match via
technical knockout in the 11th round. Klitschko announced his retirement from boxing a few months after the fight.


In [8]:
# Lift the hop count out of each example's `info` dict into its own column,
# then display that column.
df['n_hops'] = df['info'].apply(lambda example_info: example_info['n_hops'])
df['n_hops']

0     2
1     2
2     2
3     2
4     2
     ..
95    2
96    2
97    2
98    2
99    2
Name: n_hops, Length: 100, dtype: int64

In [9]:
# Display the tool definitions exposed by the environment.
vf_env.oai_tools

[{'type': 'function',
  'function': {'name': 'retrieve_documents',
   'description': 'Retrieve documents by the query. The results get better with more specific queries.',
   'parameters': {'properties': {'query': {'description': 'The query to retrieve documents for.',
      'title': 'Query',
      'type': 'string'}},
    'required': ['query'],
    'title': 'retrieve_documents_args',
    'type': 'object',
    'additionalProperties': False},
   'strict': True}},
 {'type': 'function',
  'function': {'name': 'complete',
   'description': '',
   'parameters': {'properties': {'reasoning': {'title': 'Reasoning',
      'type': 'string'},
     'cited_doc_ids': {'items': {'type': 'string'},
      'title': 'Cited Doc Ids',
      'type': 'array'},
     'final_answer': {'title': 'Final Answer', 'type': 'string'}},
    'required': ['reasoning', 'cited_doc_ids', 'final_answer'],
    'title': 'complete_args',
    'type': 'object',
    'additionalProperties': False},
   'strict': True}}]

In [10]:
# Environments provide rollout interfaces for RL
example = vf_env.dataset[1]

prompt = example["prompt"]
answer = example["answer"]
info = example["info"]
info['oai_tools'] = vf_env.oai_tools


sampling_args = dict(temperature=1.0)

completion, state = await vf_env.rollout(
    openai_client,
    model=model,
    prompt=prompt,
    answer=answer,
    info=info,
    sampling_args=sampling_args,
)

In [11]:
# Check the container and element types of the rollout's completion, then
# display its first message.
print(type(completion))
print(type(completion[0]))
completion[0]

<class 'list'>
<class 'dict'>


{'role': 'assistant',
 'content': '',
 'tool_calls': [ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-727a7bcc346d4086a048476278a1c1ed', function=Function(arguments='{"query": "2004 World Series Red Sox opponent last World Series win"}', name='retrieve_documents'), type='function')]}

In [12]:
# Display the state object returned by the rollout.
state

{'id': 0,
 'prompt': [{'content': "Answer the question based on the information provided by tools.\n\nFor each step:\n1. Think about the question and the information provided by the tools. Plan next action.\n2. Use tools to retrieve documents\n3. Continue until you find the answer through multi-hop reasoning. The question is answerable from the docs. \n4. In the **last** step, call `complete` tool with the following arguments:\n    - reasoning: Your reasoning for the answer based on the information gathered\n    - cited_doc_ids: The IDs of the documents you base your answer on\n    - final_answer: Your final answer in a few words without any explanation.\n\n- Do not make up tools or arguments that aren't listed.\n- Make one tool call per step.\n- Questions require multi-hop reasoning across multiple documents.\n- Continue searching until you find all necessary information to answer the question.",
   'role': 'system'},
  {'content': 'When did the team that the Red Sox played against in

In [13]:
# Print each entry stored under the 'responses' key of the rollout state,
# one per line.
for response in state['responses']:
    print(response)

ChatCompletion(id='chatcmpl-c2cf8bb8d5fa40f6acd955e21694f199', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-727a7bcc346d4086a048476278a1c1ed', function=Function(arguments='{"query": "2004 World Series Red Sox opponent last World Series win"}', name='retrieve_documents'), type='function')], reasoning_content=None), stop_reason=None, token_ids=None)], created=1758888993, model='Qwen/Qwen2.5-7B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=32, prompt_tokens=522, total_tokens=554, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, prompt_token_ids=None, kv_transfer_params=None)
ChatCompletion(id='chatcmpl-aa026eae4d094337bff9534f02c6360f', choices=[Choice(finish_reason='t

In [14]:
# List the reward functions registered on the environment's rubric.
vf_env.rubric.reward_funcs

[<function vf_musique_structured.rewards.exact_match_reward(completion, answer, info, parser, **kwargs)>,
 <function vf_musique_structured.rewards.f1_reward(completion, answer, info, parser, **kwargs)>,
 <function vf_musique_structured.rewards.retrieval_recall_reward(completion, info, **kwargs)>,
 <function vf_musique_structured.rewards.retrieval_precision_reward(completion, info, **kwargs)>,
 <function vf_musique_structured.rewards.citation_reward(completion, info, parser, **kwargs)>,
 <function vf_musique_structured.rewards.format_reward(completion, parser, **kwargs)>,
 <function vf_musique_structured.rewards.combined_reward(*args, **kwargs)>]

In [15]:
rewards = await vf_env.rubric.score_rollout(prompt=prompt, completion=completion, answer=answer, state=state, info=info)

In [16]:
# Display the overall reward and the per-function metrics for the rollout.
rewards

RolloutScore(reward=0.45, metrics={'exact_match_reward': 0.0, 'f1_reward': 0.0, 'retrieval_recall_reward': 1.0, 'retrieval_precision_reward': 1.0, 'citation_reward': 0.5, 'format_reward': 1.0, 'combined_reward': 0.45})

In [17]:
# Display the full message list produced by the rollout (assistant and tool turns).
completion

[{'role': 'assistant',
  'content': '',
  'tool_calls': [ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-727a7bcc346d4086a048476278a1c1ed', function=Function(arguments='{"query": "2004 World Series Red Sox opponent last World Series win"}', name='retrieve_documents'), type='function')]},
 {'role': 'tool',
  'content': "Document ID: 4\n# 2004 World Series\nThe 2004 World Series was the championship series of Major League Baseball's (MLB) 2004 season. The 100th edition of the World Series, it was a best - of - seven playoff between the American League (AL) champion Boston Red Sox and the National League (NL) champion St. Louis Cardinals; the Red Sox swept the Cardinals in four games. The series was played from October 23 to 27, 2004, at Fenway Park and Busch Memorial Stadium, broadcast on Fox, and watched by an average of just under 25.5 million viewers. The Red Sox's World Series championship was their first since 1918.",
  'tool_call_id': 'chatcmpl-tool-727a7bcc346d4086a0484762