In [1]:
from pathlib import Path
import json
import typer
import pandas as pd
import verifiers as vf
from verifiers.types import SamplingArgs
from dotenv import load_dotenv
from openai import AsyncOpenAI

# Load environment variables from the local .env file. The call is made
# outside the `assert` so it still runs when assertions are disabled
# (`python -O` / PYTHONOPTIMIZE); an `assert load_dotenv()` would be skipped
# entirely in that mode and the environment would silently stay unloaded.
dotenv_loaded = load_dotenv()
assert dotenv_loaded, "Failed to load .env file"

# Async OpenAI client shared by the later cells (model listing and rollouts).
# NOTE(review): constructed with no arguments, so it presumably takes its API
# key / base URL from the environment loaded above — confirm.
openai_client = AsyncOpenAI()


def jprint(obj):
    """Pretty-print ``obj`` as JSON indented by two spaces.

    Values that ``json`` cannot serialize natively (for example client
    response objects nested inside a message list) are rendered with
    ``str()`` instead of raising ``TypeError``. Output for objects that were
    already JSON-serializable is unchanged.
    """
    print(json.dumps(obj, indent=2, default=str))


In [2]:
import mlflow

MLFLOW_TRACKING_URI = "http://127.0.0.1:5000"
MLFLOW_EXPERIMENT = "rlvr-explore"

# Point the MLflow client at the tracking server.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# Turn on MLflow's OpenAI autologging with its default options.
mlflow.openai.autolog()
# Select the experiment by name; the returned Experiment object is the
# cell's displayed value, so this call stays last.
mlflow.set_experiment(MLFLOW_EXPERIMENT)

<Experiment: artifact_location='mlflow-artifacts:/15', creation_time=1756559484585, experiment_id='15', last_update_time=1756559484585, lifecycle_stage='active', name='rlvr-explore', tags={}>

In [3]:
model_response = await openai_client.models.list()
available_models = [item.id for item in model_response.data if item.object == 'model']
model = available_models[0]
print(model)

Qwen/Qwen2.5-7B-Instruct


In [4]:
# Settings forwarded to the environment loader below.
datasets_str = "bdsaglam/musique-mini,answerable,train"
noise_rate = 1.0
retriever = "hybrid"

print(f"📊 Dataset: {datasets_str} (noise rate: {noise_rate})")
print(f"🔍 Retriever: {retriever}")
print("🌍 Loading MuSiQue environment...")

# Everything except the environment id is gathered first and passed through
# as keyword arguments.
env_args = {
    "datasets_str": datasets_str,
    "noise_rate": noise_rate,
    "retriever_name": retriever,
}
vf_env = vf.load_environment(env_id="vf-musique", **env_args)

example_count = len(vf_env.dataset)
print(f"✅ Environment loaded with {example_count} examples")

📊 Dataset: bdsaglam/musique-mini,answerable,train (noise rate: 1.0)
🔍 Retriever: hybrid
🌍 Loading MuSiQue environment...


Map: 100%|##########| 300/300 [00:00<?, ? examples/s]

Filter:   0%|          | 0/300 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

✅ Environment loaded with 100 examples


In [5]:
# Convert the environment's dataset to a pandas DataFrame for inspection and
# show it as the cell output.
df = vf_env.dataset.to_pandas()
df

Unnamed: 0,question,answer,info,prompt
0,What's the population of the city where Anthon...,190884,"{'answers': ['190,884'], 'docs': [{'body': 'Ol...",[{'content': 'Answer the question based on the...
1,When did the team that the Red Sox played agai...,2011,"{'answers': ['2011'], 'docs': [{'body': 'Exact...",[{'content': 'Answer the question based on the...
2,Who is the father of the artist who created Ti...,Alfred Lennon,"{'answers': ['Alfred Lennon', 'alfred lennon']...",[{'content': 'Answer the question based on the...
3,"In Doctor Who, who plays the creator of Trees ...",Tony Curran,"{'answers': ['Tony Curran', 'tony curran'], 'd...",[{'content': 'Answer the question based on the...
4,Little Annie Rooney's screenwriter has who for...,Jack Pickford,"{'answers': ['Jack Pickford', 'jack pickford']...",[{'content': 'Answer the question based on the...
...,...,...,...,...
95,What is the size of the continent Marian Cove ...,5400000,"{'answers': ['5,400,000'], 'docs': [{'body': '...",[{'content': 'Answer the question based on the...
96,When was the state of emergency declared in th...,20 October 1952,"{'answers': ['20 october 1952', '20 October 19...",[{'content': 'Answer the question based on the...
97,Who is the spouse of the performer of Mistleto...,Barbara Marx,"{'answers': ['Barbara Marx', 'barbara marx'], ...",[{'content': 'Answer the question based on the...
98,When did the Kuomintang defeat and take over t...,1928,"{'answers': ['1928'], 'docs': [{'body': 'He qu...",[{'content': 'Answer the question based on the...


In [6]:
# Lift the `n_hops` entry out of each row's `info` mapping into its own
# column, then display that column.
df['n_hops'] = df['info'].apply(lambda example_info: example_info['n_hops'])
df['n_hops']

0     2
1     2
2     2
3     2
4     2
     ..
95    2
96    2
97    2
98    2
99    2
Name: n_hops, Length: 100, dtype: int64

In [7]:
# Inspect the environment's `oai_tools` attribute; the same value is attached
# to the rollout `info` in a later cell.
vf_env.oai_tools

[{'type': 'function',
  'function': {'name': 'retrieve_documents',
   'description': 'Retrieve relevant documents by the query. The results get better with more specific queries.',
   'parameters': {'type': 'object',
    'properties': {'query': {'type': 'string',
      'description': 'Parameter `query` of type string.'}},
    'required': ['query']}}}]

In [8]:
# Environments provide rollout interfaces for RL
example = vf_env.dataset[0]

prompt = example["prompt"]
answer = example["answer"]
info = example["info"]
info['oai_tools'] = vf_env.oai_tools


sampling_args = dict(temperature=1.0)

completion, state = await vf_env.rollout(
    openai_client,
    model=model,
    prompt=prompt,
    answer=answer,
    info=info,
    sampling_args=sampling_args,
)

In [9]:
# Show the container type, the type of its first element, and then the first
# element itself as the cell output.
first_message = completion[0]
print(type(completion))
print(type(first_message))
first_message

<class 'list'>
<class 'dict'>


{'role': 'assistant',
 'content': '',
 'tool_calls': [ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-b074d662c4f24b3d9df6c618bedb6350', function=Function(arguments='{"query": "Anthony Howe birthplace population"}', name='retrieve_documents'), type='function')]}

In [10]:
# Display the state object returned alongside `completion` by the rollout.
state

{'prompt': [{'content': "Answer the question based on the information provided by tools.\n\nFor each step:\n1. Think through your reasoning inside <think> tags\n2. Use tools to retrieve relevant documents\n3. Continue until you find the answer through multi-hop reasoning. The question is answerable from the docs. \n4. In the **last** step:\n    - Reflect on your previous steps inside <think> tags\n    - Cite the documents you base your answer on inside <cite> tags by their IDs, e.g. `<cite>1, 2, 3</cite>`\n    - Give your final answer inside <answer> tags\nAn example for your final message:\n```\n<think>\n[your thinking and explanation here]\n</think> \n<cite>\n[IDs of the documents that back your answer]\n</cite>\n<answer>\n[your final answer in **a few words**. no explanation here.]\n</answer>\n```\n\n- Do not make up tools or arguments that aren't listed.\n- Make one tool call per step.\n- Questions require multi-hop reasoning across multiple documents.\n- Continue searching until y

In [11]:
# Print each raw response recorded in the rollout state, one per line.
responses = state['responses']
for response in responses:
    print(response)

ChatCompletion(id='chatcmpl-1f66d9cd89ab4ad9bb4018360969b8ed', choices=[Choice(finish_reason='tool_calls', index=0, logprobs=None, message=ChatCompletionMessage(content=None, refusal=None, role='assistant', annotations=None, audio=None, function_call=None, tool_calls=[ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-b074d662c4f24b3d9df6c618bedb6350', function=Function(arguments='{"query": "Anthony Howe birthplace population"}', name='retrieve_documents'), type='function')], reasoning_content=None), stop_reason=None)], created=1756559435, model='Qwen/Qwen2.5-7B-Instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=24, prompt_tokens=672, total_tokens=696, completion_tokens_details=None, prompt_tokens_details=None), prompt_logprobs=None, kv_transfer_params=None)
ChatCompletion(id='chatcmpl-ac7b7847394444c78c15273f3fcaaae8', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(c

In [12]:
# List the reward functions held by the environment's rubric.
vf_env.rubric.reward_funcs

[<function vf_musique.rewards.exact_match_reward(completion, answer, info, parser, **kwargs)>,
 <function vf_musique.rewards.f1_reward(completion, answer, info, parser, **kwargs)>,
 <function vf_musique.rewards.retrieval_recall_reward(completion, info, **kwargs)>,
 <function vf_musique.rewards.citation_reward(completion, info, parser, **kwargs)>,
 <function vf_musique.rewards.format_reward(completion, parser, **kwargs)>,
 <function vf_musique.rewards.combined_reward(*args, **kwargs)>]

In [13]:
rewards = await vf_env.rubric.score_rollout(prompt=prompt, completion=completion, answer=answer, state=state, info=info)

In [14]:
# Display the score object returned by `score_rollout` in the previous cell.
rewards

RolloutScore(reward=0.7625, metrics={'exact_match_reward': 1.0, 'f1_reward': 1.0, 'retrieval_recall_reward': 1.0, 'citation_reward': 1.0, 'format_reward': 1.0, 'combined_reward': 0.7625})

In [15]:
# Display the full list of messages produced by the rollout.
completion

[{'role': 'assistant',
  'content': '',
  'tool_calls': [ChatCompletionMessageFunctionToolCall(id='chatcmpl-tool-b074d662c4f24b3d9df6c618bedb6350', function=Function(arguments='{"query": "Anthony Howe birthplace population"}', name='retrieve_documents'), type='function')]},
 {'role': 'tool',
  'content': 'Document ID: 15\n# Anthony Howe (sculptor)\nAnthony Howe (born 1954, Salt Lake City, Utah) is an American kinetic sculptor who creates wind-driven sculptures resembling pulsing, alien creatures and vortices. He makes use of computer-aided design, shaping the metal components with a plasma cutter, and completing his work by use of traditional metalworking techniques.\n\nDocument ID: 14\n# Salt Lake City\nSalt Lake City (often shortened to Salt Lake or SLC) is the capital and the most populous municipality of the U.S. state of Utah. With an estimated population of 190,884 in 2014, the city is the core of the Salt Lake City metropolitan area, which has a population of 1,153,340 (2014 est