# Policy Deep Research Rollout

Run a single inference episode against the OpenEnv environment. Set `USE_OPENAI` to `True` to drive the env with an OpenAI ChatCompletions model (requires `OPENAI_API_KEY`), or keep it `False` to use a local Hugging Face model via `AutoModelForCausalLM`. Make sure you've already built the environment image with `openenv build --tag openenv-policy-deep-research-env:latest` and exported `S2_API_KEY` if you want Semantic Scholar access.

In [1]:
import sys
from pathlib import Path

def call_openai(messages, temperature, *, client=None, model=None):
    """Request one JSON-mode chat completion and return its text.

    Args:
        messages: Chat history as a list of ``{'role': ..., 'content': ...}`` dicts.
        temperature: Sampling temperature forwarded to the API.
        client: Optional object exposing ``chat.completions.create``. Defaults to
            the notebook-level ``openai_client``.
        model: Optional model name. Defaults to the notebook-level ``OPENAI_MODEL``.

    Returns:
        The first choice's message content with surrounding whitespace removed,
        or ``''`` when the response carries no content.
    """
    # The notebook globals are resolved at call time, so the cells defining
    # `openai_client` and `OPENAI_MODEL` only have to run before the first call.
    if client is None:
        client = openai_client
    if model is None:
        model = OPENAI_MODEL

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
        response_format={'type': 'json_object'},
    )
    content = response.choices[0].message.content
    # `content` may be None; returning '' avoids an AttributeError on `.strip()`
    # and lets the caller treat the reply as an unparseable completion instead.
    return (content or '').strip()

# Locate the project root: the nearest directory, starting at the working
# directory and walking upward, that contains a `training` entry.
_launch_dir = Path.cwd()
REPO_ROOT = next(
    (folder for folder in (_launch_dir, *_launch_dir.parents) if (folder / 'training').exists()),
    _launch_dir,  # nothing matched: keep the working directory so the check below fails loudly
)

if not (REPO_ROOT / 'training').exists():
    raise RuntimeError('Could not locate repo root containing training/. Start the notebook from the project root.')

# Put the root at the front of sys.path so packages under it can be imported.
_repo_root_str = str(REPO_ROOT)
if _repo_root_str not in sys.path:
    sys.path.insert(0, _repo_root_str)

print(f'Using repo root: {REPO_ROOT}')

Using repo root: /Users/spangher/Projects/stanford-research/rfi-research/regulations-demo/scripts/policy-deep-research-agent


In [2]:
# ---- Configuration ----
# Everything a reader might want to tune lives in this cell.

# -- Model selection --
USE_OPENAI = True  # True: OpenAI ChatCompletions; False: local Hugging Face model
OPENAI_MODEL = 'gpt-5-mini'  # model name sent with each OpenAI request
MODEL_ID = 'mistralai/Mistral-7B-Instruct-v0.3'  # Hugging Face model id for the local path
TEMPERATURE = 1  # sampling temperature

# -- Environment container --
ENV_IMAGE = 'openenv-policy-deep-research-env:latest'  # Docker image tag of the environment
USE_CACHED = True  # forwarded to the container as OPENENV_USE_CACHED ('1' or '0')
TASK_INDEX = 0  # forwarded to the container as OPENENV_TASK_INDEX
MAX_STEPS = 12  # forwarded to the container as MAX_STEPS and used as the episode step budget

In [3]:
import json
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from training.rollout_openenv import rollout_openenv, _parse_action_payload, _format_feedback
from src.envs.policy_deep_research_env.client import PolicyDeepResearchEnv
from src.envs.policy_deep_research_env.models import ResearchAction

# Assemble the prompts from training/prompts/. Files are decoded as UTF-8
# explicitly so the result does not depend on the platform's locale encoding.
PROMPT_DIR = REPO_ROOT / 'training' / 'prompts'

system_prompt = (PROMPT_DIR / 'system.txt').read_text(encoding='utf-8').strip()

# The worked example is optional; when present it is appended to the system
# prompt after a blank line.
example_path = PROMPT_DIR / 'example_1.txt'
if example_path.exists():
    system_prompt = f"{system_prompt}\n\n{example_path.read_text(encoding='utf-8').strip()}"

# Kept verbatim (not stripped), unlike the system prompt.
action_schema = (PROMPT_DIR / 'action_schema.md').read_text(encoding='utf-8')
print('Loaded prompts.')

Loaded prompts.


In [4]:
from openai import OpenAI
# API key lookup:
#   1. A key file, when it exists. Its path can be overridden with the
#      OPENAI_API_KEY_FILE environment variable; the default is the project key file.
#   2. Otherwise an OPENAI_API_KEY that is already exported.
# A clear error is raised when neither is available.
_openai_key_file = Path(
    os.environ.get('OPENAI_API_KEY_FILE', '/Users/spangher/.openai-reglab-project-key.txt')
).expanduser()
if _openai_key_file.is_file():
    # read_text() opens and closes the file, so no handle is left dangling.
    os.environ['OPENAI_API_KEY'] = _openai_key_file.read_text().strip()
elif not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(
        f'No OpenAI API key found: {_openai_key_file} does not exist and OPENAI_API_KEY is not set.'
    )
openai_client = OpenAI()

In [5]:
# Settings forwarded to the environment container; all values are passed as strings.
_cached_flag = '1' if USE_CACHED else '0'
_container_env = {
    'OPENENV_USE_CACHED': _cached_flag,
    'OPENENV_TASK_INDEX': str(TASK_INDEX),
    'MAX_STEPS': str(MAX_STEPS),
    # An empty string is sent when S2_API_KEY is not exported.
    'S2_API_KEY': os.environ.get('S2_API_KEY', ''),
}

# Create the environment client from the Docker image named in the configuration.
env_client = PolicyDeepResearchEnv.from_docker_image(ENV_IMAGE, env_vars=_container_env)
print('Environment ready.')

Environment ready.


In [6]:
# Debug rollout: reset the environment and drive it with the OpenAI model for a
# small, fixed number of steps, leaving `completion`, `action_dict`, `action`
# and `step` in the namespace for inspection.
DEBUG_STEP_LIMIT = 1    # environment steps taken by this cell
MAX_PARSE_RETRIES = 3   # consecutive unparseable completions tolerated before aborting

messages = [{'role': 'system', 'content': system_prompt.strip()}]
reset = env_client.reset()
obs = reset.observation
messages.append({'role': 'user', 'content': f"{obs.question}\n\n{obs.instructions}"})

transcript = []
final_reward = 0.0
final_memo = ''
steps = 0
task_id = obs.task_id
bib = []
temperature = TEMPERATURE  # reuse the configured value instead of a second literal
parse_failures = 0

while steps < DEBUG_STEP_LIMIT:
    completion = call_openai(messages, temperature)
    # Record the model output together with the tail of the conversation that produced it.
    transcript.append({'messages_snapshot': messages[-4:], 'completion': completion})
    try:
        action_dict, _ = _parse_action_payload(completion)
    except ValueError as exc:
        # A parse failure does not consume an environment step, so without a cap
        # a model that keeps returning malformed output would loop (and call the
        # API) forever.
        parse_failures += 1
        if parse_failures > MAX_PARSE_RETRIES:
            raise RuntimeError(
                f'Model returned {parse_failures} consecutive unparseable completions; aborting rollout.'
            ) from exc
        # Show the model its own output plus the schema and ask again.
        messages.append({'role': 'assistant', 'content': completion})
        messages.append({'role': 'user', 'content': f"Invalid JSON ({exc}). Return <action>{{...}}</action> matching:\n{action_schema}"})
        continue
    parse_failures = 0

    action = ResearchAction(**action_dict)
    step = env_client.step(action)
    obs = step.observation
    steps += 1
    bib = obs.bib
    meta = obs.metadata or {}
    # Keep the previous memo unless this observation carries a non-empty one.
    if meta.get('final_memo'):
        final_memo = meta.get('final_memo', '')
    # A missing (None) reward is treated as 0.0.
    final_reward = float(step.reward or 0.0)
    task_id = obs.task_id

    messages.append({'role': 'assistant', 'content': completion})
    messages.append({'role': 'user', 'content': _format_feedback(obs)})

    if step.done:
        break

In [7]:
# Inspect the ResearchAction built from the last successfully parsed completion.
action

ResearchAction(metadata={}, type='SEARCH SEMANTIC_SCHOLAR', query='state carbon pricing revenue use resilience RGGI California cap-and-trade auction proceeds climate resilience coastal adaptation "auction proceeds" "resilience"', paper_id=None, top_k=10, filters={}, content=None)

In [8]:
# Inspect the result returned by the most recent env_client.step(...) call.
step

StepResult(observation=ResearchObservation(done=False, reward=None, metadata={}, task_id='policy_001', question='How are US states leveraging carbon pricing mechanisms to fund climate resilience?', instructions='Valid actions:\n1) SEARCH SEMANTIC SCHOLAR - query Semantic Scholar for academic papers. Requires {"type":"SEARCH SEMANTIC SCHOLAR","query":"..."}.\n2) FETCH PAPER - fetch a specific paper. Requires {"type":"FETCH PAPER","paper_id":"..."}.\n3) ADD_TO_BIB - add a paper to your bibliography with a justification. Requires {"type":"ADD_TO_BIB","paper_id":"...","metadata":{"reason":"..."}}.\n4) WRITE_NOTE - write a free-form note. Requires {"type":"WRITE_NOTE","content":"..."}.\n5) SUBMIT - deliver the final memo. Requires {"type":"SUBMIT","content":"..."}.e.g.: <think>your private reasoning</think><action>{"type":"..."}</action>\n\n', last_tool_result={'error': 'Unknown action.type=SEARCH SEMANTIC_SCHOLAR'}, bib=[], notes=[], remaining_steps=11), reward=0.0, done=False)

In [8]:
# Inspect the raw text returned by the most recent call_openai(...) call.
completion

'{ "error": "You must include a <think> tag followed by an <action> command in this turn." }'

In [8]:
# Inspect the dict that _parse_action_payload extracted from the completion.
action_dict

{'type': 'SEARCH',
 'query': "US states carbon pricing revenues fund climate resilience academic papers reports 'Regional Greenhouse Gas Initiative' California cap-and-trade 'carbon fee' 'carbon pricing' 'resilience' 'adaptation' 'use of proceeds'",
 'metadata': {},
 'filters': {},
 'top_k': 10}

In [11]:
import pyperclip
# Debug helper: copy the Python repr (str(...)) of the whole chat history to the clipboard.
pyperclip.copy(str(messages))

In [10]:
# Run one full episode and print a short summary of the outcome.
# NOTE(review): `run_openai_episode` is not defined or imported in any cell
# visible in this notebook — confirm where it comes from before relying on
# Restart & Run All.
episode_kwargs = dict(
    env_client=env_client,
    system_prompt=system_prompt,
    action_schema=action_schema,
    max_steps=MAX_STEPS,
    temperature=TEMPERATURE,
)
rollout = run_openai_episode(**episode_kwargs)
reward = rollout['reward']
final_memo = rollout.get('final_memo', '')

summary_rows = (
    ('Reward:', reward),
    ('Task ID:', rollout.get('task_id')),
    ('Steps:', rollout.get('steps')),
)
for label, value in summary_rows:
    print(label, value)

print('Bibliography entries:')
for entry in rollout.get('bib', []):
    print('-', entry.get('title'), entry.get('paperId'))

# Show at most the first 800 characters; a missing memo prints as empty.
memo_snippet = final_memo[:800] if final_memo else ''
print('Final memo snippet:', memo_snippet)

Reward: -1.0
Task ID: policy_001
Steps: 12
Bibliography entries:
Final memo snippet: 


In [None]:
# Close the environment client created earlier in the notebook.
# NOTE(review): presumably this also stops the Docker container — confirm.
env_client.close()
print('Environment closed.')