# Policy Deep Research Rollout

Run a single inference episode against the OpenEnv environment. Set `USE_OPENAI` to `True` (the value used in the configuration cell below) to drive the environment with an OpenAI ChatCompletions model (requires `OPENAI_API_KEY`), or set it to `False` to use a local Hugging Face model via `AutoModelForCausalLM`. Before running, make sure you have built the environment image with `openenv build --tag openenv-policy-deep-research-env:latest`, and export `S2_API_KEY` if you want Semantic Scholar access.

In [1]:
import sys
from pathlib import Path

def call_openai(messages, temperature):
    """Request one chat completion from OpenAI and return its text.

    Uses the notebook-level ``openai_client`` and ``OPENAI_MODEL`` globals,
    which are looked up when the function is called (not when it is defined).

    Args:
        messages: Chat history, passed through as the ``messages`` argument.
        temperature: Sampling temperature, passed through unchanged.

    Returns:
        The first choice's message content with surrounding whitespace
        removed, or ``''`` when the response carries no text content.
    """
    response = openai_client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
        temperature=temperature,
        # response_format={'type': 'json_object'},
    )
    # The API types `content` as optional; calling .strip() on None would raise
    # AttributeError, so treat a missing content as an empty completion.
    content = response.choices[0].message.content
    return (content or '').strip()

# Locate the project checkout: the nearest directory, starting at the current
# working directory and walking upward, that contains a `training/` folder.
REPO_ROOT = Path.cwd()
_containing_dir = next(
    (folder for folder in (REPO_ROOT, *REPO_ROOT.parents) if (folder / 'training').exists()),
    None,
)
if _containing_dir is None:
    raise RuntimeError('Could not locate repo root containing training/. Start the notebook from the project root.')
REPO_ROOT = _containing_dir

# Put the repo root at the front of the import path (once) so project packages
# can be imported from the notebook.
_repo_root_str = str(REPO_ROOT)
if _repo_root_str not in sys.path:
    sys.path.insert(0, _repo_root_str)

print(f'Using repo root: {REPO_ROOT}')

Using repo root: /Users/spangher/Projects/stanford-research/rfi-research/regulations-demo/scripts/policy-deep-research-agent


In [2]:
# ---- Configuration ----

# Model selection
USE_OPENAI = True  # True: OpenAI ChatCompletions; False: local Hugging Face model
OPENAI_MODEL = 'gpt-5-mini'
MODEL_ID = 'mistralai/Mistral-7B-Instruct-v0.3'

# Environment settings (image tag plus values forwarded to the environment)
ENV_IMAGE = 'openenv-policy-deep-research-env:latest'
USE_CACHED = True
TASK_INDEX = 0
MAX_STEPS = 12

# Sampling
TEMPERATURE = 1

In [3]:
import json
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from training.rollout_openenv import rollout_openenv, _parse_action_payload, _format_feedback
from src.envs.policy_deep_research_env.client import PolicyDeepResearchEnv
from src.envs.policy_deep_research_env.models import ResearchAction

# Assemble the prompt material from training/prompts/. Files are read as UTF-8
# explicitly so the result does not depend on the platform's default encoding.
_prompt_dir = REPO_ROOT / 'training' / 'prompts'

system_prompt = (_prompt_dir / 'system.txt').read_text(encoding='utf-8').strip()

# The worked example is optional; when the file exists it is appended to the
# system prompt after a blank line.
example_path = _prompt_dir / 'example_1.txt'
if example_path.exists():
    example_text = example_path.read_text(encoding='utf-8').strip()
    system_prompt = f"{system_prompt}\n\n{example_text}"

# The action schema is kept verbatim (not stripped).
action_schema = (_prompt_dir / 'action_schema.md').read_text(encoding='utf-8')
print('Loaded prompts.')

Loaded prompts.


In [4]:
from openai import OpenAI
# Load the OpenAI key from the local key file when it exists (read_text closes
# the file, unlike a bare open(...).read()). If the file is missing, fall back
# to an already-exported OPENAI_API_KEY instead of failing outright.
_openai_key_file = Path('/Users/spangher/.openai-reglab-project-key.txt')
if _openai_key_file.is_file():
    os.environ['OPENAI_API_KEY'] = _openai_key_file.read_text().strip()
elif not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(
        f'No OpenAI API key available: {_openai_key_file} does not exist and OPENAI_API_KEY is not set.'
    )
openai_client = OpenAI()

In [5]:
# Settings handed to the environment through `env_vars`; every value is
# passed as a string.
_container_env = {}
_container_env['OPENENV_USE_CACHED'] = '1' if USE_CACHED else '0'
_container_env['OPENENV_TASK_INDEX'] = str(TASK_INDEX)
_container_env['MAX_STEPS'] = str(MAX_STEPS)
# An empty string is sent when no Semantic Scholar key is exported.
_container_env['S2_API_KEY'] = os.getenv('S2_API_KEY', '')

env_client = PolicyDeepResearchEnv.from_docker_image(ENV_IMAGE, env_vars=_container_env)
print('Environment ready.')

Environment ready.


In [6]:
# Start a fresh episode: the conversation opens with the system prompt,
# followed by the task question and instructions from the reset observation.
messages = [{'role': 'system', 'content': system_prompt.strip()}]
reset = env_client.reset()
obs = reset.observation
messages.append({'role': 'user', 'content': f"{obs.question}\n\n{obs.instructions}"})

# Episode bookkeeping.
transcript = []
final_reward = 0.0
final_memo = ''
steps = 0
task_id = obs.task_id
bib = []
# Take the sampling temperature from the configuration cell rather than a
# second hard-coded value, so changing TEMPERATURE actually affects the rollout.
temperature = TEMPERATURE

def run_one_round(messages):
    """Advance the episode by one model turn.

    Queries the model with the conversation so far, records the exchange in the
    notebook-level ``transcript``, turns the reply into a ``ResearchAction`` and
    submits that action to ``env_client``.

    Args:
        messages: The chat history to send to the model.

    Returns:
        A ``(completion, action, step)`` tuple: the raw model text, the action
        built from it, and the value returned by ``env_client.step``.
    """
    model_reply = call_openai(messages, temperature)

    # Only the last four messages are stored alongside the completion.
    recent_messages = messages[-4:]
    transcript.append({'messages_snapshot': recent_messages, 'completion': model_reply})

    # The parser returns a pair; only the first element is used here.
    action_fields, _unused = _parse_action_payload(model_reply)
    chosen_action = ResearchAction(**action_fields)
    step_result = env_client.step(chosen_action)
    return model_reply, chosen_action, step_result

# Drive the episode. With a bound of 1, at most one round runs in this cell.
while steps < 1:
    completion, action, step = run_one_round(messages)

    # Fold the step result into the episode state.
    obs = step.observation
    steps += 1
    bib = obs.bib
    meta = obs.metadata or {}
    memo_text = meta.get('final_memo')
    if memo_text:
        # Keep the previous memo unless the environment reports a non-empty one.
        final_memo = memo_text
    final_reward = float(step.reward or 0.0)
    task_id = obs.task_id

    # Extend the conversation with the model's reply, then the formatted feedback.
    assistant_turn = {'role': 'assistant', 'content': completion}
    messages.append(assistant_turn)
    feedback_turn = {'role': 'user', 'content': _format_feedback(obs)}
    messages.append(feedback_turn)

    if step.done:
        break

In [34]:
# Manually run one round on the current conversation; this does not update
# `messages`, `steps`, or the other episode variables by itself.
completion, action, step = run_one_round(messages)

In [33]:
# Fold the most recent `step` into the episode state.
obs = step.observation
steps += 1
bib = obs.bib
meta = obs.metadata or {}
memo_text = meta.get('final_memo')
if memo_text:
    # Keep the previous memo unless the environment reports a non-empty one.
    final_memo = memo_text
final_reward = float(step.reward or 0.0)
task_id = obs.task_id

# Extend the conversation with the model's reply, then the formatted feedback.
assistant_turn = {'role': 'assistant', 'content': completion}
messages.append(assistant_turn)
feedback_turn = {'role': 'user', 'content': _format_feedback(obs)}
messages.append(feedback_turn)

In [35]:
# Display the raw model completion from the most recent round.
completion

'<think>I will search for papers or policy analyses on RGGI proceeds and how states use them for resilience or adaptation funding.</think>\n<action>{"type":"SEARCH SEMANTIC_SCHOLAR","query":"RGGI proceeds resilience adaptation funding"}</action>'

In [38]:
# Display the `query` field of the action parsed from that completion.
action.query

'RGGI proceeds resilience adaptation funding'

In [37]:
# Display the full result returned by the environment for the most recent round.
step

StepResult(observation=ResearchObservation(done=False, reward=None, metadata={}, task_id='policy_001', question='How are US states leveraging carbon pricing mechanisms to fund climate resilience?', instructions='Valid actions:\n1) SEARCH SEMANTIC SCHOLAR - query Semantic Scholar for academic papers. Use SHORT, KEYWORD-FOCUSED queries (3â€“6 terms). Avoid punctuation like quotes or OR clauses. Be sure to keep your queries short!!! Requires {"type":"SEARCH SEMANTIC SCHOLAR","query":"..."}.\n2) FETCH SEMANTIC SCHOLAR PAPER - fetch a specific semantic scholar paper from the search results. Requires {"type":"FETCH SEMANTIC SCHOLAR PAPER","paper_id":"..."}.\n3) ADD TO BIB - add a paper to your bibliography with a justification. Requires {"type":"ADD TO BIB","paper_id":"...","metadata":{"reason":"..."}}.\n4) WRITE NOTE - write a free-form note. Requires {"type":"WRITE NOTE","content":"..."}.\n5) SUBMIT - deliver the final memo. Requires {"type":"SUBMIT","content":"..."}.\n', last_tool_result=

In [26]:
# Read the Semantic Scholar key from the local key file when it exists
# (read_text closes the file, unlike a bare open(...).read()). If the file is
# missing, fall back to S2_API_KEY from the environment, which may be None.
_s2_key_file = Path('/Users/spangher/.s2_api_key.txt')
if _s2_key_file.is_file():
    s2_api_key = _s2_key_file.read_text().strip()
else:
    s2_api_key = os.getenv('S2_API_KEY')

In [39]:
# Send the action's query straight to the Semantic Scholar search helper,
# authenticating with `s2_api_key`.
test_semantic_scholar_search(query=action.query, api_key=s2_api_key)

Query 'RGGI proceeds resilience adaptation funding' returned 1 result(s).


[{'paperId': '527dc99b95d2e9a1cf37b619d0acfac22d92fe60',
  'title': 'Argentina - Adaptation Fund: Increasing Climate Resilience and Enhancing Sustainable Land Management in the Southwest of the Buenos Aires Province Project : restructuring',
  'year': 2018,
  'url': 'https://www.semanticscholar.org/paper/527dc99b95d2e9a1cf37b619d0acfac22d92fe60',
  'authors': 'Tuuli Johanna Bernardini'}]

# Test Semantic Scholar

In [25]:
import requests

def test_semantic_scholar_search(query: str, limit: int = 5, fields: str | None = None, api_key: str | None = None):
    """Fire a basic Semantic Scholar search to verify API/key wiring."""
    api_key = api_key or os.getenv('S2_API_KEY')
    headers = {'x-api-key': api_key} if api_key else {}
    params = {
        'query': query,
        'limit': limit,
        'fields': fields or 'title,year,authors.name,url',
    }
    url = 'https://api.semanticscholar.org/graph/v1/paper/search'
    response = requests.get(url, params=params, headers=headers, timeout=10)
    response.raise_for_status()
    payload = response.json()
    results = []
    for item in payload.get('data', []):
        authors = ', '.join(author.get('name', '') for author in item.get('authors', []) if author.get('name'))
        results.append(
            {
                'paperId': item.get('paperId'),
                'title': item.get('title'),
                'year': item.get('year'),
                'url': item.get('url'),
                'authors': authors,
            }
        )
    print(f"Query '{query}' returned {len(results)} result(s).")
    return results


In [None]:
# Close the environment client once the episode is finished.
env_client.close()
print('Environment closed.')