<a href="https://colab.research.google.com/gist/bradhilton/a4cfc8d61f52b0c789524afed83d4f89/fix-openpipe-rl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the pinned ART release plus helper packages (pre-releases allowed,
# pip cache disabled), then pin the transformers / trl / vllm / peft versions.
!uv pip install -q openpipe-art==0.3.11.post2 langchain-core tenacity "gql<4" --prerelease allow --no-cache-dir
!uv pip install transformers==4.51.3 trl==0.15.2 vllm==0.9.1 peft==0.15.2

[2mUsing Python 3.11.13 environment at: /usr[0m
[2K[37m⠧[0m [2mtriton==3.3.0                                                                 [0m

In [None]:
# Install the pinned Reasoning Gym release; it is imported below as `reasoning_gym`.
!uv pip install reasoning-gym==0.1.23

In [None]:
import os
from dotenv import load_dotenv

# Pull any credentials defined in a local .env file into os.environ first.
load_dotenv()

# Prefer a key that is already present in the environment (shell export or
# .env) and only fall back to the inline placeholder, which can still be
# replaced by hand. Previously the inline literals were exported
# unconditionally, which clobbered whatever load_dotenv() had just loaded.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY") or "REDACTED"  # set your key
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "REDACTED"  # set your OpenAI key
OPENPIPE_API_KEY = os.environ.get("OPENPIPE_API_KEY") or "REDACTED"

# Export the resolved values so libraries that read os.environ see them.
os.environ["WANDB_API_KEY"] = WANDB_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY
os.environ["OPENPIPE_API_KEY"] = OPENPIPE_API_KEY

In [None]:
"""### Imports & Helpers
"""
import re, math, random, time, os, requests
from typing import TypedDict

from dotenv import load_dotenv
import openai
from openai import AsyncOpenAI
import art
from art.local import LocalBackend
from pydantic import BaseModel
import weave

# -- Reasoning Gym
import reasoning_gym as rg
from reasoning_gym.composite import DatasetSpec
from reasoning_gym import get_score_answer_fn

# Load variables from a local .env file (if one exists) into the environment.
load_dotenv()
# Configure the module-level openai key from the environment (None if unset).
openai.api_key = os.getenv("OPENAI_API_KEY")
# Fixed seed for the global `random` RNG, which make_rg_entry uses to draw
# dataset seeds when no explicit seed is given.
random.seed(42)  # reproducibility

# -------- Answer extraction --------
ANSWER_TAG_RE = re.compile(r"<answer>(.*?)</answer>", re.S | re.I)

def extract_answer_text(text: str) -> str | None:
    """Return raw contents inside <answer>...</answer>, trimmed.
    If no tags, return None (we'll penalize)."""
    m = ANSWER_TAG_RE.search(text or "")
    if not m:
        return None
    return m.group(1).strip()

# -------- RG task spec --------
# (dataset name, weight) pairs fed to DatasetSpec below, grouped by topic.
_RG_WEIGHTED_TASKS = (
    # Arithmetic
    ("basic_arithmetic", 3),
    ("decimal_arithmetic", 2),
    ("fraction_simplification", 2),
    ("power_function", 1),
    ("time_intervals", 1),
    ("calendar_arithmetic", 1),
    ("leg_counting", 1),
    # Algorithms
    ("prime_factorization", 2),  # has min_value/max_value
    ("gcd", 1),
    ("lcm", 1),
    ("count_primes", 1),
    ("count_bits", 1),
    ("chain_sum", 1),
    ("decimal_chain_sum", 1),
    ("base_conversion", 1),
    ("number_sorting", 1),
    # Geometry
    ("rectangle_count", 1),
    ("simple_geometry", 1),
    # Induction
    ("number_sequence", 1),
)

# One DatasetSpec per task, each with its own empty config so the dataset
# defaults apply; order matches the table above.
RG_SPECS = [
    DatasetSpec(task_name, weight=task_weight, config={})
    for task_name, task_weight in _RG_WEIGHTED_TASKS
]

def make_rg_entry(step: int, seed: int | None = None):
    """Draw one sample from the weighted composite Reasoning Gym dataset.

    Args:
        step: Training step. Currently unused; kept as the hook for a
            curriculum that would vary dataset configs over time (for now
            the dataset defaults are relied upon).
        seed: Seed handed to dataset creation. When ``None``, a random value
            in ``[0, 2**31 - 1]`` is drawn from the global ``random`` RNG.

    Returns:
        The single item of a size-1 "composite" dataset built from
        ``RG_SPECS`` (per the original note: a dict with keys ``question``,
        ``answer`` and ``metadata``).
    """
    dataset_seed = random.randint(0, 2**31 - 1) if seed is None else seed
    composite = rg.create_dataset(
        "composite",
        size=1,
        seed=dataset_seed,
        datasets=RG_SPECS,
    )
    return composite[0]

def score_rg_answer(entry: dict, answer_text: str) -> float:
    """Score *answer_text* against *entry* with Reasoning Gym's own verifier.

    The verifier is looked up by the entry's ``metadata["source_dataset"]``
    name and called as ``verifier(answer_text, entry)``.

    Returns:
        The verifier's result coerced to ``float``. Callers treat ``>= 1.0``
        as correct. NOTE(review): the original docstring promised exactly
        1.0 or 0.0 -- confirm that no dataset returns partial credit.
    """
    verifier = get_score_answer_fn(entry["metadata"]["source_dataset"])
    return float(verifier(answer_text, entry))


INFO 07-30 23:13:03 [__init__.py:244] Automatically detected platform cuda.


  if event.key is 'enter':

  from pkg_resources import resource_stream, resource_exists

  if memoize is "recursive":

  if memoize is "recursive":

  if memoize is "recursive":

  if memoize is "recursive":



In [None]:
"""### Declare the Trainable Model
"""

# Trainable policy: Qwen2.5-0.5B-Instruct, registered under the given
# name/project.
model = art.TrainableModel(
    name='testname',
    project    = "autotransmission-single-turn",
    base_model = "Qwen/Qwen2.5-0.5B-Instruct",
)
# NOTE(review): `_internal_config` is a private attribute set directly here;
# it carries the 4096-token max sequence length and the engine flags
# (eager execution, 0.7 GPU memory utilization) -- confirm it is still
# honoured by the pinned ART version.
model._internal_config = art.dev.InternalModelConfig(
    init_args   = art.dev.InitArgs(max_seq_length=4096),
    engine_args = art.dev.EngineArgs(enforce_eager=True, gpu_memory_utilization=0.7),
)

# In-process local backend using "./.art" as its path; the model must be
# registered with it before inference or training calls below.
backend = LocalBackend(in_process=True, path="./.art")
await model.register(backend)

# Initialise Weave under the same project name as the model.
weave.init(model.project)

In [None]:
"""### Rollout Scenario
"""
class ScenarioAIME(BaseModel):
    """Per-rollout scenario handed to ``rollout``.

    NOTE(review): the "AIME" in the name looks like a leftover -- the
    problems used in this notebook come from Reasoning Gym. Confirm before
    renaming.
    """
    # Global training step; rollout forwards it to make_rg_entry and records
    # it in the trajectory metadata.
    step: int

"""### Rollout Function (now uses Reasoning Gym)

We train Qwen to *rewrite* the user problem into a great prompt for GPT‑4.1‑nano.
We then score GPT‑4.1‑nano’s final <answer> against RG’s verifier.
Reward = 2 if correct, 0 if incorrect, −1 if no <answer>.
"""

@weave.op
@art.retry(exceptions=(requests.ReadTimeout,))
async def rollout(model: art.Model, _unused, scenario: ScenarioAIME) -> art.Trajectory:
    """Run one prompt-rewriting rollout and reward it by the solver's answer.

    The policy (``model``) rewrites a Reasoning Gym question into a prompt
    for gpt-4.1-nano; gpt-4.1-nano answers that prompt, and the contents of
    its ``<answer>`` tags are scored with Reasoning Gym's verifier.

    Args:
        model: Policy whose ``openai_client()`` generates the rewritten prompt.
        _unused: Historically ignored. When it is a Reasoning Gym entry dict
            (has a ``"question"`` key) it is now used as the problem, so that
            every rollout of a ``TrajectoryGroup`` built from one entry is
            scored on the *same* problem. Any other value (e.g. ``None``)
            keeps the old behaviour of sampling a fresh entry.
        scenario: Carries the training step (used for sampling and metadata).

    Returns:
        The trajectory with ``reward`` set to 2 (correct), 0 (wrong), or
        -1 (policy completion failed, or no ``<answer>`` tags in the reply).

    Raises:
        Whatever the gpt-4.1-nano request raises; that call is deliberately
        not wrapped so infrastructure failures are not turned into a reward
        for the policy.
    """
    # --- pick the RG entry ------------------------------------------------------
    # Rollouts in one group must share a problem, otherwise rewards compared
    # within the group reflect problem difficulty instead of rewrite quality.
    if isinstance(_unused, dict) and "question" in _unused:
        entry = _unused
    else:
        entry = make_rg_entry(step=scenario.step)
    problem_text = entry["question"]                 # plain text question
    gold_answer  = entry["answer"]                   # ground truth, only used for logging
    source_ds    = entry["metadata"]["source_dataset"]

    # --- init trajectory --------------------------------------------------------
    sys_prompt = """<task_description>
You will be given a question (<original_question>). Your job is to rewrite this question in a way that will help another model (gpt-4.1-nano) answer it correctly. The prompt you rewrite will be the *only* thing gpt-4.1-nano generates against, so make it clear, faithful to the <original_question>, and easier to answer properly than just the raw <original_question>. Think: "what reframing would make it easier to the model to arrive at the correct answer?"

You should make the question clearer and easier to understand and answer while keeping the same meaning.
</task_description>

<IMPORTANT>
Ensure that GPT-4.1-nano is instructed to put its final answer after thinking inside <answer> tags, otherwise, it will not be counted. You may instruct it to do anything you want it to to get to the <answer> before committing to it. Ensure the ONLY thing inside the <answer> tags is the actual answer, no other text or explanation. For example: <answer>43-24=19</answer> includes explanation, so it would not be counted. But <answer>19</answer> will be counted.
</IMPORTANT>

<examples>
**Example 1 — Math word problem (structured variables)**

* **Original:** “A car drives 60 mph for 2 hours and 30 mph for 1 hour. What is the average speed for the whole trip?”
* **Rewritten:**
  “You are given two travel segments.

  * Segment A: speed = 60 mph, time = 2 h
  * Segment B: speed = 30 mph, time = 1 h
    Let total distance `D = 60*2 + 30*1`. Let total time `T = 2 + 1`. Compute average speed `S_avg = D / T` in mph.
    Show your calculation steps if needed, then give only the numeric average speed (mph not included) as the final result inside `<answer>` tags.
    **Final answer format:** `<answer>{number}</answer>`.”

**Example 2 — Unit conversion (explicit constants + IO spec)**

* **Original:** “How many grams are in 2.5 pounds?”
* **Rewritten:**
  “Convert mass from pounds to grams. Use the exact factor `1 lb = 453.59237 g`.
  Compute `grams = 2.5 * 453.59237`. Round to the nearest whole number.
  Output only the rounded integer inside `<answer>` tags (no units).
  **Final answer format:** `<answer>{integer}</answer>`.”

**Example 3 — Logic puzzle (edge cases + minimality)**

* **Original:** “There are three boxes labeled ‘apples’, ‘oranges’, and ‘mixed’, but all labels are wrong. What is the smallest number of fruit you must draw to label the boxes correctly?”
* **Rewritten:**
  “You have 3 mislabeled boxes: A, O, M. Each contains {only apples}, {only oranges}, or {a mix}, and each current label is incorrect. Determine the **minimum** number of single‑fruit draws (without looking) needed to deduce the true contents of all three boxes, assuming you can choose which box to draw from and observe the fruit type drawn. To do this, first think for at least 25 paragraphs inside <thinking> tags, and then write your answer inside <answer> tags.
  Provide only the minimal required number inside `<answer>` tags.
  **Final answer format:** `<answer>{integer}</answer>`.”

**Example 4 — Programming/string rules (clarify requirements)**

* **Original:** “Compress ‘aaabbcaaa’ with run-length encoding.”
* **Rewritten:**
  “Apply simple run‑length encoding (RLE) to the ASCII string `aaabbcaaa` with these rules:

  * Consecutive runs are encoded as `<char><count>`.
  * If a run length is 1, write only the character (no ‘1’).
  * Example: `AAB` → `A2B`.
    Compute the RLE of `aaabbcaaa` and return only the encoded string inside `<answer>` tags.
    **FINAL ANSWER SHOULD BE IN THIS FORMAT:** `<answer>{string}</answer>`.”

**Example 5 — Geography (disambiguation + output constraints)**

* **Original:** “Which African country borders both the Atlantic Ocean and the Mediterranean Sea?”
* **Rewritten:**
  “Find the **single** African country that has coastlines on **both** the Atlantic Ocean and the Mediterranean Sea. To do this, first, list all African countries. Then, for each, one by one, list the coastlines (with specifics) that each country has. From there, select the one that has coastlines on **both** the Atlantic Ocean and the Mediterranean Sea. Once you are sure of your answer, return only the country name, with standard English spelling, inside `<answer>` tags.
  **Place your final answer in this format at the end of your response:** `<answer>{country}</answer>`.”
</examples>

Remember, your job is to rewrite the <original_question> to make it more likely for gpt-4.1-nano to get the correct answer. Do this incredibly well."""

    trajectory = art.Trajectory(
        messages_and_choices=[
            {"role": "system", "content": sys_prompt},
            {"role": "user",   "content": f"<original_question>{problem_text}</original_question>\n\nRewrite the <original_question> to make it more likely for gpt-4.1-nano to get the correct answer. DO NOT solve it yourself. Just rewrite the <original_question>."},
        ],
        metadata={
            "problem_id"   : f"{source_ds}:{entry['metadata'].get('source_index', 'na')}",
            "step"         : scenario.step,
            "rg_dataset"   : source_ds,
        },
        reward=0,
    )

    # --- Qwen produces the improved prompt -------------------------------------
    client = model.openai_client()  # Qwen inference
    try:
        chat = await client.chat.completions.create(
            messages      = trajectory.messages(),
            model         = model.name,
            max_tokens    = 2000,
            stream        = False,
            temperature   = 1.0,
            stop          = [],
        )
    except Exception as e:
        # Best effort (kept from the original): report the failure and hand
        # back a penalized trajectory instead of aborting the whole group.
        print("Qwen completion error:", e)
        trajectory.reward = -1
        return trajectory

    choice  = chat.choices[0]
    qwen_prompt = (choice.message.content or "").strip()
    # Append the Choice object itself (not just its text) to the trajectory.
    trajectory.messages_and_choices.append(choice)

    # --- GPT-4.1-nano solves the rewritten prompt ------------------------------
    # The rewritten prompt is the solver's ONLY input (no system message), so
    # the <answer>-tag instructions must come from the policy's rewrite.
    openai_client = AsyncOpenAI()
    tool_resp = await openai_client.chat.completions.create(
        model    = "gpt-4.1-nano",
        messages = [
            {"role": "user", "content": qwen_prompt},
        ],
        temperature = 0.0,
    )
    tool_content = (tool_resp.choices[0].message.content or "").strip()

    # --- extract final answer text & score -------------------------------------
    answer_text = extract_answer_text(tool_content)
    if answer_text is None:
        # Missing <answer> tags: the rewrite failed to enforce the format.
        trajectory.reward = -1
    else:
        score = score_rg_answer(entry, answer_text)
        trajectory.reward = 2 if score >= 1.0 else 0
        print(
            f"Step {scenario.step}  |  DS={source_ds:<20}  "
            f"{'Correct' if score>=1.0 else 'Wrong'}  "
            f"ANS='{answer_text}'  "
            f"CORRECT={gold_answer}"
        )

    return trajectory


In [None]:
"""### Training Loop (RG procedural data)
"""
# ---- pick how many problems per step and rollouts per problem
# P = 4                      # problems per step
# R = 8                      # rollouts per problem
# NUM_EPOCHS = 2
# NUM_STEPS_PER_EPOCH = 64

P = 1                      # problems per step
R = 2                      # rollouts per problem
NUM_EPOCHS = 2
NUM_STEPS_PER_EPOCH = 128

for epoch in range(NUM_EPOCHS):
    print(f"=== Epoch {epoch+1}/{NUM_EPOCHS} ===")
    for step in range(NUM_STEPS_PER_EPOCH):
        global_step = epoch * NUM_STEPS_PER_EPOCH + step

        # Sample P entries (you can bias by weights yourself if desired)
        entries = [make_rg_entry(step=global_step) for _ in range(P)]

        train_groups_iter = []
        for entry in entries:
            tg = art.TrajectoryGroup(
                rollout(model, entry, ScenarioAIME(step=global_step))
                for _ in range(R)
            )
            train_groups_iter.append(tg)

        train_groups = await art.gather_trajectory_groups(
            tuple(train_groups_iter),
            pbar_desc=f"Epoch {epoch+1}/{NUM_EPOCHS}  Step {step+1}/{NUM_STEPS_PER_EPOCH}",
            max_exceptions=18,
        )

        await model.train(
            train_groups,
            config  = art.TrainConfig(learning_rate=0.0000025),
            _config = {"logprob_calculation_chunk_size": 8},
        )

        await model.delete_checkpoints()


In [None]:
# Smoke test: request a short (max 20 tokens) reply to "hi" from the model
# through the client returned by model.openai_client().
client = model.openai_client()  # Qwen inference

chat = await client.chat.completions.create(
    messages      = [{"role": "user", "content": "hi"}],
    model         = model.name,
    max_tokens    = 20,
    stream        = False,
    temperature   = 1.0,
    stop          = [],
)


# Leave the first choice as the cell's last expression so the notebook shows it.
choice  = chat.choices[0]
choice

[36m[1mweave[0m: 🍩 https://wandb.ai/otherside/autotransmission-single-turn/r/call/01985d76-2057-7362-b195-3ab68e897106


APIConnectionError: Connection error.