# 0 - Quick Imports

In [1]:
# Notebook bootstrap: put the repository's `src` directory on the import path
# and switch the working directory to the repository checkout.
import os
import sys
# 'src' is a relative path, so the later project imports depend on the working
# directory that the chdir below establishes.
sys.path.append('src')
# NOTE(review): hard-codes a checkout at ~/project/r2e/r2e-edits-internal;
# adjust when running on another machine.
os.chdir(os.path.join(os.path.expanduser('~'),'project/r2e/r2e-edits-internal'))

!pwd

/home/gcpuser/project/r2e/r2e-edits-internal


# TestGen

In [2]:
import fire
import glob
import pandas as pd
from pathlib import Path
from collections import defaultdict
from datasets import load_dataset, Dataset
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
from tqdm import tqdm

from r2e_edits.agenthub.runtime.docker import DockerRuntime
from r2e_edits.agenthub.environment.env import EnvArgs, RepoEnv
from r2e_edits.agenthub.agent.agent import AgentArgs, Agent
from r2e_edits.agenthub.trajectory.trajectory import Trajectory
from r2e_edits.agenthub.run.testgen import runagent

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
2025-03-15 03:02:24,652 - httpx - INFO - HTTP Request: GET https://raw.githubusercontent.com/BerriAI/litellm/main/model_prices_and_context_window.json "HTTP/1.1 200 OK"
* 'fields' has been removed


In [3]:
def extract_patch(messages):
    """
    Extract the patch text from a message list.

    The patch is read from ``messages[1]["content"]`` and must be wrapped in
    markers, each on its own line:
      [PATCH]
      ...
      [/PATCH]

    Args:
        messages: indexable sequence of message mappings; the second entry is
            expected to carry the patch in its "content" field.

    Returns:
        The text between "[PATCH]\\n" and "\\n[/PATCH]", or None when the
        message list is missing/too short, the content is not a string, or the
        markers are absent.
    """
    try:
        content = messages[1]["content"]
    except (IndexError, KeyError, TypeError, AttributeError):
        # TypeError covers messages=None and non-mapping entries.
        return None

    # Content can be None (or another non-string); `in` on it would raise.
    if not isinstance(content, str):
        return None
    if "[PATCH]" not in content or "[/PATCH]" not in content:
        return None

    # The opening marker must be followed by a newline; otherwise there is
    # nothing to extract.
    pieces = content.split("[PATCH]\n")
    if len(pieces) < 2:
        return None
    return pieces[1].split("\n[/PATCH]")[0]


def best_verif(df):
    """
    Print the pass rate obtained when keeping the top-n verifier picks per image.

    For n = 1..5, the rows of each ``docker_image`` group whose
    ``avg_yes_prob`` is among that group's n largest values are selected, the
    maximum ``rewards`` value per image is taken, and the sum of those maxima
    is divided by the number of distinct images.

    Args:
        df: DataFrame with ``docker_image``, ``avg_yes_prob`` and ``rewards``
            columns.

    Returns:
        None; one summary line per n is printed.
    """

    def topn(n):
        num_images = len(df.docker_image.unique())

        # n highest verifier probabilities inside every image group.
        top_probs = (
            df.groupby("docker_image")["avg_yes_prob"].nlargest(n).reset_index()
        )
        # Recover the full rows that carry those probabilities.
        candidates = pd.merge(df, top_probs, on=["docker_image", "avg_yes_prob"])

        best_reward_per_image = candidates.groupby("docker_image")["rewards"].max()
        score = best_reward_per_image.sum() / num_images

        print(f"Pass@1 top-{n} verif: {score} ({num_images})")

    for n in range(1, 6):
        topn(n)


# Agent configuration loaded once from the edit-rft YAML (path is relative to the
# working directory); run_test_patch reads its `command_files` attribute.
agent_args = AgentArgs.from_yaml(Path("src/r2e_edits/agenthub/train/edit-rft.yaml"))

def run_test_patch(ds, docker_image, test_patch, patch):
    """
    Score one candidate patch against a generated test in a fresh environment.

    A RepoEnv is created for ``docker_image``, the agent command files are
    registered, ``test_patch`` and then ``patch`` are applied, and
    ``python3 test_issue.py -v`` is executed. Nothing is reverted afterwards;
    the environment is simply closed.

    Args:
        ds: dataset entry forwarded to EnvArgs.
        docker_image: image name forwarded to EnvArgs.
        test_patch: patch applied before the candidate patch. When empty or
            None no environment is started and 0.0 is returned.
        patch: candidate patch. When empty or None it is skipped and the test
            runs against the repository with only ``test_patch`` applied.

    Returns:
        1.0 if the command output contains "resolved", otherwise 0.0. 0.0 is
        also returned when applying ``patch`` or running the test raises.

    Raises:
        Whatever environment creation/setup or applying ``test_patch`` raises.
    """
    # Without a test there is nothing to run, so do not start an environment.
    if not test_patch:
        return 0.0

    # Bound before the try so the finally clause is safe even when
    # constructing the environment fails.
    env = None
    try:
        env = RepoEnv(EnvArgs(ds, docker_image=docker_image))
        env.reset()
        env.add_commands(agent_args.command_files)
        env.runtime.apply_patch(test_patch)
        try:
            if patch:  # empty patches are skipped
                env.runtime.apply_patch(patch)
            out, _error_code = env.runtime.run(
                "execute_bash --cmd 'python3 test_issue.py -v'"
            )
            # NOTE(review): plain substring match, so output containing
            # "unresolved" would also count -- confirm against what
            # test_issue.py prints.
            return 1.0 if "resolved" in out else 0.0
        except Exception as e:
            print(f"Error during patch testing: {e}")
            return 0.0
    finally:
        if env is not None:
            env.close()


In [4]:
# Load the patch-verifier rollouts, normalise the image column name to
# `docker_image`, and attach the patch extracted from each row's messages.
rollout_ds = load_dataset("r2e-edits/32b_swebv_temp08_10_patch_verifier")
df = rollout_ds["train"].to_pandas().rename(columns={"docker_images": "docker_image"})
df = df.assign(patch=df["messages"].apply(extract_patch))


In [5]:
# One row per docker image; every other column is aggregated into a list of that
# image's per-rollout values. as_index=False keeps docker_image as a column.
grouped = df.groupby('docker_image', as_index=False).agg(list)
# Handle on the raw train split (not referenced by the other cells visible here).
docker_ds = rollout_ds['train']

In [6]:
def run_all_patches(ds_entry, test_patch, patches, max_workers=10):
    """
    Run ``run_test_patch`` for every candidate patch in parallel.

    Args:
        ds_entry: dataset entry; its 'docker_image' value selects the image.
        test_patch: test patch applied before each candidate patch.
        patches: sized sequence of candidate patches.
        max_workers: number of worker processes (defaults to 10).

    Returns:
        A list of rewards aligned with ``patches``. A candidate whose worker
        raised is reported as 0.0 after the error is printed.
    """
    if len(patches) == 0:
        return []

    # Preallocated so results land at their patch position regardless of the
    # order in which workers finish.
    rewards = [0.0] * len(patches)
    with ProcessPoolExecutor(max_workers) as executor:
        futures = {
            executor.submit(
                run_test_patch, ds_entry, ds_entry['docker_image'], test_patch, patch
            ): idx
            for idx, patch in enumerate(patches)
        }
        for future in tqdm(as_completed(futures), total=len(futures)):
            idx = futures[future]
            try:
                rewards[idx] = future.result()
            except Exception as e:
                print(f"~~~~ Error: {e}")
                rewards[idx] = 0.0
    return rewards
            

In [None]:
# Reference task table indexed by docker image so an entry can be fetched with
# .loc[docker_image]; drop=False keeps docker_image available as a column too.
true_ds = load_dataset("r2e-edits/swebench-verified-v1", split="test").to_pandas().set_index("docker_image", drop=False)

In [None]:
def run_index(index, filter=True):
    """
    Generate a test for one task and score that task's candidate patches with it.

    The row at position ``index`` of ``grouped`` supplies the docker image,
    the ground-truth rewards and the candidate patches. Up to two candidate
    patches are appended to the task's problem statement as examples, the
    test-generation agent is run, and the resulting test patch is evaluated
    against every selected candidate.

    Args:
        index: positional row index into ``grouped``.
        filter: when true, keep only the candidates whose ``p2p_rate`` entry
            has the maximum length for this image.

    Returns:
        The Trajectory produced by the test-generation run.
    """
    row = grouped.iloc[index]
    docker_image = row["docker_image"]
    gt_rewards = row["rewards"]
    patches = row["patch"]

    if filter:
        # NOTE(review): ranks candidates by len() of each p2p_rate entry --
        # presumably the number of passing pass-to-pass tests; confirm schema.
        p2p = [len(x) for x in row["p2p_rate"]]
        max_p2p = max(p2p)
        keep = [i for i, x in enumerate(p2p) if x == max_p2p]
        patches = [patches[i] for i in keep]
        gt_rewards = [gt_rewards[i] for i in keep]

    # NOTE(review): assumes docker_image is unique in true_ds, so .loc yields
    # a single row -- confirm.
    selected_ds_entry = true_ds.loc[docker_image].to_dict()

    # Show the last (up to) two extracted patches, most recent first. Patches
    # that failed extraction (None/empty) are skipped rather than rendered as
    # the text "None", and a single surviving candidate no longer raises.
    examples = [p for p in reversed(patches) if p][:2]
    if examples:
        patch_string = "\n\nHere are some example patches that have been attempted:\n"
        for number, example in enumerate(examples, start=1):
            patch_string += f"Patch #{number}:\n\n{example}\n\n"
        selected_ds_entry['problem_statement'] = (
            selected_ds_entry['problem_statement'] + patch_string
        )

    testgen_traj = Trajectory.load_from_model_dump_json(runagent(
        selected_ds_entry,
        'test_exp',
        max_steps=30,
        max_steps_absolute=40,
        llm_name="vertex_ai/claude-3-5-sonnet-v2@20241022"
    ))

    print(testgen_traj.output_patch)

    # Every selected candidate is scored, including ones without example text.
    rewards = run_all_patches(selected_ds_entry, testgen_traj.output_patch, patches)

    print(f"GT rewards: {gt_rewards}")
    print(f"Predicted rewards: {rewards}")
    return testgen_traj
    


In [None]:
def run_docker_name(docker_name):
    """
    Run ``run_index`` for the ``grouped`` row whose docker image is ``docker_name``.

    Args:
        docker_name: full docker image name to look up in ``grouped``.

    Returns:
        The Trajectory returned by ``run_index``.

    Raises:
        IndexError: if no row of ``grouped`` has that docker image.
    """
    # run_index addresses rows by position (iloc), so resolve a positional
    # index rather than an index label.
    positions = (grouped['docker_image'] == docker_name).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"docker image {docker_name!r} not found in grouped")
    return run_index(int(positions[0]))

In [None]:
# run_index(0)

In [None]:
# Generate a test for the pytest-5631 task and score its candidate patches.
run_docker_name("slimshetty/swebench-verified:sweb.eval.x86_64.pytest-dev__pytest-5631")

In [None]:
# Display the test_patch column of the reference task table.
true_ds.test_patch

In [None]:
def count_num_files(patch):
    """
    Count the distinct files named by ``diff --git`` headers in a patch.

    For every line starting with ``diff --git`` the third whitespace-separated
    token is taken and its first two characters (the ``a/`` prefix) dropped;
    the number of unique results is returned.

    Args:
        patch: unified diff text. Non-string values (None, NaN from pandas)
            count as an empty patch.

    Returns:
        Number of unique file paths; 0 for empty or missing patches.
    """
    if not isinstance(patch, str):
        return 0

    files = set()
    for line in patch.split('\n'):
        if not line.startswith('diff --git'):
            continue
        tokens = line.split()
        # Skip truncated headers that carry no path token.
        if len(tokens) > 2:
            files.add(tokens[2][2:])
    return len(files)
# Add a per-task count of the distinct files touched by the reference test patch
# (this mutates true_ds in place).
true_ds['test_num_files'] = true_ds['test_patch'].apply(count_num_files)

In [None]:
# Distribution of how many files each reference test patch touches.
true_ds['test_num_files'].value_counts()

In [None]:
# Print the reference test patch of the pytest-5631 task.
print(true_ds.test_patch['slimshetty/swebench-verified:sweb.eval.x86_64.pytest-dev__pytest-5631'])

In [None]:
# Inspect the candidate patches of the pytest-5631 task: the same selection that
# run_index performs, kept at cell level so the variables can be examined.
index = grouped[grouped['docker_image'] == "slimshetty/swebench-verified:sweb.eval.x86_64.pytest-dev__pytest-5631"].index[0]

# `index` is an index label, so address the row with .loc rather than .iloc.
row = grouped.loc[index]
docker_image = row["docker_image"]
gt_rewards = row['rewards']
patches = row['patch']
p2p = [len(x) for x in row['p2p_rate']]

max_p2p = max(p2p)
max_p2p_indices = [i for i, x in enumerate(p2p) if x == max_p2p]

# Explicit flag: the previous `if filter:` tested the builtin `filter`, which is
# always truthy, so the max-p2p filter was applied unconditionally.
keep_only_max_p2p = True
if keep_only_max_p2p:
    patches = [patches[i] for i in max_p2p_indices]
    gt_rewards = [gt_rewards[i] for i in max_p2p_indices]
    p2p = [p2p[i] for i in max_p2p_indices]

# Second-to-last selected candidate; needs at least two selected patches.
print(patches[-2])

In [None]:
# Last candidate patch in the `patches` list built by the preceding cell.
print(patches[-1])