In [1]:
# --- Evaluation configuration -------------------------------------------------
# Locations of the vector index store and of the repository checkouts.
index_store_dir = "/tmp/index_store"
repo_base_dir = "/tmp/repos"

# Root folder holding all evaluation runs, and the name of this particular run.
evaluations_dir = "/tmp/moatless/evaluations"
evaluation_name = "20240609_moatless_gpt4o"

# Everything produced by this run lives under evaluation_dir:
#   trajs/           one trajectory JSON file per instance
#   all_preds.jsonl  one prediction per line
evaluation_dir = f"{evaluations_dir}/{evaluation_name}"
trajectory_dir = f"{evaluation_dir}/trajs"
predictions_path = f"{evaluation_dir}/all_preds.jsonl"

In [2]:
import json
import logging
import os
import subprocess
import time
import traceback

from moatless.benchmark.swebench import setup_swebench_repo, get_repo_dir_name
from moatless.benchmark.utils import setup_langfuse_tracing, get_total_cost, get_missing_spans
from moatless.loop import CodeLoop, SearchLoop

from moatless.workspace import Workspace

# Make sure the trajectory output directory (and any missing parents) exists.
# exist_ok=True keeps the cell idempotent on re-run and avoids the race between
# a separate existence check and the directory creation.
os.makedirs(trajectory_dir, exist_ok=True)

def to_result(trajectory: dict, instance: dict) -> dict:
    """Condense a trajectory dict into a flat per-instance summary row.

    Args:
        trajectory: Trajectory dict with an "info" mapping (providing
            "instance_id", "duration", "total_cost" and optionally "error" /
            "submission") and a "steps" sequence.
        instance: The benchmark instance; accepted for interface symmetry but
            not read by this function.

    Returns:
        A dict with keys "instance_id", "duration", "total_cost", "status" and
        "steps". "status" is "error" when an error was recorded, "generated"
        when a non-empty submission exists, and "not_generated" otherwise.
    """
    run_info = trajectory["info"]

    # A recorded error takes precedence over any submission that may exist.
    if "error" in run_info:
        outcome = "error"
    else:
        outcome = "generated" if run_info.get("submission") else "not_generated"

    return {
        "instance_id": run_info["instance_id"],
        "duration": run_info["duration"],
        "total_cost": run_info["total_cost"],
        "status": outcome,
        "steps": len(trajectory["steps"]),
    }

def run_test(instance):
    """Run the search-then-code pipeline for a single benchmark instance.

    If a trajectory file for the instance already exists and records either a
    non-empty submission or an error, that persisted trajectory is returned
    as-is and nothing is re-run. Otherwise the repository is set up, a
    SearchLoop locates relevant files, a CodeLoop edits them, and the resulting
    `git diff` of the repository is stored as the submission. Any exception
    raised by the loops is logged and recorded in the trajectory info instead
    of being propagated.

    Args:
        instance: Mapping that provides at least "instance_id" and
            "problem_statement".

    Returns:
        The trajectory as a dict: the JSON loaded from disk on a cache hit,
        otherwise `workspace.trajectory.dict()` after the run was persisted.
    """
    trajectory_path = os.path.join(trajectory_dir, f"{instance['instance_id']}.json")
    if os.path.exists(trajectory_path):
        with open(trajectory_path) as file:
            trajectory = json.load(file)
        # Reuse a finished run. Look "info" up once so that a persisted file
        # without that key falls through to a fresh run instead of raising
        # KeyError while evaluating the cache condition.
        cached_info = trajectory.get("info") or {}
        if cached_info.get("submission") or "error" in cached_info:
            return trajectory

    repo_dir = setup_swebench_repo(instance)
    persist_dir = os.path.join(
        index_store_dir, get_repo_dir_name(instance["instance_id"])
    )
    workspace = Workspace.from_dirs(repo_dir=repo_dir, index_dir=persist_dir)

    problem_statement = instance["problem_statement"]

    # The prompts are spelled with explicit escapes so their whitespace
    # (including the space after the colon) is visible and stable.
    search_instructions = f"Find the code relevant to solve this issue: \n\n{problem_statement}\n"
    code_instructions = f"Resolve this issue: \n\n{problem_statement}\n"

    info = {
        "evaluation_name": evaluation_name,
        "instance_id": instance["instance_id"]
    }

    trajectory = workspace.create_trajectory(
        "moatless", input_data={"problem_statement": problem_statement}
    )

    start_time = time.time()
    try:
        search_loop = SearchLoop(workspace, instructions=search_instructions, trajectory=trajectory, max_cost=0.25)
        search_response = search_loop.execute()

        if not search_response.files:
            info["error"] = "No files found"
        else:
            code_loop = CodeLoop(workspace, trajectory=trajectory, instructions=code_instructions, files=search_response.files, max_cost=0.5)
            code_loop.execute()

            # The submission is whatever the code loop changed in the working tree.
            output = subprocess.run(
                ["git", "diff"],
                capture_output=True,
                text=True,
                cwd=repo_dir,
            )

            info["submission"] = output.stdout
    except Exception:
        # Best effort: keep the evaluation going, but keep the full traceback
        # in the trajectory info and in the log.
        info["error"] = traceback.format_exc()
        logging.exception(f"Error in evaluation of {instance['instance_id']} ")

    info["duration"] = time.time() - start_time
    info["total_cost"] = trajectory.total_cost()

    trajectory.save_info(info)
    trajectory.persist(file_path=trajectory_path)

    return workspace.trajectory.dict()

In [3]:
from moatless.benchmark.swebench import sorted_instances
from pandas import DataFrame
import pandas as pd
from tqdm.notebook import tqdm
import json

def run_evaluation(dataset: str = "princeton-nlp/SWE-bench_Lite", split="test"):
    """Run every instance of a dataset split and collect the per-instance results.

    The predictions file is emptied first. Each instance is then processed with
    `run_test`, summarised with `to_result`, and its prediction is appended to
    the predictions file as one JSON line. Running averages are shown on the
    progress bar. Instances for which `run_test` returns a falsy value are
    counted as errors and produce neither a result row nor a prediction.

    Args:
        dataset: Dataset identifier forwarded to `sorted_instances`.
        split: Dataset split forwarded to `sorted_instances`.

    Returns:
        A pandas DataFrame with one row per summarised instance.
    """
    instances = sorted_instances(dataset, split)

    # Start from an empty predictions file; lines are appended per instance below.
    with open(predictions_path, "w") as file:
        file.write("")

    processed = 0
    n_generated = 0
    n_errors = 0
    duration_total = 0
    cost_total = 0

    rows = []
    stats = {}
    progress = tqdm(instances)
    for instance in progress:
        trajectory = run_test(instance)
        if not trajectory:
            n_errors += 1
            continue

        row = to_result(trajectory, instance)
        rows.append(row)
        processed += 1

        duration_total += row["duration"]
        cost_total += row["total_cost"]

        status = row["status"]
        if status == "error":
            n_errors += 1
        if status in ("generated", "failed", "resolved"):
            n_generated += 1

        # Entries are only added once their running total is positive; the
        # insertion order determines the order shown on the progress bar.
        if duration_total > 0:
            stats["avg_duration"] = duration_total / processed
        if cost_total > 0:
            stats.update(avg_cost=cost_total / processed, total_cost=cost_total)
        if n_generated > 0:
            stats["generated"] = f"{n_generated / processed * 100:.2f}%"
        stats["error"] = n_errors

        progress.set_postfix(stats)

        # Append right away so predictions written so far survive an interrupted run.
        line = json.dumps(
            {
                "model_name_or_path": evaluation_name,
                "instance_id": instance["instance_id"],
                "model_patch": trajectory["info"].get("submission", ""),
            }
        )
        with open(predictions_path, "a") as file:
            file.write(line + "\n")

    return pd.DataFrame(rows)

# Run the evaluation with the default dataset and split. As the last expression
# of the cell, the returned DataFrame is what the notebook displays as output.
run_evaluation()

  0%|          | 0/300 [00:00<?, ?it/s]

Unnamed: 0,instance_id,duration,total_cost,status,steps
0,psf__requests-863,12.111127,0.114315,generated,9
1,psf__requests-1963,15.384166,0.054165,generated,5
2,psf__requests-2148,25.590530,0.168155,generated,11
3,psf__requests-2317,16.561865,0.051275,generated,5
4,psf__requests-2674,23.387423,0.192025,generated,12
...,...,...,...,...,...
295,astropy__astropy-14995,15.336708,0.130895,generated,7
296,mwaskom__seaborn-3407,10.660600,0.116240,generated,6
297,pytest-dev__pytest-11148,28.592011,0.338270,generated,9
298,django__django-17051,23.842120,0.118780,generated,7
