In [2]:
from moatless.benchmark.utils import get_missing_files


def evaluate_prediction(info: dict) -> str:
    """Classify the outcome of one trajectory from its ``info`` record.

    Reads the notebook-level ``evaluation_dir`` to locate ``report.json``.

    Args:
        info: Trajectory info dict. ``"error"`` and ``"submission"`` are
            optional; ``"instance_id"`` is required once a report file exists.

    Returns:
        ``"error"`` if the run recorded an error, ``"not_submitted"`` if there
        is no (or an empty) submission, ``"resolved"`` / ``"failed"`` when the
        instance is listed under the report's ``"resolved"`` / ``"applied"``
        entries, and ``"generated"`` when a patch exists but no report file is
        present or the report does not list the instance.
    """
    if "error" in info:
        return "error"

    if not info.get("submission"):
        return "not_submitted"

    report_path = f"{evaluation_dir}/report.json"
    if not os.path.exists(report_path):
        # Evaluation is not run from here: the SWE-bench report has to be
        # generated separately, after which this function can be re-run.
        return "generated"

    with open(report_path, "r") as report_file:
        report = json.load(report_file)

    instance_id = info["instance_id"]
    if instance_id in report["resolved"]:
        return "resolved"
    if instance_id in report["applied"]:
        return "failed"
    return "generated"

def to_result(trajectory: dict, instance: dict) -> dict:
    """Flatten one trajectory into a single result row.

    Args:
        trajectory: Trajectory dict with an ``"info"`` record and a list of
            ``"steps"``, each step holding a list of ``"actions"``.
        instance: Dataset instance; ``"resolved_by"`` and ``"expected_spans"``
            are read here.

    Returns:
        A dict with the fixed summary columns plus dynamically added counters:
        ``action_<name>`` per action name, ``search_<key>`` per input key of
        the first search action in each step, and ``change_<type>`` per
        requested type of change.
    """
    info = trajectory["info"]

    result = {
        "instance_id": info["instance_id"],
        "duration": info["duration"],
        "total_cost": info["total_cost"],
        "status": evaluate_prediction(info),
        "steps": len(trajectory["steps"]),
        "search_steps": 0,
        "found_file": False,
        "changes": 0,
        "no_changes": 0,
        "resolved_by": len(instance["resolved_by"])
    }

    def bump(counter: str) -> None:
        # Create the counter on first use so keys appear in encounter order.
        result[counter] = result.get(counter, 0) + 1

    for step in trajectory["steps"]:
        search_seen_in_step = False

        for action in step["actions"]:
            name = action["name"]
            bump(f"action_{name}")

            if name == "search":
                # Only the first search action of a step counts as a search
                # step and contributes its input keys.
                if not search_seen_in_step:
                    search_seen_in_step = True
                    result["search_steps"] += 1
                    for key in action["input"]:
                        bump(f"search_{key}")

            elif name == "identify":
                actual_spans = {
                    span["file_path"]: span["span_ids"]
                    for span in action["input"]["files_with_spans"]
                }
                missing_files = get_missing_files(instance["expected_spans"], actual_spans)
                # A later identify action overwrites the verdict of an earlier one.
                result["found_file"] = not missing_files

            elif name == "request_for_change":
                if "type_of_change" in action["input"]:
                    bump(f"change_{action['input']['type_of_change']}")

            elif name == "search_replace":
                output = action["output"]
                if "diff" in output and output["diff"]:
                    result["changes"] += 1
                else:
                    result["no_changes"] += 1

    return result

In [3]:
from moatless.search import SearchLoop
from moatless.workspace import Workspace
import subprocess
from moatless.code import CodeLoop
import traceback
import logging
import os
import time
from moatless.benchmark.swebench import setup_swebench_repo, get_repo_dir_name

# Logging: keep the root logger and the chatty HTTP/LLM client loggers at
# WARNING. The root level is set both through basicConfig and explicitly.
logging.basicConfig(level=logging.WARNING)
logging.getLogger().setLevel(logging.WARNING)
logging.getLogger("LiteLLM").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)

# Locations of the persisted code indexes and of the checked-out repositories.
# NOTE(review): repo_base_dir is not referenced by the cells visible here.
index_store_dir = "/home/albert/index_store"
repo_base_dir = "/tmp/repos"

# Everything produced by this run lives under evaluation_dir.
evaluations_dir = "/home/albert/repos/albert/moatless/evaluations"
evaluation_name = "20240609_moatless_gpt4o_2"
evaluation_dir = f"{evaluations_dir}/{evaluation_name}"
trajectory_dir = f"{evaluation_dir}/trajs"
predictions_path = f"{evaluation_dir}/all_preds.jsonl"

# exist_ok makes re-running this cell idempotent without a separate
# exists() check (and without the check-then-create race).
os.makedirs(trajectory_dir, exist_ok=True)

def run_test(instance):
    """Run the search and code loops for one instance and return its trajectory.

    A previously persisted trajectory that already contains a submission is
    returned as-is. That check happens before any repository or index setup,
    so cached instances cost nothing. Trajectories without a submission
    (including errored ones) are run again.

    Reads the notebook-level ``trajectory_dir``, ``index_store_dir`` and
    ``evaluation_name``. A notebook-level ``trace_id`` is recorded in the
    trajectory info when one is defined; otherwise ``None`` is recorded
    instead of failing with a ``NameError``.

    Args:
        instance: Dataset instance; ``"instance_id"`` and
            ``"problem_statement"`` are read here.

    Returns:
        The trajectory as a dict, either loaded from disk or produced by this
        run. Exceptions raised by the loops are caught, logged and stored as
        a traceback under ``info["error"]``.
    """
    instance_id = instance["instance_id"]

    trajectory_path = os.path.join(trajectory_dir, f"{instance_id}.json")
    if os.path.exists(trajectory_path):
        with open(trajectory_path) as file:
            trajectory = json.load(file)
        if "info" in trajectory and trajectory["info"].get("submission"):
            return trajectory

    repo_dir = setup_swebench_repo(instance)
    persist_dir = os.path.join(index_store_dir, get_repo_dir_name(instance_id))
    workspace = Workspace.from_dirs(repo_dir=repo_dir, index_dir=persist_dir)

    problem_statement = instance["problem_statement"]

    search_instructions = f"""Find the code relevant to solve this issue: 

{problem_statement}
"""

    code_instructions = f"""Resolve this issue: 

{problem_statement}
"""

    info = {
        "evaluation_name": evaluation_name,
        "instance_id": instance_id,
        # trace_id is optional notebook state; look it up instead of
        # referencing a global that may never have been defined.
        "trace_id": globals().get("trace_id"),
    }

    trajectory = workspace.create_trajectory(
        "moatless", input_data={"problem_statement": problem_statement}
    )

    start_time = time.time()
    try:
        search_loop = SearchLoop(workspace, instructions=search_instructions, trajectory=trajectory, max_cost=0.25)
        search_response = search_loop.execute()

        if not search_response.files:
            info["error"] = "No files found"
        else:
            code_loop = CodeLoop(workspace, trajectory=trajectory, instructions=code_instructions, files=search_response.files, max_cost=0.5)
            code_loop.execute()

            # The submission is whatever the code loop left modified in the
            # repository working tree.
            output = subprocess.run(
                ["git", "diff"],
                capture_output=True,
                text=True,
                cwd=repo_dir,
            )

            info["submission"] = output.stdout
    except Exception:
        # Best effort: record the failure on the trajectory and carry on so
        # one bad instance does not abort the whole evaluation.
        info["error"] = traceback.format_exc()
        logging.exception(f"Error in evaluation of {instance['instance_id']} ")

    info["duration"] = time.time() - start_time
    info["total_cost"] = trajectory.total_cost()

    trajectory.save_info(info)
    trajectory.persist(file_path=trajectory_path)

    return workspace.trajectory.dict()


In [4]:
from moatless.benchmark.swebench import generate_md_report
from pandas import DataFrame
import pandas as pd
from tqdm.notebook import tqdm
import json

# Subset of instance ids that used to live inside run_evaluation as the unused
# local `to_test` (one duplicated entry removed). Pass it as `instance_ids` to
# restrict a run to these instances.
TO_TEST_INSTANCE_IDS = (
    "django__django-10914",
    "django__django-13757",
    "django__django-15789",
    "matplotlib__matplotlib-23913",
    "pytest-dev__pytest-7373",
    "scikit-learn__scikit-learn-13142",
    "scikit-learn__scikit-learn-13496",
    "scikit-learn__scikit-learn-15535",
    "scikit-learn__scikit-learn-13779",
    "scikit-learn__scikit-learn-13584",
    "django__django-14915",
    "pytest-dev__pytest-5692",
    "scikit-learn__scikit-learn-10297",
    "django__django-13158",
    "django__django-14787",
    "django__django-15790",
    "matplotlib__matplotlib-25442",
    "mwaskom__seaborn-3190",
    "sphinx-doc__sphinx-8721",
    "sphinx-doc__sphinx-8595",
    "sympy__sympy-15678",
    "sympy__sympy-18532",
    "sympy__sympy-21847",
)


def run_evaluation(dataset_file: str, instance_ids=None) -> DataFrame:
    """Run every instance of a dataset file and collect one result row each.

    For each instance this calls ``run_test`` and ``to_result``, appends a
    prediction line to ``predictions_path`` and, for ``"failed"`` and
    ``"error"`` results, writes a markdown report under
    ``{evaluation_dir}/reports`` or ``{evaluation_dir}/errors``. Both report
    directories are created up front so the first report does not fail on a
    missing directory. Running statistics are shown on the progress bar.

    Reads the notebook-level ``evaluation_dir``, ``evaluation_name`` and
    ``predictions_path``.

    Args:
        dataset_file: Path to a JSON file containing a list of instances.
        instance_ids: Optional iterable of instance ids. When given, only
            those instances are run; ``None`` (the default) runs all of them.

    Returns:
        A DataFrame built from the per-instance result dicts, in run order.
        Instances for which ``run_test`` returns nothing are skipped.
    """
    with open(dataset_file, "r") as f:
        instances = json.load(f)

    if instance_ids is not None:
        wanted = set(instance_ids)
        instances = [instance for instance in instances if instance["instance_id"] in wanted]

    # Status -> directory receiving the markdown report for that status.
    report_dirs = {
        "failed": f"{evaluation_dir}/reports",
        "error": f"{evaluation_dir}/errors",
    }
    for report_dir in report_dirs.values():
        os.makedirs(report_dir, exist_ok=True)

    # Start from an empty predictions file; lines are appended per instance so
    # partial progress survives an interrupted run.
    with open(predictions_path, "w") as file:
        file.write("")

    results = []
    stats = {}

    count = 0
    generated = 0
    error = 0
    sum_duration = 0
    sum_total_cost = 0

    pbar = tqdm(instances)
    for instance in pbar:
        instance_id = instance["instance_id"]
        pbar.set_description(f"Instance {instance_id}")

        trajectory = run_test(instance)
        if not trajectory:
            continue

        result = to_result(trajectory, instance)
        results.append(result)
        status = result["status"]

        if status in report_dirs:
            md_report = generate_md_report(trajectory, instance)
            with open(f"{report_dirs[status]}/{instance_id}.md", "w") as file:
                file.write(md_report)

        sum_duration += result["duration"]
        sum_total_cost += result["total_cost"]

        if status == "error":
            error += 1

        if status in ("generated", "failed", "resolved"):
            generated += 1

        count += 1

        if sum_duration > 0:
            stats["avg_duration"] = sum_duration / count

        if sum_total_cost > 0:
            stats["avg_cost"] = sum_total_cost / count
            stats["total_cost"] = sum_total_cost

        if generated > 0:
            # Share of processed instances that produced a patch.
            success_rate = (generated / count) * 100
            stats["generated"] = f"{success_rate:.2f}%"

        stats["error"] = error

        pbar.set_postfix(stats)

        prediction = {
            "model_name_or_path": evaluation_name,
            "instance_id": instance_id,
            "model_patch": trajectory["info"].get("submission", ""),
        }

        with open(predictions_path, "a") as file:
            file.write(json.dumps(prediction) + "\n")

    return pd.DataFrame(results)

# Run the evaluation over the whole dataset file; `df` ends up with one row
# per processed instance.
dataset_path = "/home/albert/repos/albert/moatless/datasets/swebench_lite_all_evaluations.json"
df = run_evaluation(dataset_path)

In [5]:
# Save the per-instance results next to the notebook (without the index
# column), then display the frame as the cell output.
csv_path = "output.csv"
df.to_csv(csv_path, index=False)
df