In [9]:
import litellm
import datetime
import os

index_store_dir = f"/home/albert/index_store"
repo_base_dir = f"/tmp/repos"

previous_evaluation = "20240615_moatless_search_gpt4o"

model = "gpt-4o-2024-05-13"
edit_model = "gpt-4o-2024-05-13"

date_str = datetime.datetime.now().strftime("%Y%m%d")
model_file_name = f"{model.replace('/', '_')}"

if model != edit_model:
    model_file_name += f"_edit_2_{edit_model.replace('/', '_')}"

evaluations_dir = "/home/albert/repos/albert/moatless/evaluations"
evaluation_name = f"{date_str}_moatless_code_2_{model_file_name}"
evaluation_dir = f"{evaluations_dir}/{evaluation_name}"
trajectory_dir = f"{evaluations_dir}/{evaluation_name}/trajs"
predictions_path = f"{evaluation_dir}/all_preds.jsonl"

previous_trajectories_dir = f"{evaluations_dir}/{previous_evaluation}/trajs"

if not os.path.exists(trajectory_dir):
    os.makedirs(trajectory_dir)

litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"]

In [10]:
def determine_status(info: dict) -> str:
    if "error" in info:
        return "error"
    
    if "submission" not in info or not info["submission"]:
        return "not_submitted"
    
    prediction = info["submission"]
    
    result_file = f"{evaluation_dir}/result.json"
    if not os.path.exists(result_file) and prediction:
        # No support for evaluation yet. Generate the swe bench evaluation result file and run again...
        return "generated"
    
    with open(os.path.join(result_file), "r") as f:
        report = json.load(f)
        
    if info["instance_id"] in report["resolved"]:
        return "resolved"
    else:
        return "failed"

def to_result(trajectory: dict, instance: dict) -> dict:
    info = trajectory["info"]
    
    result = {
        "instance_id": info["instance_id"],
        "duration": info["duration"],
        "total_cost": info["total_cost"],
        "status": determine_status(info),
        #"transitions": len(trajectory["transitions"]),
        "rejections": 0,
        "changes": 0,
        "resolved_by": len(instance["resolved_by"])
    }

    return result

In [11]:
from moatless.edit.edit import EditCode
from moatless.workspace import Workspace
import subprocess

from moatless.transitions import code_transitions
from moatless.loop import AgenticLoop
import traceback
from moatless.benchmark.utils import trace_metadata
import logging
import os
import time
from moatless.benchmark.swebench import setup_swebench_repo, get_repo_dir_name, sync_file_context_with_search_trajectory


def evaluate(instance):
    repo_dir = setup_swebench_repo(instance)
    instance_id = instance["instance_id"]
    persist_dir = os.path.join(
        index_store_dir, get_repo_dir_name(instance["instance_id"])
    )
    
    workspace = Workspace.from_dirs(repo_dir=repo_dir, index_dir=persist_dir)

    trajectory_path = os.path.join(trajectory_dir, f"{instance['instance_id']}.json")
    if os.path.exists(trajectory_path):
        with open(trajectory_path) as file:
            trajectory = json.load(file)
        if "info" in trajectory and trajectory["info"].get("submission"):
            return trajectory
    
    previous_trajectory_file = os.path.join(previous_trajectories_dir, f"{instance['instance_id']}.json")
    if os.path.exists(previous_trajectory_file):
        with open(previous_trajectory_file) as file:
            previous_trajectory = json.load(file)
        sync_file_context_with_search_trajectory(workspace, previous_trajectory)
    else:
        print(f"Missing previous trajectory file {previous_trajectory_file}")
        return None

    if not workspace.file_context.files:
        print(f"No files in context from previous trajectory for {instance['instance_id']}")
        return None
    
    metadata = trace_metadata(instance_id=instance_id, session_id=evaluation_name, trace_name="code")
    transitions = code_transitions(global_params={"model": model}, state_params={EditCode: {"model": edit_model}})
    
    code_loop = AgenticLoop(transitions=transitions, workspace=workspace, metadata=metadata, trajectory_path=trajectory_path, max_cost=0.5)

    info = {
        "evaluation_name": evaluation_name,
        "instance_id": instance["instance_id"]
    }
    
    start_time = time.time()
    try:
        response = code_loop.run(message=instance["problem_statement"])
    except Exception as e:
        info["error"] = traceback.format_exc()
        logging.exception(f"Error in evaluation of {instance['instance_id']} ")
        raise e

    info["duration"] = time.time() - start_time
    info["total_cost"] = code_loop.trajectory.total_cost()
    
    workspace.save()
    
    output = subprocess.run(
          ["git", "diff"],
          capture_output=True,
          text=True,
          cwd=repo_dir,
    )
    
    info["submission"] = output.stdout

    code_loop.trajectory.save_info(info)
    
    return code_loop.trajectory.to_dict()


In [12]:
from moatless.benchmark.swebench import generate_md_report
from pandas import DataFrame
import pandas as pd
from tqdm.notebook import tqdm
import json

def run_evaluation(dataset_file: str) -> DataFrame:
    count = 0
    generated = 0
    error = 0
    
    sum_duration = 0
    sum_total_cost = 0
    
    with open(dataset_file, "r") as f:
        instances = json.load(f)

    with open(predictions_path, "w") as file:
        file.write("")

    results = []
    instances = [instance for instance in instances if len(instance["resolved_by"]) >= 6]
    
    instances = [instance for instance in instances if instance["instance_id"] in ["django__django-15789","django__django-15851","pytest-dev__pytest-5692","scikit-learn__scikit-learn-13241","scikit-learn__scikit-learn-13779","scikit-learn__scikit-learn-25570"]]

    stats = {}
    pbar = tqdm(instances)
    for instance in pbar:
        pbar.set_description(f"Instance {instance['instance_id']}")
        
        trajectory = evaluate(instance)
        if not trajectory:
            continue
    
        result = to_result(trajectory, instance)
        results.append(result)
        
        try:
            if result["status"] in ["failed"]:
                md_report = generate_md_report(trajectory, instance)
                if not os.path.exists(f"{evaluation_dir}/reports"):
                    os.makedirs(f"{evaluation_dir}/reports")
                with open(f"{evaluation_dir}/reports/{instance['instance_id']}.md", "w") as file:
                    file.write(md_report)
        except Exception as e:
            logging.exception(f"Error in generating report for {instance['instance_id']} ")
        
        sum_duration += result["duration"]
        sum_total_cost += result["total_cost"]
        
        if result["status"] == "error":
            error += 1

        if result["status"] in ["generated", "failed", "resolved"]:
            generated += 1
        
        count += 1

        if sum_duration > 0:
            stats["avg_duration"] = sum_duration / count

        if sum_total_cost > 0:
            stats["avg_cost"] = sum_total_cost / count
            stats["total_cost"] = sum_total_cost
        
        if generated > 0:
            success_rate = (generated / count) * 100
            stats["generated"] = f"{success_rate:.2f}%"
    
        stats["error"] = error
        
        pbar.set_postfix(stats)
    
        prediction = {
            "model_name_or_path": evaluation_name,
            "instance_id": instance["instance_id"],
            "model_patch": trajectory["info"].get("submission", ""),
        }
    
        with open(predictions_path, "a") as file:
            json_string = json.dumps(prediction)
            file.write(json_string + "\n")
        
        df = pd.DataFrame(results)
        df.to_csv(f"{evaluation_dir}/result.csv", index=False, sep=';', decimal=',')

    return pd.DataFrame(results)

df = run_evaluation("/home/albert/repos/albert/moatless/datasets/swebench_lite_all_evaluations.json")

  0%|          | 0/6 [00:00<?, ?it/s]

ERROR:root:Error in evaluation of pytest-dev__pytest-5692 
Traceback (most recent call last):
  File "/tmp/ipykernel_606705/375867374.py", line 57, in evaluate
    response = code_loop.run(message=instance["problem_statement"])
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/albert/repos/albert/moatless/moatless/loop/loop.py", line 161, in run
    raise RuntimeError(
RuntimeError: The loop was aborted because the cost exceeded the limit.


RuntimeError: The loop was aborted because the cost exceeded the limit.

In [None]:
df.to_csv(f"{evaluation_dir}/result.csv", index=False, sep=';', decimal=',')
df