In [1]:
import datetime
import json
import os

import litellm

# Root under which evaluate() looks up one index directory per repository.
index_store_dir = "/home/albert/20240522-voyage-code-2"
# NOTE(review): not referenced by the cells visible in this notebook; presumably the
# checkout location used by the moatless benchmark helpers -- confirm before removing.
repo_base_dir = "/tmp/repos"

model = "gpt-3.5-turbo-0125" # "gpt-4o-2024-05-13"

# The evaluation name embeds today's date, so running on a different day starts a
# fresh evaluation directory instead of reusing trajectories saved earlier.
date_str = datetime.datetime.now().strftime("%Y%m%d")
# Model ids may contain "/" (provider prefixes); replace it so the name is a single path component.
model_file_name = model.replace("/", "_")

evaluations_dir = "/home/albert/repos/albert/moatless/evaluations"
evaluation_name = f"{date_str}_moatless_{model_file_name}"
evaluation_dir = f"{evaluations_dir}/{evaluation_name}"
trajectory_dir = f"{evaluation_dir}/trajs"
predictions_path = f"{evaluation_dir}/all_preds.jsonl"

# exist_ok=True creates any missing parents and is a no-op when the directory is
# already there, which avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(trajectory_dir, exist_ok=True)

# Send both successful and failed LLM calls to Langfuse via litellm's callback hooks.
litellm.success_callback = ["langfuse"]
litellm.failure_callback = ["langfuse"]

In [2]:
import logging
import os
import subprocess
import time
import traceback

from moatless.benchmark.swebench import setup_swebench_repo, get_repo_dir_name, verify_search_trajectory

from moatless.workspace import Workspace

# Make sure the trajectory directory exists; exist_ok=True turns this into a no-op
# when it is already present and avoids a check-then-create race.
os.makedirs(trajectory_dir, exist_ok=True)

def determine_status(info: dict) -> str:
    """Classify one evaluation run from its trajectory ``info`` dict.

    Args:
        info: Run metadata. Reads the keys "error", "submission" and, when a
            result file exists, "instance_id".

    Returns:
        "error"         -- ``info`` has an "error" key.
        "not_generated" -- there is no "submission" key or the submission is empty.
        "generated"     -- a submission exists but ``{evaluation_dir}/result.json`` does not.
        "resolved"      -- result.json lists the instance id under "resolved".
        "failed"        -- result.json exists and does not list the instance as resolved.
    """
    if "error" in info:
        return "error"

    if not info.get("submission"):
        return "not_generated"

    report_path = f"{evaluation_dir}/result.json"
    if not os.path.exists(report_path):
        # No support for evaluation yet: generate the SWE-bench evaluation result
        # file and run again to get resolved/failed statuses.
        return "generated"

    with open(report_path, "r") as report_file:
        report = json.load(report_file)

    return "resolved" if info["instance_id"] in report["resolved"] else "failed"

def to_result(instance: dict, trajectory: dict, workspace: Workspace) -> dict:
    """Build the flat per-instance result row for a finished trajectory.

    Args:
        instance: The benchmark instance; forwarded to ``verify_search_trajectory``.
        trajectory: Trajectory dict; its "info" and "transitions" entries are read.
        workspace: Workspace forwarded to ``verify_search_trajectory``.

    Returns:
        A dict with "instance_id", "duration", "total_cost", "status" and
        "transitions" (the number of transitions), updated with whatever
        ``verify_search_trajectory`` returns.
    """
    run_info = trajectory["info"]

    summary = dict(
        instance_id=run_info["instance_id"],
        duration=run_info["duration"],
        total_cost=run_info["total_cost"],
        status=determine_status(run_info),
        transitions=len(trajectory["transitions"]),
    )

    # Keys returned by the verification step override the base fields on collision.
    summary.update(verify_search_trajectory(trajectory, instance, workspace))
    return summary


In [3]:
from moatless.edit.plan import PlanToCode
from moatless.edit.edit import EditCode
from moatless.loop import AgenticLoop
from moatless.transitions import search_and_code_transitions
from moatless.evaluation.utils import trace_metadata

def evaluate(instance):
    """Run the search-and-code agent loop on a single benchmark instance.

    If a trajectory file for the instance already exists under ``trajectory_dir``
    and its info holds a non-empty submission or an error, that trajectory is
    reused and the loop is not run again.

    Args:
        instance: Instance dict. "instance_id" and "problem_statement" are read
            here; the whole dict is also passed to ``setup_swebench_repo`` and
            ``to_result``.

    Returns:
        A tuple ``(result, trajectory)``: the summary row built by ``to_result``
        and the trajectory dict it was built from.
    """
    instance_id = instance["instance_id"]

    repo_dir = setup_swebench_repo(instance)
    persist_dir = os.path.join(index_store_dir, get_repo_dir_name(instance_id))
    workspace = Workspace.from_dirs(repo_dir=repo_dir, index_dir=persist_dir)

    trajectory_path = os.path.join(trajectory_dir, f"{instance_id}.json")
    if os.path.exists(trajectory_path):
        with open(trajectory_path) as file:
            trajectory = json.load(file)

        # `A and B or C` parses as `(A and B) or C`, so the previous check indexed
        # trajectory["info"] even when the key was missing. Look the info up once
        # and only treat the run as finished when it actually has info.
        cached_info = trajectory.get("info")
        if cached_info and (cached_info.get("submission") or "error" in cached_info):
            return to_result(instance, trajectory, workspace), trajectory

    problem_statement = instance["problem_statement"]

    metadata = trace_metadata(instance_id=instance_id, session_id=evaluation_name, trace_name="search_and_code")
    transitions = search_and_code_transitions(
        global_params={"model": model, "max_iterations": 15},
        state_params={EditCode: {"max_iterations": 8}, PlanToCode: {"max_iterations": 8}},
    )

    loop = AgenticLoop(
        transitions=transitions,
        workspace=workspace,
        metadata=metadata,
        trajectory_path=trajectory_path,
        max_cost=0.5,
        max_transitions=40,
        max_retries=6,
        max_message_tokens=14000,
        max_rejections=4,
    )

    info = {
        "evaluation_name": evaluation_name,
        "instance_id": instance_id,
    }

    start_time = time.time()
    try:
        # The return value is not needed: the patch is read from `git diff` below.
        loop.run(problem_statement)
    except Exception:
        # Deliberately best-effort: record the failure in the trajectory info so the
        # rest of the evaluation keeps going and this instance is reported as "error".
        info["error"] = traceback.format_exc()
        logging.exception("Error in evaluation of %s ", instance_id)

    info["duration"] = time.time() - start_time
    info["total_cost"] = loop.trajectory.total_cost()

    workspace.save()

    # The submission is whatever the run changed in the repository working tree.
    output = subprocess.run(
        ["git", "diff"],
        capture_output=True,
        text=True,
        cwd=repo_dir,
    )

    info["submission"] = output.stdout

    loop.trajectory.save_info(info)
    trajectory = loop.trajectory.to_dict()

    return to_result(instance, trajectory, workspace), trajectory

In [None]:
from moatless.benchmark.swebench import sorted_instances
import pandas as pd
from tqdm.notebook import tqdm
import json

def run_evaluation(dataset_file: str = None, dataset: str = "princeton-nlp/SWE-bench_Lite", split="test"):
    """Evaluate every instance, write the predictions file and collect the results.

    Args:
        dataset_file: Path to a JSON file holding a list of instance dicts that each
            have a "resolved_by" list; instances are processed with the longest
            "resolved_by" list first. When falsy, instances are loaded with
            ``sorted_instances(dataset, split)`` instead.
        dataset: Dataset name passed to ``sorted_instances`` when no file is given.
        split: Dataset split passed to ``sorted_instances`` when no file is given.

    Returns:
        A pandas DataFrame with one row per evaluated instance (the dicts returned
        by ``evaluate``).

    Side effects:
        Truncates ``predictions_path`` and then appends one JSON line per instance.
    """
    if dataset_file:
        with open(dataset_file, "r") as f:
            instances = json.load(f)

        instances = sorted(instances, key=lambda x: len(x["resolved_by"]), reverse=True)
    else:
        instances = sorted_instances(dataset, split)

    count = 0
    identified = 0
    generated = 0
    error = 0

    sum_duration = 0
    sum_total_cost = 0

    # Start from an empty predictions file; predictions are appended one instance at
    # a time below so partial output survives an interrupted run.
    with open(predictions_path, "w") as file:
        file.write("")

    results = []

    stats = {}
    pbar = tqdm(instances)
    for instance in pbar:
        result, trajectory = evaluate(instance)
        if not result:
            error += 1
            continue

        # Previously nothing was ever added to `results`, so the returned DataFrame
        # (and the CSV written from it) was always empty.
        results.append(result)

        sum_duration += result["duration"]
        sum_total_cost += result["total_cost"]

        if result["status"] == "error":
            error += 1

        if result["status"] in ["generated", "failed", "resolved"]:
            generated += 1

        if result["identified"] is not None:
            identified += 1

        count += 1

        if sum_duration > 0:
            stats["avg_duration"] = sum_duration / count

        if sum_total_cost > 0:
            stats["avg_cost"] = sum_total_cost / count
            stats["total_cost"] = sum_total_cost

        if identified > 0:
            success_rate = (identified / count) * 100
            stats["identified"] = f"{success_rate:.2f}%"

        if generated > 0:
            success_rate = (generated / count) * 100
            stats["generated"] = f"{success_rate:.2f}%"

        stats["error"] = error

        pbar.set_postfix(stats)

        prediction = {
            "model_name_or_path": evaluation_name,
            "instance_id": instance["instance_id"],
            "model_patch": trajectory["info"].get("submission", ""),
        }

        with open(predictions_path, "a") as file:
            json_string = json.dumps(prediction)
            file.write(json_string + "\n")

    return pd.DataFrame(results)
# Evaluate the instances listed in the local dataset file; run_evaluation processes
# them ordered by the length of each instance's "resolved_by" list, longest first.
df = run_evaluation("/home/albert/repos/albert/moatless/datasets/swebench_lite_all_evaluations.json")

# Alternative: a falsy dataset_file makes run_evaluation load the instances with
# sorted_instances(dataset, split) instead of reading a local file:
# run_evaluation(None)

  0%|          | 0/300 [00:00<?, ?it/s]

In [None]:
# Save the results next to the trajectories as a semicolon-separated CSV that uses a
# decimal comma, then display the frame as the cell output.
df.to_csv(f"{evaluation_dir}/result.csv", index=False, sep=';', decimal=',')
df