In [1]:
import datetime
import os

# Root directory under which `evaluate` looks up the per-repository index
# (joined with `get_repo_dir_name(instance_id)`).
index_store_dir = "/home/albert/index_store"
# Not referenced by the visible cells; presumably where benchmark repos are
# checked out — TODO confirm.
repo_base_dir = "/tmp/repos"

# Evaluation artifacts are written under <evaluations_dir>/<evaluation_name>/.
evaluations_dir = "/home/albert/repos/albert/moatless/evaluations"
# NOTE(review): with an empty name every run writes directly into
# `evaluations_dir`; set a run name to keep evaluations apart.
evaluation_name = ""
# os.path.join avoids the doubled separator that plain string interpolation
# produces when `evaluation_name` is empty.
evaluation_dir = os.path.join(evaluations_dir, evaluation_name)
trajectory_dir = os.path.join(evaluation_dir, "trajs")

# exist_ok=True makes re-running this cell a no-op without a separate
# (racy) existence check.
os.makedirs(trajectory_dir, exist_ok=True)


In [2]:
from moatless.find.identify import IdentifyCode
from moatless.find.search import SearchCode
from moatless.transitions import search_transitions

# Settings handed to `search_transitions` as `global_params`.
global_params = dict(
    model="gpt-4o-2024-05-13",
    temperature=0.2,
    max_tokens=2000,
    max_prompt_file_tokens=8000,
)

# Settings handed to `search_transitions` as `state_params`, keyed by the
# imported class they are meant for.
state_params = {
    SearchCode: dict(
        provide_initial_context=True,
        max_search_results=75,
        initial_context_tokens=6000,
        initial_search_results=100,
        initial_context_spans_per_file=5,
    ),
    IdentifyCode: dict(expand_context=True),
}

transitions = search_transitions(global_params=global_params, state_params=state_params)

In [3]:
import json
import logging
import time
import traceback

from moatless import Workspace
from moatless.benchmark.swebench import setup_swebench_repo, get_repo_dir_name
from moatless.evaluation.utils import trace_metadata
from moatless.loop import AgenticLoop
from moatless.transitions import search_transitions

def evaluate(instance):
    """Run the search flow for one benchmark instance and return its result.

    When a saved trajectory file for the instance already contains an
    ``"info"`` entry, that trajectory is reused and the loop is not run again.

    Args:
        instance: Mapping providing at least ``"instance_id"`` and
            ``"problem_statement"``; it is also passed on to
            ``setup_swebench_repo`` and ``to_result``.

    Returns:
        Whatever ``to_result`` returns for the instance, its trajectory
        (as a dict) and the workspace.
    """
    # NOTE(review): `to_result` is neither defined nor imported in the visible
    # cells — confirm it is in scope before running this notebook.
    repo_dir = setup_swebench_repo(instance)
    instance_id = instance["instance_id"]
    persist_dir = os.path.join(index_store_dir, get_repo_dir_name(instance_id))
    workspace = Workspace.from_dirs(repo_dir=repo_dir, index_dir=persist_dir)
    metadata = trace_metadata(
        instance_id=instance_id, session_id=evaluation_name, trace_name="search"
    )

    trajectory_path = os.path.join(trajectory_dir, f"{instance_id}.json")
    if os.path.exists(trajectory_path):
        with open(trajectory_path) as file:
            trajectory = json.load(file)
        # An "info" entry is only written at the end of this function, so its
        # presence marks a trajectory from a finished evaluation.
        if "info" in trajectory:
            return to_result(instance, trajectory, workspace)

    problem_statement = instance["problem_statement"]

    search_instructions = f"""Find the code relevant to solve this issue: 

{problem_statement}
"""

    info = {
        "evaluation_name": evaluation_name,
        "instance_id": instance_id,
        "trace_id": metadata["trace_id"],
    }
    # Build the transitions from the notebook-level configuration so the
    # configured model and per-state settings actually apply to every run.
    transitions = search_transitions(
        global_params=global_params,
        state_params=state_params,
    )
    search_loop = AgenticLoop(
        transitions=transitions,
        workspace=workspace,
        metadata=metadata,
        trajectory_path=trajectory_path,
    )

    start_time = time.time()
    try:
        search_loop.run(message=search_instructions)
    except Exception:
        # Best effort: record the failure and still persist duration and cost
        # below so one failing instance does not abort the whole evaluation.
        info["error"] = traceback.format_exc()
        logging.exception(f"Error in evaluation of {instance['instance_id']} ")

    info["duration"] = time.time() - start_time
    info["total_cost"] = search_loop.trajectory.total_cost()
    search_loop.trajectory.save_info(info)

    return to_result(instance, search_loop.trajectory.to_dict(), workspace)

In [4]:
from pandas import DataFrame
import pandas as pd
from tqdm.notebook import tqdm
import json

def run_evaluation(dataset_file: str, min_resolved_by: int = 6) -> DataFrame:
    """Evaluate the selected instances of a dataset file and collect the results.

    Running averages, the success rate and the error count are shown on the
    progress bar while the evaluation proceeds.

    Args:
        dataset_file: Path to a JSON file holding a list of instances; each
            instance needs ``"instance_id"`` and ``"resolved_by"`` entries.
        min_resolved_by: Only instances whose ``"resolved_by"`` has at least
            this many entries are evaluated.

    Returns:
        A DataFrame with one row per truthy result returned by ``evaluate``.
    """
    with open(dataset_file, "r") as f:
        instances = json.load(f)

    # Filter first, then sort by id so runs process instances in a stable order.
    selected = sorted(
        (inst for inst in instances if len(inst["resolved_by"]) >= min_resolved_by),
        key=lambda inst: inst["instance_id"],
    )

    results = []
    stats = {}
    count = 0
    expected_identified = 0
    error = 0
    sum_duration = 0
    sum_total_cost = 0

    pbar = tqdm(selected)
    for instance in pbar:
        pbar.set_description(f"Instance {instance['instance_id']}")

        result = evaluate(instance)
        if not result:
            continue

        results.append(result)
        count += 1

        if result["expected_identified"] or result["alt_identified"]:
            expected_identified += 1

        # Count failed runs so the "error" stat is meaningful.
        # NOTE(review): assumes `to_result` carries the "error" entry recorded
        # by `evaluate` into the result — confirm; a missing key counts as no error.
        if result.get("error"):
            error += 1

        sum_duration += result["duration"]
        sum_total_cost += result["total_cost"]

        if sum_duration > 0:
            stats["avg_duration"] = sum_duration / count

        if sum_total_cost > 0:
            stats["avg_cost"] = sum_total_cost / count
            stats["total_cost"] = sum_total_cost

        if expected_identified:
            success_rate = (expected_identified / count) * 100
            stats["success_rate"] = f"{success_rate:.2f}%"

        stats["error"] = error

        pbar.set_postfix(stats)

    return pd.DataFrame(results)

# Run the evaluation over the dataset file; `df` holds the collected results.
df = run_evaluation(
    dataset_file="/home/albert/repos/albert/moatless/datasets/swebench_lite_all_evaluations.json"
)

In [None]:
# Write the results as a semicolon-separated CSV using a decimal comma and
# without the index column, then display the frame as the cell output.
df.to_csv(
    f"{evaluation_dir}/result.csv",
    sep=';',
    decimal=',',
    index=False,
)
df