In [3]:
from tqdm.auto import tqdm
import pandas as pd
from moatless.benchmark.utils import get_file_spans_from_patch
from moatless.repository import FileRepository
from moatless.benchmark.swebench import setup_swebench_repo, sorted_instances
import json

# Names of the evaluation runs to compare. Each name is interpolated into the
# paths that generate_report() reads that run's predictions and results from.
runs = [
    "20240402_sweagent_claude3opus",
    "20240402_sweagent_gpt4",
    "20240509_amazon-q-developer-agent-20240430-dev",
    "20240523_aider",
    "20240524_opencsg_starship_gpt4",
    "20240527_marscode-agent-dev",
    "20240530_autocoderover-v20240408",
    "20240604_CodeR",
    "20240609_moatless_gpt4o",
    "20240612_IBM_Research_Agent101"
]

# Destination of the JSON evaluation dataset written by generate_report().
dataset_path = "/home/albert/repos/albert/moatless/datasets/swebench_lite_all_evaluations.json"

def read_predictions(pred_path: str) -> dict:
    """Load model patches from a JSON Lines predictions file.

    Every non-blank line must be a JSON object containing the keys
    ``instance_id`` and ``model_patch``. Blank or whitespace-only lines
    (for example a trailing empty line) are skipped instead of aborting
    the whole load.

    Args:
        pred_path: Path to the ``.jsonl`` predictions file.

    Returns:
        A dict mapping each ``instance_id`` to its ``model_patch``. If an
        id appears on several lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``instance_id`` or ``model_patch``.
    """
    predictions = {}
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(pred_path, encoding="utf-8") as f:
        # Stream the file line by line rather than loading it all at once.
        for line in f:
            if not line.strip():
                continue
            prediction = json.loads(line)
            predictions[prediction["instance_id"]] = prediction["model_patch"]
    return predictions

def _load_run_results(run_name):
    """Read one run's resolved instance ids and its predicted patches.

    Returns a dict with ``resolved_tasks`` (a set of the entries under the
    ``resolved`` key of the run's results file) and ``predictions`` (the
    mapping produced by ``read_predictions``).
    """
    prediction_file = f"/home/albert/repos/stuffs/experiments/evaluation/lite/{run_name}/all_preds.jsonl"
    result_file = f"/home/albert/repos/stuffs/experiments/evaluation/lite/{run_name}/results/results.json"

    with open(result_file, "r", encoding="utf-8") as file:
        final_report = json.load(file)

    return {
        # A set, because membership is tested once per (instance, run) pair.
        "resolved_tasks": set(final_report["resolved"]),
        "predictions": read_predictions(prediction_file),
    }


def _alternative_spans(expected_file_spans, file_spans):
    """Compare the spans a run touched with the golden patch's spans.

    Only files that also appear in ``expected_file_spans`` are considered.

    Returns:
        A ``(spans, is_different)`` tuple. ``spans`` maps each shared file
        path to the run's span ids. ``is_different`` is True when, for at
        least one shared file, some expected span id is absent from the
        run's span ids.
    """
    spans = {}
    is_different = False
    for file_path, span_ids in file_spans.items():
        if file_path not in expected_file_spans:
            continue
        spans[file_path] = span_ids
        if set(expected_file_spans[file_path]) - set(span_ids):
            is_different = True
    return spans, is_different


def generate_report() -> "pd.DataFrame":
    """Build the cross-run evaluation dataset and a per-instance summary.

    For every run in ``runs`` the resolved ids and predictions are loaded.
    Then, for every instance returned by ``sorted_instances``, the spans of
    the golden patch are computed and compared with the spans of each run
    that resolved the instance. The accumulated dataset is dumped as JSON
    to ``dataset_path``.

    Returns:
        A DataFrame with one row per instance and the columns
        ``instance_id``, ``resolved_by`` (number of runs that resolved it)
        and ``alternative_spans`` (number of those runs whose spans differ
        from the golden patch's spans).
    """
    results = {run_name: _load_run_results(run_name) for run_name in runs}

    evaluation_dataset = []
    report = []

    instances = sorted_instances(
        split="test", dataset_name="princeton-nlp/SWE-bench_Lite"
    )
    for instance in tqdm(instances):
        instance_id = instance["instance_id"]
        expected_patch = instance["patch"]

        repo_dir = setup_swebench_repo(instance)
        file_repo = FileRepository(repo_dir)
        expected_file_spans = get_file_spans_from_patch(file_repo, expected_patch)

        evaluation_instance = {
            "instance_id": instance_id,
            "repo": instance["repo"],
            "base_commit": instance["base_commit"],
            "problem_statement": instance["problem_statement"],
            "golden_patch": expected_patch,
            "expected_spans": expected_file_spans,
            "resolved_by": [],
            "alternative_spans": [],
        }

        for run_name in runs:
            run_result = results[run_name]
            if instance_id not in run_result["resolved_tasks"]:
                continue

            # NOTE(review): a run listed as resolved but without a stored
            # prediction yields None here, which is then passed on to
            # get_file_spans_from_patch — confirm that cannot happen.
            prediction = run_result["predictions"].get(instance_id)

            evaluation_instance["resolved_by"].append({
                "name": run_name,
                "patch": prediction,
            })

            file_spans = get_file_spans_from_patch(file_repo, prediction)
            spans, is_different = _alternative_spans(expected_file_spans, file_spans)
            if is_different:
                evaluation_instance["alternative_spans"].append({
                    "run_name": run_name,
                    "spans": spans,
                })

        report.append({
            "instance_id": instance_id,
            "resolved_by": len(evaluation_instance["resolved_by"]),
            "alternative_spans": len(evaluation_instance["alternative_spans"]),
        })

        evaluation_dataset.append(evaluation_instance)

        # The whole dataset is rewritten after every instance; presumably
        # this is intentional checkpointing so partial results survive an
        # interrupted run — confirm before moving it out of the loop.
        with open(dataset_path, "w") as f:
            json.dump(evaluation_dataset, f, indent=2)

    return pd.DataFrame(report)

# Run the full evaluation; this also writes the JSON dataset to dataset_path.
# The returned per-instance summary table is kept in df.
df = generate_report()

In [4]:
# Save the per-instance summary table as CSV, omitting the DataFrame index column.
df.to_csv("evaluation_report.csv", index=False)