python -m swebench.harness.run_evaluation \
    --dataset_name princeton-nlp/SWE-bench_Lite \
    --predictions_path <path_to_predictions> \
    --max_workers <num_workers> \
    --run_id <run_id>
    # use --predictions_path 'gold' to verify the gold patches
    # use --run_id to name the evaluation run

In [None]:

import json
from harness.constants import (
    SWEbenchInstance,
    KEY_INSTANCE_ID,
)
from pathlib import Path
from typing import cast
from datasets import Dataset, load_dataset
from harness.constants import (
    KEY_INSTANCE_ID,
    KEY_MODEL,
    KEY_PREDICTION
)

def load_swebench_dataset(name="princeton-nlp/SWE-bench", split="test", instance_ids=None) -> list[SWEbenchInstance]:
    """
    Load SWE-bench dataset from Hugging Face Datasets or local .json/.jsonl file

    Args:
        name: Hugging Face dataset name, one of the short aliases handled
            below (e.g. "swe-bench", "lite"), or a path ending in ".json"
            or ".jsonl".
        split: Split passed to `load_dataset`; not used for local files.
        instance_ids: Optional iterable of instance IDs to keep. A falsy
            value (None or empty) keeps every instance.

    Returns:
        List of instances, restricted to `instance_ids` when given.

    Raises:
        ValueError: If any requested instance ID is missing from the dataset.
    """
    if instance_ids:
        instance_ids = set(instance_ids)

    if name.endswith(".jsonl"):
        # JSON Lines holds one JSON document per line, so the file as a whole
        # is not a single valid JSON value and must be parsed line by line.
        # Iterating the file object splits on real newlines only; unlike
        # str.splitlines() it will not break a record on characters such as
        # U+2028 that may appear unescaped inside a JSON string.
        with Path(name).open() as jsonl_file:
            dataset = [json.loads(line) for line in jsonl_file if line.strip()]
    elif name.endswith(".json"):
        # A .json file holds the whole dataset as one JSON document.
        dataset = json.loads(Path(name).read_text())
    else:
        # Load from Hugging Face Datasets, expanding short aliases first
        if name.lower() in {"swe-bench", "swebench", "swe_bench"}:
            name = "princeton-nlp/SWE-bench"
        elif name.lower() in {"swe-bench-lite", "swebench-lite", "swe_bench_lite", "swe-bench_lite", "lite"}:
            name = "princeton-nlp/SWE-bench_Lite"
        dataset = cast(Dataset, load_dataset(name, split=split))
    dataset_ids = {instance[KEY_INSTANCE_ID] for instance in dataset}

    if instance_ids:
        # check that all requested instance IDs are in the dataset
        missing_ids = instance_ids - dataset_ids
        if missing_ids:
            raise ValueError(
                (
                    "Some instance IDs not found in dataset!"
                    f"\nMissing IDs:\n{' '.join(missing_ids)}"
                )
            )
        dataset = [instance for instance in dataset if instance[KEY_INSTANCE_ID] in instance_ids]
    return [cast(SWEbenchInstance, instance) for instance in dataset]


def get_gold_predictions(dataset_name: str, split: str):
    """
    Build one "gold" prediction per instance of the given dataset and split.

    Each prediction is a dict mapping KEY_INSTANCE_ID to the instance's ID,
    KEY_PREDICTION to the instance's "patch" field, and KEY_MODEL to "gold".
    The list follows the order returned by `load_swebench_dataset`.
    """
    gold_predictions = []
    for instance in load_swebench_dataset(dataset_name, split):
        gold_entry = {
            KEY_INSTANCE_ID: instance[KEY_INSTANCE_ID],
            KEY_PREDICTION: instance["patch"],
            KEY_MODEL: "gold",
        }
        gold_predictions.append(gold_entry)
    return gold_predictions

def main():
    """
    Print the gold predictions for the SWE-bench Lite test split.

    The predictions are indexed by instance ID before being printed. The
    rest of the evaluation pipeline is commented out at the end of this
    function, so no images are built and no instances are run.
    """
    print("Using gold predictions - ignoring predictions_path")
    gold_predictions = get_gold_predictions("princeton-nlp/SWE-bench_Lite", "test")

    # map of instance_id to prediction
    predictions = {}
    for prediction in gold_predictions:
        predictions[prediction[KEY_INSTANCE_ID]] = prediction
    print(predictions)

    # Disabled evaluation steps, kept for reference:
    #
    # set open file limit
    # assert len(run_id) > 0, "Run ID must be provided"
    #
    # get dataset from predictions
    # dataset = get_dataset_from_preds(dataset_name, split, instance_ids, predictions, run_id)
    # full_dataset = load_swebench_dataset(dataset_name, split, instance_ids)
    # existing_images = list_images(client)
    # print(f"Running {len(dataset)} unevaluated instances...")
    # if not dataset:
    #     print("No instances to run.")
    # else:
    #     # build environment images + run instances
    #     build_env_images(client, dataset, force_rebuild, max_workers)
    #     run_instances(predictions, dataset, cache_level, clean, force_rebuild, max_workers, run_id, timeout)
    #
    # # clean images + make final report
    # clean_images(client, existing_images, cache_level, clean)
    # make_run_report(predictions, full_dataset, client, run_id)

In [None]:
# Invoke main(), which loads and prints the gold predictions.
main()