In [2]:
from datasets import load_dataset
import pandas as pd

swebench = load_dataset('princeton-nlp/SWE-bench_Lite', split='test')

df = swebench.to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
# Get all patch files from the dataset
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
for i, row in df.iterrows():
    patch_file = os.path.join(SCRIPT_DIR, f'{row["instance_id"]}.diff')
    with open(patch_file, 'w') as f:
        f.write(row['patch'])

In [18]:
import re

addition_only_count = 0
deletion_only_count = 0
standalone_additions_count = 0
standalone_deletions_count = 0
patches_with_standalone_additions = []
patches_with_standalone_deletions = []
for i, row in df.iterrows():
    patch = row['patch']
    
    # Split into diff sections
    diff_sections = re.split(r'diff --git', patch)[1:]  # Skip first empty section
    
    # Check if any section has deletions
    has_deletions = False
    for section in diff_sections:
        # Look for lines starting with - that aren't file headers (---) 
        deletion_lines = [line for line in section.split('\n') if line.startswith('-') and not line.startswith('---')]
        if deletion_lines:
            has_deletions = True
            break
    
    has_additions = False
    for section in diff_sections:
        lines = section.split('\n')
        # Skip first three lines as they are file headers
        lines = lines[3:]
        if any(line.startswith('+') for line in lines):
            has_additions = True
            break

    # Check if any section has standalone additions (not part of modifications)
    has_standalone_additions = False
    for section in diff_sections:
        lines = section.split('\n')
        # Skip first three lines as they are file headers
        lines = lines[3:]
        # Transform the lines into a sequence of +, -, or " "
        # Representing addition, deletion, or no change
        add_or_mod = ""
        for line in lines:
            if line.startswith('+'):
                add_or_mod += "+"
            elif line.startswith('-'):
                add_or_mod += "-"
            else:
                add_or_mod += " "

        # Then use regex to check how many standalone +s there are
        standalone_additions = len(re.findall(r'(?:^| )\++(?:$| )', add_or_mod))
        standalone_additions_count += standalone_additions

        standalone_deletions = len(re.findall(r'(?:^| )\-+(?:$| )', add_or_mod))
        standalone_deletions_count += standalone_deletions
    
    if standalone_additions > 0:
        patches_with_standalone_additions.append(row['instance_id'])

    if standalone_deletions > 0:
        patches_with_standalone_deletions.append(row['instance_id'])

    if not has_deletions:
        addition_only_count += 1
    if not has_additions:
        deletion_only_count += 1

print(f"Number of patches that only add code: {addition_only_count}")
print(f"Number of standalone additions: {standalone_additions_count}")
print(f"Number of patches with standalone additions: {len(patches_with_standalone_additions)}")
print(f"Patches with standalone additions: {patches_with_standalone_additions}")
print()
print(f"Number of patches that only delete code: {deletion_only_count}")
print(f"Number of standalone deletions: {standalone_deletions_count}")
print(f"Number of patches with standalone deletions: {len(patches_with_standalone_deletions)}")
print(f"Patches with standalone deletions: {patches_with_standalone_deletions}")


Number of patches that only add code: 77
Number of standalone additions: 233
Number of patches with standalone additions: 167
Patches with standalone additions: ['astropy__astropy-14182', 'astropy__astropy-7746', 'django__django-11019', 'django__django-11179', 'django__django-11283', 'django__django-11564', 'django__django-11583', 'django__django-11630', 'django__django-11742', 'django__django-11905', 'django__django-11910', 'django__django-12113', 'django__django-12184', 'django__django-12286', 'django__django-12308', 'django__django-12453', 'django__django-12589', 'django__django-12700', 'django__django-12856', 'django__django-12908', 'django__django-12915', 'django__django-13158', 'django__django-13220', 'django__django-13230', 'django__django-13265', 'django__django-13315', 'django__django-13321', 'django__django-13401', 'django__django-13447', 'django__django-13448', 'django__django-13590', 'django__django-13710', 'django__django-13757', 'django__django-13768', 'django__django-139

In [22]:
import numpy as np

num_files_edited = []
for i, row in df.iterrows():
    gold_patch = row['patch']
    diff_pattern = r"diff --git a/.* b/(.*)"
    gold_files = re.findall(diff_pattern, gold_patch)
    num_files_edited.append(len(gold_files))

print(f"Average number of files edited: {np.mean(num_files_edited)}")
print(f"Max number of files edited: {np.max(num_files_edited)}")
print(f"Min number of files edited: {np.min(num_files_edited)}")

Average number of files edited: 1.0
Max number of files edited: 1
Min number of files edited: 1
