In [2]:
import tomllib
import altair as alt
import pandas as pd
import os
from common.interfaces import RetrievedFile, FileDisplayLevel
from retrieval.gold import GoldenRetriever
# Turn off Altair's row-count limit so the full stats frame can be charted.
alt.data_transformers.disable_max_rows()


# Load the project configuration. The context manager guarantees the file
# handle is closed; passing a bare open() to tomllib.load() would leave it
# open until garbage collection.
with open("configs/main.toml", "rb") as config_file:
    config = tomllib.load(config_file)
working_stage = config["working_stage"]
dataset = "princeton-nlp/SWE-bench_Lite"
split = "dev"

In [3]:
def make_stats_df(dataset, split, filter_by_count=None, code_only=True):
    """Build a per-instance DataFrame of gold-patch statistics.

    Args:
        dataset: Dataset name forwarded to ``GoldenRetriever``.
        split: Dataset split forwarded to ``GoldenRetriever``.
        filter_by_count: When not ``None``, keep only the rows whose
            ``num_files`` equals this value.
        code_only: Forwarded to ``GoldenRetriever.collect_stats``.

    Returns:
        A DataFrame with the columns ``instance_ids``, ``num_files``,
        ``num_edits``, ``num_imports`` and ``num_lines``, one row per instance.

    Raises:
        KeyError: If an instance id present in ``stats['num_files']`` is
            missing from one of the other statistics.
    """
    retriever = GoldenRetriever(dataset_name=dataset, split=split, num_shards=None, shard_id=None)
    # Honour the caller's choice instead of always requesting code-only stats.
    stats = retriever.collect_stats(code_only=code_only)

    # Use the keys of 'num_files' as the row order and look every other
    # statistic up by instance id, so columns stay aligned even if the
    # individual mappings were populated in a different order.
    instance_ids = list(stats['num_files'].keys())

    def column(stat_name):
        per_instance = stats[stat_name]
        return [per_instance[instance_id] for instance_id in instance_ids]

    data = {
        'instance_ids': instance_ids,
        'num_files': column('num_files'),
        'num_edits': column('num_edits'),
        'num_imports': column('num_imports'),
        'num_lines': column('num_lines'),
    }
    df = pd.DataFrame(data)
    if filter_by_count is not None:
        df = df[df['num_files'] == filter_by_count]
    return df

# Full statistics table for the configured dataset/split.
df = make_stats_df(dataset, split, filter_by_count=None, code_only=True)
# "Simple" instances: the gold patch touches exactly one file with at most two edits.
simple_df = df[df['num_files'].eq(1) & df['num_edits'].le(2)]
# Share of simple instances, displayed as "<simple>/<total>".
"{}/{}".format(len(simple_df), len(df))

Loading cache from ./working_stage/gold_retrieved_princeton-nlp__SWE-bench_Lite_dev.pkl


'20/23'

In [19]:
# Upper bounds applied to each statistic before charting; rows above the
# bound are dropped from that chart only.
trunc_num_files = 10
trunc_num_edits = 50
trunc_num_lines = 100
trunc_num_imports = 10


def _truncated_histogram(frame, column, limit, label):
    """Return a binned bar chart of ``column`` for rows where it is <= ``limit``.

    Args:
        frame: DataFrame holding the statistic.
        column: Name of the column to chart.
        limit: Inclusive upper bound; rows with larger values are excluded.
        label: Plural noun used in the axis and chart titles (e.g. 'Files').
    """
    return alt.Chart(frame[frame[column] <= limit]).mark_bar().encode(
        x=alt.X(f'{column}:O', bin=True, title=f'Number of {label}'),
        y=alt.Y('count()', title='Number of Instances'),
    ).properties(
        title=f'Number of {label} per Instance',
        width=400,
        height=250,
    )


# Make charts
fchart = _truncated_histogram(df, 'num_files', trunc_num_files, 'Files')
echart = _truncated_histogram(df, 'num_edits', trunc_num_edits, 'Edits')
lchart = _truncated_histogram(df, 'num_lines', trunc_num_lines, 'Lines')
ichart = _truncated_histogram(df, 'num_imports', trunc_num_imports, 'Imports')

# Lay the four histograms out side by side and display them.
chart = fchart | echart | lchart | ichart
chart

In [20]:
def _print_count_distribution(frame, column, heading, max_count, checkpoint_every=None):
    """Print what percentage of rows have each value 0..max_count in ``column``.

    Args:
        frame: DataFrame holding the statistic.
        column: Name of the column to summarise.
        heading: Line printed before the distribution.
        max_count: Largest value (inclusive) to report.
        checkpoint_every: When ``None``, print one line per value followed by a
            final cumulative line. Otherwise print only the running cumulative
            percentage at every positive multiple of this step.
    """
    print(heading)
    total = len(frame)
    cumulative = 0
    for count in range(0, max_count + 1):
        nums = len(frame[frame[column] == count])
        percent = (100.0 * nums) / total
        if checkpoint_every is None:
            print(f"Percentage of {count}: {percent}")
        # Accumulate in ascending value order so the float sum is reproducible.
        cumulative += percent
        if checkpoint_every is not None and count > 0 and count % checkpoint_every == 0:
            print(f"Cumulative so far (<= {count}): {cumulative}")
    if checkpoint_every is None:
        print(f"Cumulative (<= {max_count}): {cumulative}")


_print_count_distribution(df, 'num_files', "File Counts", 10)
_print_count_distribution(df, 'num_imports', "Import Counts", 5)
_print_count_distribution(df, 'num_edits', "Edit Counts", 10)
# Line counts span a wide range, so only report the running total every 10 lines.
_print_count_distribution(df, 'num_lines', "Line Counts", 100, checkpoint_every=10)



File Counts
Percentage of 0: 0.0
Percentage of 1: 100.0
Percentage of 2: 0.0
Percentage of 3: 0.0
Percentage of 4: 0.0
Percentage of 5: 0.0
Percentage of 6: 0.0
Percentage of 7: 0.0
Percentage of 8: 0.0
Percentage of 9: 0.0
Percentage of 10: 0.0
Cumulative (<= 10): 100.0
Import Counts
Percentage of 0: 86.95652173913044
Percentage of 1: 4.3478260869565215
Percentage of 2: 4.3478260869565215
Percentage of 3: 4.3478260869565215
Percentage of 4: 0.0
Percentage of 5: 0.0
Cumulative (<= 5): 99.99999999999999
Edit Counts
Percentage of 0: 0.0
Percentage of 1: 52.17391304347826
Percentage of 2: 34.78260869565217
Percentage of 3: 13.043478260869565
Percentage of 4: 0.0
Percentage of 5: 0.0
Percentage of 6: 0.0
Percentage of 7: 0.0
Percentage of 8: 0.0
Percentage of 9: 0.0
Percentage of 10: 0.0
Cumulative (<= 10): 100.0
Line Counts
Cumulative so far (<= 10): 52.17391304347826
Cumulative so far (<= 20): 73.91304347826085
Cumulative so far (<= 30): 95.65217391304344
Cumulative so far (<= 40): 99.99

In [4]:
# Remove any existing patch file of the form essai-<instance_id>.patch
# so the directory only holds patches produced by this run.
for stale_name in os.listdir(working_stage):
    if stale_name.startswith("essai") and stale_name.endswith(".patch"):
        os.remove(os.path.join(working_stage, stale_name))

retriever = GoldenRetriever(dataset_name=dataset, split=split, num_shards=None, shard_id=None)

# Write one patch file per "simple" instance and collect the ids that were written.
res = []
for instance_id in simple_df['instance_ids']:
    patch = retriever.gold_patches[instance_id]
    print(f"Instance ID: {instance_id}.")
    print(f"Patch:\n{patch}")
    patch_file = os.path.join(working_stage, f"essai-{instance_id}.patch")
    # Explicit UTF-8: patch text may contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(patch_file, 'w', encoding='utf-8') as patch_handle:
        patch_handle.write(f"Instance ID: {instance_id}\n")
        patch_handle.write(f"{patch}")
    res.append(instance_id)

res

Loading cache from ./working_stage/gold_retrieved_princeton-nlp__SWE-bench_Lite_dev.pkl
Instance ID: sqlfluff__sqlfluff-1625.
Patch:
diff --git a/src/sqlfluff/rules/L031.py b/src/sqlfluff/rules/L031.py
--- a/src/sqlfluff/rules/L031.py
+++ b/src/sqlfluff/rules/L031.py
@@ -211,7 +211,7 @@ def _lint_aliases_in_join(
             violation_buff.append(
                 LintResult(
                     anchor=alias_info.alias_identifier_ref,
-                    description="Avoid using aliases in join condition",
+                    description="Avoid aliases in from clauses and join conditions.",
                     fixes=fixes,
                 )
             )

Instance ID: sqlfluff__sqlfluff-2419.
Patch:
diff --git a/src/sqlfluff/rules/L060.py b/src/sqlfluff/rules/L060.py
--- a/src/sqlfluff/rules/L060.py
+++ b/src/sqlfluff/rules/L060.py
@@ -59,4 +59,8 @@ def _eval(self, context: RuleContext) -> Optional[LintResult]:
             ],
         )
 
-        return LintResult(context.segm

['sqlfluff__sqlfluff-1625',
 'sqlfluff__sqlfluff-2419',
 'sqlfluff__sqlfluff-1733',
 'sqlfluff__sqlfluff-1517',
 'sqlfluff__sqlfluff-1763',
 'marshmallow-code__marshmallow-1359',
 'marshmallow-code__marshmallow-1343',
 'pvlib__pvlib-python-1707',
 'pvlib__pvlib-python-1072',
 'pvlib__pvlib-python-1854',
 'pvlib__pvlib-python-1154',
 'pylint-dev__astroid-1333',
 'pylint-dev__astroid-1196',
 'pylint-dev__astroid-1866',
 'pylint-dev__astroid-1268',
 'pyvista__pyvista-4315',
 'pydicom__pydicom-1694',
 'pydicom__pydicom-1413',
 'pydicom__pydicom-1139',
 'pydicom__pydicom-1256']