© 2020 Nokia

Licensed under the BSD 3 Clause license

SPDX-License-Identifier: BSD-3-Clause

# Create duplicates datasets for training on PACS benchmark

We filter the duplicates dataset so that it has no overlap with the snippet collections or the evaluation queries of the PACS benchmark. We also remove from the validation sets every query that occurs in one of the test sets.

In [1]:
import os

# Route HTTP traffic through a hard-coded proxy address. This is set first,
# presumably so that it is in place for the dataset downloads further down.
os.environ["http_proxy"] = "http://135.245.192.7:8000"

In [2]:
from itertools import chain

from codesearch.data import load_train_dataset, load_snippet_collection, load_eval_dataset, save_jsonl, load_jsonl

# Duplicates training data; each record is later read through its "original"
# and "duplicates" fields.
duplicates = load_train_dataset("so-duplicates-feb20")
# Snippet collections whose ids and descriptions must not leak into training.
conala_snippets = load_snippet_collection("conala-curated")
staqc_snippets = load_snippet_collection("staqc-py-cleaned")
so_ds = load_snippet_collection("so-ds-feb20")

# Evaluation sets: load_eval_dataset returns a pair, of which only the first
# element (the queries) is kept here.
conala_test_queries,_ = load_eval_dataset("conala-curated-0.5-test")
staqc_valid_queries, _ = load_eval_dataset("staqc-py-raw-valid")
staqc_test_queries, _ = load_eval_dataset("staqc-py-raw-test")
so_ds_valid_queries, _ = load_eval_dataset("so-ds-feb20-valid")
so_ds_test_queries, _ = load_eval_dataset("so-ds-feb20-test")


Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/staqc-py-valid-raw-pacsv1.jsonl.gz
Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/staqc-py-test-raw.jsonl.gz
Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/so-ds-feb20-valid-pacsv1.jsonl.gz
Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/so-ds-feb20-test.jsonl.gz


In [3]:
# Peek at the first two records and the size of each snippet collection.
(
    conala_snippets[:2],
    len(conala_snippets),
    staqc_snippets[:2],
    len(staqc_snippets),
    so_ds[:2],
    len(so_ds),
)

([{'id': '15080500-0',
   'code': 'os.kill(os.getpid(), signal.SIGUSR1)',
   'description': 'send a signal `signal.SIGUSR1` to the current process',
   'language': 'python',
   'attribution': 'https://stackoverflow.com/questions/15080500'},
  {'id': '3283984-0',
   'code': "bytes.fromhex('4a4b4c').decode('utf-8')",
   'description': "decode a hex string '4a4b4c' to UTF-8.",
   'language': 'python',
   'attribution': 'https://stackoverflow.com/questions/3283984'}],
 2777,
 [{'code': "\n# Create a user account if we haven't found one yet.\n'social.pipeline.user.create_user',\n",
   'attribution': 'https://stackoverflow.com/questions/36922535',
   'language': 'python',
   'rawDescription': 'Use Python social auth to only get tokens',
   'id': '36922535_0',
   'description': 'Use Python social auth to only get tokens'},
  {'code': "import ssl\nimport os\n# get the https certificate\ncert = ssl.get_server_certificate(('example.com', 443))\n# append it to my personal chain\npem_path = os.pat

In [4]:
# Ids referenced by the snippet collections. Each collection encodes the id
# differently, hence one expression per collection.
ids_to_filter = {snippet["id"].split("-")[0] for snippet in conala_snippets}
ids_to_filter |= {snippet["id"].split("_")[0] for snippet in staqc_snippets}
# NOTE(review): `attribution` is indexed with [-1] before splitting on "/".
# That only yields a full id if so-ds stores `attribution` as a list of URLs;
# the conala/staqc records previewed earlier show a plain string. Confirm.
ids_to_filter |= {
    str(int(snippet["attribution"][-1].split("/")[-1])) for snippet in so_ds
}

# Lower-cased snippet descriptions, so later matching is case-insensitive.
descriptions_to_filter = {
    snippet["description"].lower()
    for snippet in chain(conala_snippets, staqc_snippets, so_ds)
}

queries = chain(
    conala_test_queries,
    staqc_valid_queries,
    staqc_test_queries,
    so_ds_valid_queries,
    so_ds_test_queries,
)

# Evaluation queries are filtered by text as well. Every 1000th query is
# echoed as a spot check.
for position, query in enumerate(queries):
    if position % 1000 == 0:
        print(query)
    descriptions_to_filter.add(query.lower())

How can I send a signal from a python program?
How to access python class attributes dynamically?
Python different value when printing and reading
python: if column condition met change value in that column
How to break an import line in python?
How to add attributes in input element with Beautifulsoup
Is Python's order of evaluation of function arguments and operands deterministic (+ where is it documented)?
JPG image into matrix Using python
Setting different color for each series in scatter plot on matplotlib


In [5]:
# Sanity check: number of distinct ids collected for filtering.
len(ids_to_filter)

133802

In [6]:
# Split the duplicate groups into those safe for training and those that touch
# the benchmark, either through a post id or through a (lower-cased) description.
duplicates_train, removed_duplicates = [], []
num_filtered_id_only = 0
num_filtered_descr_only = 0

for record in duplicates:
    # (id, description) pairs: the original post first, then its duplicates.
    original_id, original_descr = record["original"]
    post_ids = [original_id]
    post_descriptions = [original_descr]
    for duplicate_id, duplicate_descr in record["duplicates"]:
        post_ids.append(duplicate_id)
        post_descriptions.append(duplicate_descr)

    id_hit = any(post_id in ids_to_filter for post_id in post_ids)
    descr_hit = any(
        text.lower() in descriptions_to_filter for text in post_descriptions
    )

    if not (id_hit or descr_hit):
        # Only the descriptions are kept for training; ids are dropped.
        duplicates_train.append(post_descriptions)
    else:
        removed_duplicates.append(record)

    # Track how many groups were caught by exactly one of the two criteria.
    if id_hit and not descr_hit:
        num_filtered_id_only += 1
    elif descr_hit and not id_hit:
        num_filtered_descr_only += 1

(
    len(duplicates_train),
    len(duplicates),
    duplicates_train[0],
    num_filtered_descr_only,
    num_filtered_id_only,
)

(187968,
 195498,
 ["How do I check for nulls in an '==' operator overload without infinite recursion?",
  'Best way to handle null when writing equals operator',
  "How to compare two objects of different types where one inherits the other's type",
  'Overriding == operator. How to compare to null?',
  "When overloading the equality operator, what's the best way to handle null values?",
  '(C#) Problems when overloading the == operator',
  'How can i implement == and check for null in c#',
  'C# equality operators override (== and !=)',
  'Why Use Value Equality On a Reference Type',
  'Overriding Equals/GetHashCode for class in order to use hashset Contains/ExceptWith/UnionWith',
  'How can I ignore an operator overload',
  'how to avoid stackoverflow in == overload',
  'Operator overloading giving error',
  'What is "Best Practice" For Comparing Two Instances of a Reference Type?',
  'C# operator == check for null',
  'Having problems comparing two custom class objects'],
 34,
 304)

In [27]:
# Write the filtered training data: one list of descriptions per duplicate group.
save_jsonl("so-duplicates-pacsv1-train.jsonl", duplicates_train)

In [21]:
# Reload the evaluation queries. These are the same calls as in the loading
# cell at the top of the notebook.
# NOTE(review): looks redundant on a fresh top-to-bottom run; confirm before removing.
conala_test_queries,_ = load_eval_dataset("conala-curated-0.5-test")
staqc_valid_queries, _ = load_eval_dataset("staqc-py-raw-valid")
staqc_test_queries, _ = load_eval_dataset("staqc-py-raw-test")
so_ds_valid_queries, _ = load_eval_dataset("so-ds-feb20-valid")
so_ds_test_queries, _ = load_eval_dataset("so-ds-feb20-test")

Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/conala-test-curated-0.5.jsonl.gz
Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/staqc-py-valid-raw.jsonl.gz
Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/staqc-py-test-raw.jsonl.gz
Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/so-ds-feb20-valid.jsonl.gz
Downloading dataset from http://codebook.dyn.nesc.nokia.net:8089/codesearch-experiments/datasets/pacsv1/so-ds-feb20-test.jsonl.gz


In [22]:

# Load the full validation records (not just the queries); each record is
# later read through its "query" field.
# NOTE(review): assumes these JSONL files exist in the working directory,
# presumably as the unpacked downloads; confirm.
staqc_valid_ds = load_jsonl("staqc-py-valid-raw.jsonl")
so_ds_valid_ds = load_jsonl("so-ds-feb20-valid.jsonl")

In [23]:

# Lower-cased queries of all three test sets, for case-insensitive lookups.
test_queries = {
    query.lower()
    for query in chain(conala_test_queries, staqc_test_queries, so_ds_test_queries)
}


def filter_test_queries(ds, excluded_queries=None):
    """Return the records of `ds` whose query is not an excluded query.

    The comparison is case-insensitive on the record side: each record's
    "query" value is lower-cased before the membership test, so
    `excluded_queries` is expected to contain lower-cased strings.

    Args:
        ds: iterable of records; each record must have a "query" string.
        excluded_queries: container of lower-cased queries to drop. When
            omitted (None), the notebook-level `test_queries` set is used,
            which keeps existing one-argument calls working unchanged.

    Returns:
        A new list with the kept records, in their original order. The input
        is not modified.
    """
    if excluded_queries is None:
        # Looked up at call time, so the set must be defined before calling.
        excluded_queries = test_queries
    return [r for r in ds if r["query"].lower() not in excluded_queries]
    
# Drop every validation record whose query also occurs in a test set.
staqc_valid_ds_filtered = filter_test_queries(staqc_valid_ds)
so_ds_valid_ds_filtered = filter_test_queries(so_ds_valid_ds)

# Sizes before and after filtering, for both validation sets.
len(staqc_valid_ds), len(staqc_valid_ds_filtered), len(so_ds_valid_ds), len(so_ds_valid_ds_filtered)

(2748, 2599, 1112, 946)

In [24]:
# Persist the filtered validation sets under their "-pacsv1" file names.
save_jsonl("staqc-py-valid-raw-pacsv1.jsonl", staqc_valid_ds_filtered)
save_jsonl("so-ds-feb20-valid-pacsv1.jsonl", so_ds_valid_ds_filtered)
