© 2020 Nokia

Licensed under the BSD 3 Clause license

SPDX-License-Identifier: BSD-3-Clause

# Prepare Conala snippet collection and evaluation data

In [1]:
from pathlib import Path
import json
from collections import defaultdict

from codesearch.data import load_jsonl, save_jsonl

corpus_url = "http://www.phontron.com/download/conala-corpus-v1.1.zip"
conala_dir = Path("conala-corpus")
conala_train_fn = conala_dir/"conala-test.json"
conala_test_fn = conala_dir/"conala-train.json"
conala_mined_fn = conala_dir/"conala-mined.jsonl"

conala_snippets_fn = "conala-curated-snippets.jsonl"
conala_retrieval_test_fn = "conala-test-curated-0.5.jsonl"

if not conala_train_fn.exists():
    !wget $corpus_url
    !unzip conala-corpus-v1.1.zip

In [2]:
conala_mined = load_jsonl(conala_mined_fn)

The mined dataset seems to noisy to incorporate in the snippet collection:

In [3]:
!sed -n '10000,10009p;10010q' $conala_mined_fn

{"parent_answer_post_id": 19864272, "prob": 0.3778283882048184, "snippet": "\"\"\"{:.2E}\"\"\".format(Decimal('40800000000.00000000000000'))", "intent": "Display a decimal in scientific notation", "id": "6913532_19864272_0", "question_id": 6913532}
{"parent_answer_post_id": 39564738, "prob": 0.37777091356723147, "snippet": "madata.mean(axis=1)", "intent": "How can I use a 2D array of boolean rows to filter another 2D array?", "id": "39564421_39564738_1", "question_id": 39564421}
{"parent_answer_post_id": 35784295, "prob": 0.3777396155385873, "snippet": "mc.set_multi({'key': 'Hello', 'another': True})", "intent": "Correct way to load bulk data", "id": "35694060_35784295_0", "question_id": 35694060}
{"parent_answer_post_id": 38861665, "prob": 0.3777389688135013, "snippet": "[[1, 2], [3, 4, 5, 6], [7, 8, 9, 10, 11, 12], [13, 14]]", "intent": "Splitting a list into uneven groups?", "id": "38861457_38861665_3", "question_id": 38861457}
{"parent_answer_post_id": 14766816, "prob": 0.377727804

In [4]:
with open(conala_train_fn) as f:
    conala_train = json.load(f)

with open(conala_test_fn) as f:
    conala_test = json.load(f)

conala_all = conala_train + conala_test
conala_all[:2], len(conala_all), len(conala_train), len(conala_test)

([{'intent': 'How can I send a signal from a python program?',
   'rewritten_intent': 'send a signal `signal.SIGUSR1` to the current process',
   'snippet': 'os.kill(os.getpid(), signal.SIGUSR1)',
   'question_id': 15080500},
  {'intent': 'Decode Hex String in Python 3',
   'rewritten_intent': "decode a hex string '4a4b4c' to UTF-8.",
   'snippet': "bytes.fromhex('4a4b4c').decode('utf-8')",
   'question_id': 3283984}],
 2879,
 500,
 2379)

In [6]:
for s in conala_all:
    if s["rewritten_intent"] == "Convert the first row of numpy matrix `a` to a list":
        print(s)

{'intent': 'How to make List from Numpy Matrix in Python', 'rewritten_intent': 'Convert the first row of numpy matrix `a` to a list', 'snippet': 'numpy.array(a)[0].tolist()', 'question_id': 5183533}


In [48]:
question_ids = {r["question_id"] for r in conala_all}
intents = set(r["intent"] for r in conala_all)

len(question_ids), len(conala_all), len(intents)

(2074, 2879, 2089)

In [49]:
id2snippet = defaultdict(list)
for r in conala_all:
    id2snippet[r["question_id"]].append(r)

In [50]:
for r in conala_all:
    if not r["intent"]: 
        print(r)
    if r["intent"].lower() == (r["rewritten_intent"] or "").lower():
        print(r)

{'intent': 'How do I INSERT INTO t1 (SELECT * FROM t2) in SQLAlchemy?', 'rewritten_intent': 'How do I INSERT INTO t1 (SELECT * FROM t2) in SQLAlchemy?', 'snippet': "session.execute('INSERT INTO t1 (SELECT * FROM t2)')", 'question_id': 1849375}
{'intent': 'Django filter by hour', 'rewritten_intent': 'django filter by hour', 'snippet': "Entry.objects.filter(pub_date__contains='08:00')", 'question_id': 2984751}
{'intent': 'Reverse Y-Axis in PyPlot', 'rewritten_intent': 'reverse y-axis in pyplot', 'snippet': 'plt.gca().invert_yaxis()', 'question_id': 2051744}
{'intent': 'calculate the date six months from the current date', 'rewritten_intent': 'calculate the date six months from the current date', 'snippet': 'print((datetime.date.today() + datetime.timedelta(((6 * 365) / 12))).isoformat())', 'question_id': 546321}
{'intent': 'Find current directory', 'rewritten_intent': 'Find current directory', 'snippet': 'cwd = os.getcwd()', 'question_id': 5137497}
{'intent': 'get a list of locally insta

In [58]:
import random
random.seed(42)

snippets = []
eval_records = []
for question_id in id2snippet:
    snippets_ = [r for r in id2snippet[question_id] if r["rewritten_intent"]]
    if not snippets_: continue
    for i, record in enumerate(snippets_):

        snippet_record = {
            "id": f'{record["question_id"]}-{i}', 
            "code": record["snippet"], 
            "description": record["rewritten_intent"],
            "language": "python",
            "attribution": f"https://stackoverflow.com/questions/{record['question_id']}"
        }
        snippets.append(snippet_record)
    # occasionally snippets from the same question have a slightly different intent
    # to avoid similar queries, we create only one query per question
    query = random.choice(snippets_)["intent"]
    
    if any(query.lower() == r["description"].lower() for r in snippets[-len(snippets_):] ):
        print(f"filtering query {query}")
        continue
    relevant_ids = [r["id"] for r in snippets[-len(snippets_):] ]
    
    eval_records.append({"query": query, "relevant_ids": relevant_ids})
    
snippets[:2], len(snippets), eval_records[:2], len(eval_records)

filtering query How do I INSERT INTO t1 (SELECT * FROM t2) in SQLAlchemy?
filtering query Django filter by hour
filtering query Reverse Y-Axis in PyPlot
filtering query Find current directory
filtering query get a list of locally installed Python modules
filtering query Convert generator object to a dictionary
filtering query Change directory to the directory of a Python script
filtering query Disable abbreviation in argparse
filtering query Generate random integers between 0 and 9
filtering query Creating an empty list
filtering query get current time
filtering query What's the best way to search for a Python dictionary value in a list of dictionaries?
filtering query get current CPU and RAM usage
filtering query convert binary string to numpy array
filtering query Escaping quotes in string


([{'id': '15080500-0',
   'code': 'os.kill(os.getpid(), signal.SIGUSR1)',
   'description': 'send a signal `signal.SIGUSR1` to the current process',
   'language': 'python',
   'attribution': 'https://stackoverflow.com/questions/15080500'},
  {'id': '3283984-0',
   'code': "bytes.fromhex('4a4b4c').decode('utf-8')",
   'description': "decode a hex string '4a4b4c' to UTF-8.",
   'language': 'python',
   'attribution': 'https://stackoverflow.com/questions/3283984'}],
 2777,
 [{'query': 'How can I send a signal from a python program?',
   'relevant_ids': ['15080500-0']},
  {'query': 'Decode Hex String in Python 3', 'relevant_ids': ['3283984-0']}],
 2000)

In [59]:
id2snippet_ = {r["id"]: r for r in snippets}

In [60]:
for i, eval_record in enumerate(eval_records):
    print(f"Query: {eval_record['query']}")
    print(f"Relevant descriptions: {[id2snippet_[id]['description'] for id in eval_record['relevant_ids']]}")
    if i == 10:
        break

Query: How can I send a signal from a python program?
Relevant descriptions: ['send a signal `signal.SIGUSR1` to the current process']
Query: Decode Hex String in Python 3
Relevant descriptions: ["decode a hex string '4a4b4c' to UTF-8."]
Query: check if all elements in a list are identical
Relevant descriptions: ['check if all elements in list `myList` are identical']
Query: Format string dynamically
Relevant descriptions: ['format number of spaces between strings `Python`, `:` and `Very Good` to be `20`']
Query: How I can get rid of None values in dictionary?
Relevant descriptions: ['get rid of None values in dictionary `kwargs`', 'get rid of None values in dictionary `kwargs`']
Query: Python: how to get the final output of multiple system commands?
Relevant descriptions: ['capture final output of a chain of system commands `ps -ef | grep something | wc -l`']
Query: splitting and concatenating a string
Relevant descriptions: ["concatenate a list of strings `['a', 'b', 'c']`"]
Query: F

In [7]:
from codesearch.text_preprocessing import compute_overlap

compute_overlap("this is a test", "test test")

Initializing spacy nlp \
Initialized spacy nlp


(1, 1.0)

In [62]:
overlaps = []
filtered_eval_records = []
for r in eval_records:
    query = r["query"]
    descriptions = [id2snippet_[id]['description'] for id in r['relevant_ids']]
    overlap = max(compute_overlap(query, d)[1] for d in descriptions)
    overlaps.append(overlap)
    
    if overlap < 0.5 :
        filtered_eval_records.append(r)
filtered_eval_records[:2], len(filtered_eval_records)

([{'query': 'How can I send a signal from a python program?',
   'relevant_ids': ['15080500-0']},
  {'query': 'Finding the intersection between two series in Pandas',
   'relevant_ids': ['18079563-0']}],
 762)

In [63]:
save_jsonl(conala_snippets_fn, snippets)
save_jsonl(conala_retrieval_test_fn, filtered_eval_records)