© 2020 Nokia

Licensed under the BSD 3 Clause license

SPDX-License-Identifier: BSD-3-Clause

In [63]:
# Benchmarks analysed in this notebook. Each entry is a 4-tuple:
#   (display name, snippet collection, validation set, test set)
# The validation set is None when the benchmark has none listed.
datasets = [
    (
        "CoNaLa",
        "conala-curated",
        None,
        "conala-curated-0.5-test",
    ),
    (
        "StaQC-py",
        "staqc-py-cleaned",
        "staqc-py-raw-valid",
        "staqc-py-raw-test",
    ),
    (
        "SO-DS",
        "so-ds-feb20",
        "so-ds-feb20-valid",
        "so-ds-feb20-test",
    ),
]

## Compute overlap between queries and snippet descriptions

In [64]:
from codesearch.data import load_eval_dataset, load_snippet_collection
from codesearch.text_preprocessing import compute_overlap

def queries_and_descriptions(snippet_collection, eval_dataset):
    """Pair each evaluation query with the descriptions of its matching snippets.

    Args:
        snippet_collection: Identifier handed to ``load_snippet_collection``.
        eval_dataset: Identifier handed to ``load_eval_dataset``.

    Returns:
        A list of ``(query, descriptions)`` tuples, one per query in the
        evaluation set, where ``descriptions`` lists the ``"description"``
        field of every snippet id associated with that query.

    Raises:
        KeyError: If a query references a snippet id that is not present in
            the snippet collection.
    """
    # Only the query -> snippet-ids mapping of the evaluation set is needed.
    _, matches_by_query = load_eval_dataset(eval_dataset)

    # Index the collection by id so each match is a dictionary lookup.
    snippet_by_id = {
        snippet["id"]: snippet
        for snippet in load_snippet_collection(snippet_collection)
    }

    return [
        (
            query,
            [snippet_by_id[snippet_id]["description"] for snippet_id in matching_ids],
        )
        for query, matching_ids in matches_by_query.items()
    ]



In [66]:

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

def plot_overlaps(query_rel_overlaps, dataset,
                  output_dir='/Users/heyman/Documents/code-search-paper/figures'):
    """Save a histogram of per-query overlap scores as a PDF.

    The histogram uses bin edges from ``np.arange(0, 1.1, 0.1)``. The mean is
    printed, drawn as a red vertical line and annotated next to that line.

    Args:
        query_rel_overlaps: Sequence of overlap scores, one per query.
        dataset: Dataset name; used as the plot title and in the file name.
        output_dir: Directory the PDF is written to. The default is the
            original author's local path and is kept only for backward
            compatibility; pass a directory that exists on your machine.

    Side effects:
        Prints the mean and writes
        ``{output_dir}/query_rel_overlap_{dataset}.pdf``.
    """
    # An explicit figure avoids drawing on whatever pyplot figure is current.
    fig, ax = plt.subplots()
    try:
        ax.hist(query_rel_overlaps, bins=np.arange(0, 1.1, 0.1))
        ax.set_title(dataset)
        ax.set_xlabel('word overlap (query vs matching snippet description)', fontsize=12)
        ax.set_ylabel('# queries', fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=11)
        ax.tick_params(axis='both', which='minor', labelsize=11)

        mean = np.array(query_rel_overlaps).mean()
        print(mean)
        ax.axvline(mean, color='red', linewidth=2)
        # x is in data coordinates (just right of the mean line), y is a
        # fraction of the axes height so the label stays near the top.
        ax.annotate('mean = {:0.2f}'.format(mean), xy=(mean + 0.05, .95),
                    xycoords=('data', 'axes fraction'), color='red')

        fig.savefig(f'{output_dir}/query_rel_overlap_{dataset}.pdf')
    finally:
        # Release the figure even if drawing or saving fails.
        plt.close(fig)

# For every benchmark, collect one overlap score per test query and plot the
# distribution. The validation set (third tuple field) is not used here.
for name, snippet_collection, _, test_dataset in datasets:
    overlaps = []
    print(name, snippet_collection, test_dataset)
    for q, descriptions in queries_and_descriptions(snippet_collection, test_dataset):
        # compute_overlap yields a pair per description; only the second value
        # of the maximal pair is kept. If the pairs are tuples, max() orders
        # them by the first value and uses the second only to break ties.
        # NOTE(review): presumably the pair is (absolute, relative) overlap --
        # confirm against codesearch.text_preprocessing.compute_overlap.
        # max() raises ValueError when a query has no descriptions.
        _, overlap = max(compute_overlap(q, d) for d in descriptions)
        overlaps.append(overlap)
    plot_overlaps(overlaps, name)

CoNaLa conala-curated conala-curated-0.5-test
0.27871314136946373
StaQC-py staqc-py-cleaned staqc-py-raw-test
0.2851193095778236
SO-DS so-ds-feb20 so-ds-feb20-test
0.27731259865052305



Summary statistics reported for each dataset:

- snippet collection:
    - size (\# snippets)
    - average description length (\# tokens)
    - average snippet length (\# LOC)
- ground truth (validation set):
    - size (\# queries)
    - average \# matching snippets per query
- ground truth (test set):
    - size (\# queries)
    - average \# matching snippets per query


In [83]:

def description_len(snippets):
    """Return the mean number of whitespace-separated tokens per description.

    Args:
        snippets: Sized iterable of mappings with a ``"description"`` string.

    Returns:
        Total token count divided by the number of snippets.

    Raises:
        ZeroDivisionError: If ``snippets`` is empty.
    """
    total_tokens = sum(
        len(snippet["description"].strip().split()) for snippet in snippets
    )
    return total_tokens / len(snippets)

def snippet_len(snippets):
    """Return the mean number of lines per snippet's ``"code"`` string.

    Lines are delimited by ``"\\n"`` only, so a trailing newline adds one
    (empty) line to the count and an empty string counts as one line.

    Args:
        snippets: Sized iterable of mappings with a ``"code"`` string.

    Returns:
        Total line count divided by the number of snippets.

    Raises:
        ZeroDivisionError: If ``snippets`` is empty.
    """
    # Number of "\n"-separated pieces == number of "\n" characters + 1.
    total_lines = sum(snippet["code"].count("\n") + 1 for snippet in snippets)
    return total_lines / len(snippets)

def summarize_snippet_collection(snippet_collection):
    """Load a snippet collection and summarise its size and average lengths.

    Args:
        snippet_collection: Identifier handed to ``load_snippet_collection``.

    Returns:
        A 3-tuple ``(size, description_length, snippet_length)``: the number
        of snippets, the result of ``description_len`` and the result of
        ``snippet_len`` for the loaded collection.
    """
    loaded = load_snippet_collection(snippet_collection)
    return len(loaded), description_len(loaded), snippet_len(loaded)

def summarize_eval_dataset(eval_dataset):
    """Summarise an evaluation set: number of queries and matches per query.

    Args:
        eval_dataset: Identifier handed to ``load_eval_dataset``, or a falsy
            value (e.g. ``None``) when the benchmark has no such set.

    Returns:
        A 2-tuple ``(size, avg_matches)``:
        - ``(nan, nan)`` when ``eval_dataset`` is falsy;
        - ``(0, nan)`` when the set loads but contains no queries (the
          average is undefined, so NaN is used instead of dividing by zero);
        - otherwise the number of queries and the mean number of matching
          snippet ids per query.
    """
    if not eval_dataset:
        return np.nan, np.nan

    # Only the query -> snippet-ids mapping of the evaluation set is needed.
    _, query2id = load_eval_dataset(eval_dataset)
    size = len(query2id)
    if size == 0:
        # Same "no value" convention as the missing-dataset case above.
        return 0, np.nan

    num_matching = sum(len(ids) for ids in query2id.values())
    return size, num_matching / size

# Print the benchmark name followed by one row of summary statistics, with
# cells joined by ' & ' (presumably for pasting into a LaTeX table).
for name, snippet_collection, valid_dataset, test_dataset in datasets:
    snippet_summary = summarize_snippet_collection(snippet_collection)
    valid_summary = summarize_eval_dataset(valid_dataset)
    test_summary = summarize_eval_dataset(test_dataset)
    print(name)
    # The three summaries are tuples, so `+` concatenates them into 7 values:
    # collection size, description length, snippet length, then
    # (# queries, matches per query) for the validation and the test set.
    # Every value, counts included, is formatted with two decimals.
    print(' & '.join([ '{:0.2f}'.format(x) for x in (snippet_summary + valid_summary + test_summary)]))



CoNaLa
2777.00 & 10.32 & 1.07 & nan & nan & 762.00 & 1.17
StaQC-py
203700.00 & 8.36 & 9.84 & 2599.00 & 2.01 & 2749.00 & 3.40
SO-DS
12137.00 & 7.35 & 14.98 & 947.00 & 1.69 & 1113.00 & 1.70
