© 2020 Nokia

Licensed under the BSD 3 Clause license

SPDX-License-Identifier: BSD-3-Clause

In [None]:
%load_ext autoreload
%autoreload 2

import os
import json
import time

from codesearch.data import load_snippet_collection, EVAL_DATASETS, eval_datasets_from_regex
from codesearch.encoders import BasicEncoder
from codesearch.bm25_retrieval import BM250RetrievalModel
from codesearch.evaluation import evaluate_and_dump 

# Wall-clock start time; the last cell subtracts it to report the total runtime.
start = time.time()

In [None]:
# Run configuration is taken from environment variables so the notebook can be
# parameterised from outside without editing it.

# Preprocessing overrides are JSON-encoded; a missing variable yields an empty dict.
text_overrides = json.loads(os.getenv("text_overrides", "{}"))
code_overrides = json.loads(os.getenv("code_overrides", "{}"))

# Where the retrieval model is saved; None (variable unset) skips saving.
model_filename = os.getenv("model_filename")

# Snippet collection to index and the datasets used for evaluation.
snippets_collection = os.getenv("snippets_collection", "so-ds-feb20")
valid_dataset = os.getenv("valid_dataset", "so-ds-feb20-valid")
test_dataset = os.getenv("test_dataset", "so-ds-feb20-test")

# Output directory passed to the evaluation step; defaults to the working directory.
output_dir = os.getenv("output_dir", ".")

In [None]:
# Display the text preprocessing overrides parsed from the environment.
text_overrides

In [None]:
# Display the code preprocessing overrides parsed from the environment.
code_overrides

## Load data

In [None]:
# Fail fast when a validation dataset is configured but is not a known
# evaluation dataset. An empty value skips the check.
if valid_dataset and valid_dataset not in EVAL_DATASETS:
    raise ValueError(
        f"Unknown validation dataset {valid_dataset!r}: not found in EVAL_DATASETS"
    )

# `test_dataset` is handed to eval_datasets_from_regex, which presumably expands
# it into the matching evaluation dataset names -- TODO confirm against codesearch.data.
test_datasets = eval_datasets_from_regex(test_dataset)

# Load the snippet collection that will be indexed by the retrieval model.
snippets = load_snippet_collection(snippets_collection)

In [None]:
# Display the collection name, validation dataset and test datasets used for this run.
snippets_collection, valid_dataset, test_datasets

## Preprocessing

In [None]:
# Encoder configured with the text/code preprocessing overrides read from the
# environment (empty dicts when no overrides were supplied).
enc = BasicEncoder(
    text_preprocessing_params=text_overrides,
    code_preprocessing_params=code_overrides,
)

## Create retrieval model

In [None]:
# Create the retrieval model on top of the encoder; snippet code is indexed,
# snippet descriptions are not.
retrieval_model = BM250RetrievalModel(enc, index_code=True, index_description=False)

# Index the snippet collection. The model is deliberately not saved before this
# point: a save here would only write an un-indexed model to `model_filename`,
# which the `model_filename` check in the following cell overwrites anyway.
retrieval_model.add_snippets(snippets)

In [None]:
# Persist the model only when a target filename was configured; with
# `model_filename` unset (None) or empty, nothing is written.
if model_filename:
    retrieval_model.save(model_filename)

## Evaluation

In [None]:
# Example natural-language queries passed to the evaluation as `sample_queries`.
sample_queries = [
    "train a tensorflow model",
    "plot a bar chart",
    "merge two dataframes",
    "sort a list",
    "read a pandas dataframe from a file",
    "plot an image",
]

# Preprocessing overrides used for this run, passed along with the results.
config = {
    "text": text_overrides,
    "code": code_overrides,
}

# Evaluate the model on the validation and test datasets; `output_dir` is
# passed as the destination argument.
evaluate_and_dump(
    retrieval_model,
    config,
    output_dir,
    valid_dataset,
    test_datasets,
    sample_queries=sample_queries,
)

In [None]:
# Elapsed wall-clock seconds since `start` was recorded in the first cell.
duration = time.time() - start
# Trailing expression: the notebook displays this string as the cell output.
f"Running the notebook took {duration} seconds"