# Title bag-of-words embedding

© 2020 Nokia

Licensed under the BSD 3 Clause license

SPDX-License-Identifier: BSD-3-Clause

## Setup

In [None]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
import time
import json
import os

from codesearch.data_config import DATASETS_DIR
from codesearch.data import load_train_dataset
from codesearch.data import load_snippet_collection, EVAL_DATASETS, eval_datasets_from_regex
from codesearch.encoders import BasicEncoder
from codesearch.utils import SaveableFunction
from codesearch import embedding_pretraining
from codesearch.embedding_pretraining import create_input_file_from_text, train_fasttext_model_from_text, load_fasttext_model
from codesearch.tnbow.tnbow_embedder import TnbowEmbedder
from codesearch.embedding_retrieval import EmbeddingRetrievalModel
from codesearch.evaluation import evaluate_and_dump 

# Wall-clock timestamp taken at setup; the final cell subtracts it from the
# current time to report how long the notebook took to run.
start = time.time()

This is a first embedding-based method for computing similarity between queries and snippet titles.

Read configuration parameters from environment variables (when this notebook is run as a script).

In [None]:
# Default file name of the preprocessed training text, resolved under DATASETS_DIR.
so_question_titles = "SO-python-question-title-feb20.tok.txt"

# Raw training text: load_train_dataset returns the filename and downloads
# the file if it is not yet present.
text_input_name = os.environ.get("text_input_raw", "so-python-question-titles-feb20")
text_input_raw = load_train_dataset(text_input_name)
# Preprocessed training text; created from text_input_raw later if it does not exist.
text_input = DATASETS_DIR/os.environ.get("text_input", so_question_titles)

# Optional settings: each one is None when its environment variable is unset.
encoder_checkpoint = os.environ.get("encoder_checkpoint")
fast_text_checkpoint = os.environ.get("fast_text_checkpoint")
model_filename = os.environ.get("model_filename")
embedder_filename = os.environ.get("embedder_filename")  # save embedder

# Note: the environment variable is "snippet_collection" (singular) while the
# Python variable is snippets_collection.
snippets_collection = os.environ.get("snippet_collection", "so-ds-feb20")
valid_dataset = os.environ.get("valid_dataset", "so-ds-feb20-valid")
test_dataset = os.environ.get("test_dataset", "so-ds-feb20-test")

# JSON-encoded override dictionaries. An unset *or empty* variable means
# "no overrides"; json.loads("") would otherwise raise JSONDecodeError.
text_overrides = json.loads(os.environ.get("text_overrides") or "{}")
fast_text_overrides = json.loads(os.environ.get("fast_text_overrides") or "{}")

# Directory handed to evaluate_and_dump in the evaluation cell.
output_dir = os.environ.get("output_dir", ".")

In [None]:
# Bare expression: shows the configured model filename and checkpoints as the cell output.
model_filename, fast_text_checkpoint, encoder_checkpoint

In [None]:
# Bare expression: shows the configured snippet collection name as the cell output.
snippets_collection

In [None]:
# Bare expression: shows the text/fastText overrides and training-text locations as the cell output.
text_overrides, text_input_raw, text_input, fast_text_overrides

## Load data

In [None]:
# Fail fast on a validation dataset name that is not registered in
# EVAL_DATASETS; an empty or unset name skips the check.
if valid_dataset and valid_dataset not in EVAL_DATASETS:
    raise ValueError(
        f"Unknown validation dataset {valid_dataset!r}: not found in EVAL_DATASETS"
    )
# test_dataset is passed to eval_datasets_from_regex (presumably a regex that
# may select several evaluation datasets - confirm against its definition).
test_datasets = eval_datasets_from_regex(test_dataset)
snippets = load_snippet_collection(snippets_collection)

## Preprocess data for fast text model

## Train or load embedding model

In [None]:
# Obtain `model` and `enc`: train from the text input when no fastText
# checkpoint is configured, otherwise restore both from the checkpoint.
if not fast_text_checkpoint:
    enc = BasicEncoder(text_preprocessing_params=text_overrides)

    # Create the preprocessed input file from the raw text only if it is missing.
    if not text_input.exists():
        create_input_file_from_text(text_input, text_input_raw, enc)

    model = train_fasttext_model_from_text(
        text_input,
        enc,
        fast_text_overrides,
        "./",
        save=False,
    )
else:
    model, enc = load_fasttext_model(fast_text_checkpoint)
    print("Loaded fasttext checkpoint")

### Embedding snippets & queries

In [None]:
# Wrap the encoder's encode_description method in a SaveableFunction and
# build the embedder from the model together with that wrapped function.
# NOTE(review): this rebinds `enc` to the wrapper, so re-running this cell on
# its own would look up encode_description on the wrapper rather than on the
# encoder - confirm that works, or re-run the training/loading cell first.
enc = SaveableFunction(enc.encode_description)
embedder = TnbowEmbedder.from_fasttext_model(model, enc)

### Create retrieval model

In [None]:
# Create the retrieval model on top of the embedder, then add the loaded
# snippet collection to it.
retrieval_model = EmbeddingRetrievalModel(embedder)
retrieval_model.add_snippets(snippets)

In [None]:
# Save the embedder only when a target filename was configured.
# NOTE(review): the embedder is saved under model_filename; embedder_filename
# is read in the configuration cell but is not used in the visible cells -
# confirm which variable is intended.
if model_filename:
    embedder.save(model_filename)

## Evaluation

In [None]:
# Free-text queries handed to evaluate_and_dump as sample_queries.
sample_queries = [
    "train a tensorflow model",
    "plot a bar chart",
    "merge two dataframes",
    "sort a list",
    "read a pandas dataframe from a file",
    "plot an image",
]
# Configuration passed along with the evaluation: the override dictionaries
# used for text preprocessing and for fastText.
config = dict(text=text_overrides, fasttext=fast_text_overrides)
evaluate_and_dump(
    retrieval_model,
    config,
    output_dir,
    valid_dataset,
    test_datasets,
    sample_queries=sample_queries,
)

In [None]:
# Report the total wall-clock run time, measured from `start` (set in the
# setup cell). print() is used instead of a bare expression so the message is
# also emitted when the notebook is run as a script, where the value of a
# bare expression is discarded.
duration = time.time() - start
print(f"Running the notebook took {duration} seconds")