# NCS Experiments

© 2020 Nokia

Licensed under the BSD 3 Clause license

SPDX-License-Identifier: BSD-3-Clause

In [6]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
# Value handed to nbconvert's --ExecutePreprocessor.timeout in the cells below.
timeout = 10 * 60 * 60  # 10 hours, in seconds

In [3]:
from pathlib import Path
import json
import os

# Available dataset configurations, keyed by the name of the output directory
# used for that dataset. Each entry holds the environment variables exported
# for the notebooks launched further down (presumably read by ncs.ipynb).
dataset_configs = {
    "so-ds-feb20": {
        "snippets_collection": "so-ds-feb20",
        "train_snippets_collection": "so-ds-feb20",
        "valid_dataset": "so-ds-feb20-valid",
        "test_dataset": "so-ds-feb20-test",
    },
    "conala": {
        "snippets_collection": "conala-curated",
        "train_snippets_collection": "conala-curated",
        "valid_dataset": "conala-curated-0.5-test",
        "test_dataset": "conala-curated-0.5-test",
    },
    "staqc-py": {
        "snippets_collection": "staqc-py-cleaned",
        "train_snippets_collection": "staqc-py-cleaned",
        "valid_dataset": "staqc-py-raw-valid",
        "test_dataset": "staqc-py-raw-test",
    },
}

# The dataset every experiment in this notebook runs on. Change this key to
# switch datasets instead of reordering assignment blocks.
selected_dataset = "staqc-py"

os.environ.update(dataset_configs[selected_dataset])

output_dir = Path(selected_dataset)
os.environ["output_dir"] = str(output_dir)
# Idempotent: succeeds when the directory already exists and creates missing
# parents; raises FileExistsError if the path exists but is not a directory.
output_dir.mkdir(parents=True, exist_ok=True)

## Preprocessing hyperparameters

In [None]:
# Code-side preprocessing overrides, one dict per experiment run.
# The empty dict is the run without any override.
code_overrides_ = [
    {},
    {"lemmatize": False},
    {"remove_stop": False},
    {"keep_comments": False},
    {"identifier_types": ["call", "import"]},  # without other identifiers
    {"identifier_types": ["attribute", "argument", "keyword_argument", "generic", "import"]},  # without calls
    {"identifier_types": ["attribute", "argument", "keyword_argument", "generic", "call"]},  # without import
    {"rstrip_numbers": False},
    {"keep_loops": False},
    {"keep_bin_ops": False},
    {"case_split": False},
]

# Text-side overrides: only the first three runs change anything. The rest is
# padded with empty dicts so both lists line up index by index; the padding is
# derived from the code list so the two cannot drift apart.
text_overrides_ = [{}, {"lemmatize": False}, {"remove_stop": False}]
text_overrides_ += [{} for _ in range(len(code_overrides_) - len(text_overrides_))]

# The lists are consumed pairwise with zip(), which would silently drop runs
# if they ever differed in length.
assert len(text_overrides_) == len(code_overrides_), "override lists must have equal length"

In [None]:
os.environ["fast_text_overrides"] = "{}"
os.environ.pop("zip_fn", None)

for i, (text_overrides, code_overrides) in enumerate(zip(text_overrides_, code_overrides_)):
    os.environ["text_overrides"] = json.dumps(text_overrides)
    os.environ["code_overrides"] = json.dumps(code_overrides)
    output_base = str(output_dir/f"ncs_preprocess_{i}")
    !python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


## Original ncs

In [None]:
text_overrides = {"lemmatize": False}
code_overrides = {"lemmatize": False,  "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False, "identifier_types": ["call", "import"]}
os.environ.pop("zip_fn", None)

os.environ["text_overrides"] = json.dumps(text_overrides)
os.environ["code_overrides"] = json.dumps(code_overrides)
output_base = str(output_dir/f"original_ncs")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


## Original ncs + variable names

In [None]:
text_overrides = {"lemmatize": False}
code_overrides = {"lemmatize": False,  "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False}
os.environ.pop("zip_fn", None)

os.environ["text_overrides"] = json.dumps(text_overrides)
os.environ["code_overrides"] = json.dumps(code_overrides)
os.environ["fast_text_overrides"] = json.dumps({})

output_base = str(output_dir/f"original_ncs+varnames")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


## Original ncs + zip fn

In [None]:
text_overrides = {"lemmatize": False}
code_overrides = {"lemmatize": False,   "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False,"identifier_types": ["call", "import"]}
os.environ.pop("zip_fn", None)

os.environ["text_overrides"] = json.dumps(text_overrides)
os.environ["code_overrides"] = json.dumps(code_overrides)
os.environ["zip_fn"] = "zip_descr_middle_and_start_end"
os.environ["fast_text_overrides"] = json.dumps({})


output_base = str(output_dir/f"original_ncs+zipfn")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  

## Original ncs + epochs

In [None]:
text_overrides = {"lemmatize": False}
code_overrides = {"lemmatize": False,  "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False, "identifier_types": ["call", "import"]}
fasttext_overrides = {"epoch": 30}
os.environ.pop("zip_fn", None)

os.environ["text_overrides"] = json.dumps(text_overrides)
os.environ["code_overrides"] = json.dumps(code_overrides)
os.environ["fast_text_overrides"] = json.dumps(fasttext_overrides)
output_base = str(output_dir/f"original_ncs+epochs")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


## Original ncs + window size

In [None]:
text_overrides = {"lemmatize": False}
code_overrides = {"lemmatize": False,  "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False, "identifier_types": ["call", "import"]}
fasttext_overrides = {"ws": 20}
os.environ.pop("zip_fn", None)

os.environ["text_overrides"] = json.dumps(text_overrides)
os.environ["code_overrides"] = json.dumps(code_overrides)
os.environ["fast_text_overrides"] = json.dumps(fasttext_overrides)
output_base = str(output_dir/f"original_ncs+ws")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  

## Original ncs + minCount

In [None]:
text_overrides = {"lemmatize": False}
code_overrides = {"lemmatize": False,   "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False, "identifier_types": ["call", "import"]}
fasttext_overrides = {"minCount": 1}
os.environ.pop("zip_fn", None)

os.environ["text_overrides"] = json.dumps(text_overrides)
os.environ["code_overrides"] = json.dumps(code_overrides)
os.environ["fast_text_overrides"] = json.dumps(fasttext_overrides)
output_base = str(output_dir/f"original_ncs+mincount")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


## Original ncs + minCount + epoch

In [None]:
text_overrides = {"lemmatize": False}
code_overrides = {"lemmatize": False,   "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False, "identifier_types": ["call", "import"]}
fasttext_overrides = {"minCount": 1, "epoch": 30}
os.environ.pop("zip_fn", None)

os.environ["text_overrides"] = json.dumps(text_overrides)
os.environ["code_overrides"] = json.dumps(code_overrides)
os.environ["fast_text_overrides"] = json.dumps(fasttext_overrides)
output_base = str(output_dir/f"original_ncs+mincount")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


### Conclusions

## Fasttext hyperparameters

### Fasttext 1 (initial exploration)

In [None]:
os.environ["text_overrides"] = "{}"
os.environ["code_overrides"] = "{}"
os.environ.pop("zip_fn", None)
fast_text_overrides_ = [{"ws": 10}, {"ws": 20}, {"ws": 30}, {"dim": 50}, {"epoch": 10}, {"neg": 10}, {"t": 0.01},{"t": 0.001}, {"t": 0.00001}]

for i, fast_text_overrides in enumerate(fast_text_overrides_):
    os.environ["fast_text_overrides"] = json.dumps(fast_text_overrides)
    output_base = str(output_dir/f"fasttext_{i}")
    !python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


#### Observations:

- Increasing window size helps
- Increasing the number of epochs helps
- Increasing the number of negative samples helps
- Lowering the sampling threshold does not help
- Decreasing the embedding dimensionality does not help

Window size, number of epochs, and negative samples all increase the number of times an embedding is updated. The next step is to tune the number of epochs and then check whether increasing the window size and the number of negative samples still helps.

### Fasttext 2: epochs

In [None]:
os.environ["text_overrides"] = "{}"
os.environ["code_overrides"] = "{}"
os.environ.pop("zip_fn", None)
fast_text_overrides_ = [{"epoch": 15}, {"epoch": 20}, {"epoch": 25}, {"epoch": 30}, {"epoch": 40}, {"epoch": 50}]

for i, fast_text_overrides in enumerate(fast_text_overrides_):
    os.environ["fast_text_overrides"] = json.dumps(fast_text_overrides)
    output_base = str(output_dir/f"fasttext_2.{i}")
    !python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


#### Observations

Training for more than 30 epochs does not help.

### Fasttext 3: epochs and windows

In [None]:
os.environ["text_overrides"] = "{}"
os.environ["code_overrides"] = "{}"
os.environ.pop("zip_fn", None)
epochs = [30]
windows = [10, 15, 20, 25, 30, 35, 40]
fast_text_overrides_ = [{"epoch": epoch, "ws": ws} for epoch in epochs for ws in windows]

for i, fast_text_overrides in enumerate(fast_text_overrides_):
    os.environ["fast_text_overrides"] = json.dumps(fast_text_overrides)
    output_base = str(output_dir/f"fasttext_3.{i}")
    !python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


#### Observations

Increasing window size still helps a lot.

### Fasttext 4: mincount

In [None]:
os.environ["text_overrides"] = "{}"
os.environ["code_overrides"] = "{}"
os.environ["fast_text_overrides"] = json.dumps({"minCount": 1, "epoch": 30, "ws": 20})
os.environ["zip_fn"] = "zip_descr_end"
output_base = str(output_dir/f"fasttext_4")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


## Zip function 
(How you combine code tokens and description tokens into a single fasttext *sentence/context*.)

In [None]:
os.environ["text_overrides"] = "{}"
os.environ["code_overrides"] = "{}"
os.environ["fast_text_overrides"] = json.dumps({"epoch": 30, "ws": 20, "minCount": 1})

zip_fns = ["zip_descr_start_end", "zip_descr_middle_and_start_end", "zip_descr_middle", "zip_descr_end"]

for i, zip_fn in enumerate(zip_fns):
    os.environ["zip_fn"] = zip_fn
    output_base = str(output_dir/f"zip_fn.{i}")
    !python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  


## Save best NCS hyperparam configuration

In [None]:
os.environ["text_overrides"] = json.dumps({"lemmatize": False})
os.environ["code_overrides"] = json.dumps({"lemmatize":False, "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False})
os.environ["fast_text_overrides"] = json.dumps({"epoch": 30, "ws": 20, "dim":100, "minCount": 1})
os.environ["zip_fn"] = "zip_descr_middle_and_start_end"
os.environ["model_filename"] = str(output_dir/"best_ncs_embedder")
output_base = str(output_dir/f"best")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  

os.environ["model_filename"] = ""

[NbConvertApp] Converting notebook ncs.ipynb to html
[NbConvertApp] Executing notebook with kernel: codesearch_ml4


## Best NCS ablation epoch

In [None]:
os.environ["text_overrides"] = json.dumps({"lemmatize": False})
os.environ["code_overrides"] = json.dumps({"lemmatize":False, "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False})
os.environ["fast_text_overrides"] = json.dumps({"ws": 20, "minCount": 1})
os.environ["zip_fn"] = "zip_descr_middle_and_start_end"
#os.environ["model_filename"] = "../trained_models/ncs-embedder-so.feb20"
output_base = str(output_dir/f"best-epoch")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  

os.environ["model_filename"] = ""

## Best NCS ablation variables

In [None]:
os.environ["text_overrides"] = json.dumps({"lemmatize": False})
os.environ["code_overrides"] = json.dumps({"lemmatize":False, "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False,"identifier_types": ["call", "import"] })
os.environ["fast_text_overrides"] = json.dumps({"epoch": 30, "ws": 20, "minCount": 1})
os.environ["zip_fn"] = "zip_descr_middle_and_start_end"
#os.environ["model_filename"] = "../trained_models/ncs-embedder-so.feb20"
output_base = str(output_dir/f"best-variables")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  

os.environ["model_filename"] = ""

## Best NCS ablation zip fn

In [None]:
os.environ["text_overrides"] = json.dumps({"lemmatize": False})
os.environ["code_overrides"] = json.dumps({"lemmatize":False, "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False})
os.environ["fast_text_overrides"] = json.dumps({"epoch": 30, "ws": 20, "minCount": 1})
os.environ.pop("zip_fn", None)
#os.environ["model_filename"] = "../trained_models/ncs-embedder-so.feb20"
output_base = str(output_dir/f"best-zip")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  

os.environ["model_filename"] = ""

## Best NCS ablation window size

In [None]:
os.environ["text_overrides"] = json.dumps({"lemmatize": False})
os.environ["code_overrides"] = json.dumps({"lemmatize":False, "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False})
os.environ["fast_text_overrides"] = json.dumps({"epoch": 30, "minCount": 1})
os.environ["zip_fn"] = "zip_descr_middle_and_start_end"
#os.environ["model_filename"] = "../trained_models/ncs-embedder-so.feb20"
output_base = str(output_dir/f"best-ws")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  

os.environ["model_filename"] = ""

## Best NCS ablation minCount

In [None]:
os.environ["text_overrides"] = json.dumps({"lemmatize": False})
os.environ["code_overrides"] = json.dumps({"lemmatize":False, "keep_loops": False, "keep_bin_ops": False, "rstrip_numbers": False})
os.environ["fast_text_overrides"] = json.dumps({"epoch": 30, "ws": 20})
os.environ["zip_fn"] = "zip_descr_middle_and_start_end"
#os.environ["model_filename"] = "../trained_models/ncs-embedder-so.feb20"
output_base = str(output_dir/f"best-mincount")
!python -m nbconvert ncs.ipynb --execute --NbConvertApp.output_base=$output_base --ExecutePreprocessor.timeout=$timeout  

os.environ["model_filename"] = ""

[NbConvertApp] Converting notebook ncs.ipynb to html
[NbConvertApp] Executing notebook with kernel: codesearch_ml4
Read 87M words
Number of words:  167684
Number of labels: 0
Progress:  23.9% words/sec/thread:   15705 lr:  0.038067 avg.loss:  0.335618 ETA:   1h49m 0s avg.loss:  0.609477 ETA:   2h24m52s29s words/sec/thread:   15703 lr:  0.049665 avg.loss:  0.591211 ETA:   2h22m14s22m10sh22m11s 0.049470 avg.loss:  0.598263 ETA:   2h21m48s avg.loss:  0.593532 ETA:   2h21m26s30s21m29s words/sec/thread:   15656 lr:  0.049242 avg.loss:  0.591274 ETA:   2h21m26s  15640 lr:  0.049194 avg.loss:  0.593165 ETA:   2h21m27s  15631 lr:  0.049155 avg.loss:  0.590066 ETA:   2h21m24s lr:  0.048998 avg.loss:  0.589546 ETA:   2h20m56s 0.048965 avg.loss:  0.589837 ETA:   2h20m50s words/sec/thread:   15617 lr:  0.048844 avg.loss:  0.588022 ETA:   2h20m39s  15587 lr:  0.048700 avg.loss:  0.582999 ETA:   2h20m30s  15582 lr:  0.048634 avg.loss:  0.578394 ETA:   2h20m21s  2.8% words/sec/thread:   15585 lr:  0.