                 ____       __       _                 __   ______            _ __      __     __
                / __ \___  / /______(_)__ _   ______ _/ /  / ____/___  ____  (_) /___  / /_   / /
               / /_/ / _ \/ __/ ___/ / _ \ | / / __ `/ /  / /   / __ \/ __ \/ / / __ \/ __/  / / 
              / _, _/  __/ /_/ /  / /  __/ |/ / /_/ / /  / /___/ /_/ / /_/ / / / /_/ / /_   /_/  
             /_/ |_|\___/\__/_/  /_/\___/|___/\__,_/_/   \____/\____/ .___/_/_/\____/\__/  (_)   
                                                                   /_/                           
                                       (learning every nanosecond)

## Getting code
The goal of this algorithm is to retrieve a relevant code snippet given an English description and a series of already defined code/description pairs.
![ret](../images/retrievalHighLevel.png)

In [1]:
import linecache
import pyndri
import os
import sys
import re
import numpy as np
np.random.seed(0)
import re, string, timeit
from colored import fg, bg, attr
import warnings
warnings.simplefilter('ignore')
from tensor2tensor.utils import bleu_hook
from multiprocessing import Process, Manager
import copy

## Indri 
Indri is the retrieval engine that I am currently using since it has a nice interface with python and has some of the algorithms I need.

Indri takes files in an XML format. Sentence pairs are usually stored line by line in a file. So we will need to convert from single line to formatted XML.

### Datasets 
We will currently make a full explanation for only one dataset: Django. This is because it is relatively small (18k sentences) and clean. Further descriptions and analysis are found in other notebooks in this directory.

In [2]:
def show_sample(fp, src_ext=".src", tgt_ext=".tgt", lines=[3,21,80,99]):
    linecache.clearcache()
    for l in lines:
        print("LINE: {} \nSOURCE:    {} \nTARGET:     {}\n".format(l, 
                                                                   linecache.getline(fp+src_ext, l), 
                                                                   linecache.getline(fp+tgt_ext, l)))

In [3]:
django_fp = "../datasets/django/all"
show_sample(django_fp, src_ext=".desc", tgt_ext=".code", lines=[13,14])

LINE: 13 
SOURCE:      define the function get_cache with backend and dictionary pair of elements kwargs as arguments.
 
TARGET:         def get_cache ( backend , ** kwargs ) :


LINE: 14 
 




In [4]:
!head -5 ../datasets/django/all.desc 

  from threading import local into default name space.
  from django.conf import settings into default name space.
  from django.core import signals into default name space.


In [5]:
!head -5 ../datasets/django/all.code

 from threading import local
  from django . conf import settings
 from django . core import signals


In [6]:
dirName = "temp"
 
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " Created ") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  temp  already exists


### Dataset and train / test split
Copy the full dataset to the temp folder. We then split the data into a training and testing set at around 90% / 10%

In [7]:
train_ratio = 0.9 # this means 90% of the data will be used for training, thus 10% for testing
num_samples = sum(1 for line in open(django_fp + ".desc"))
train_cutoff = int(num_samples * train_ratio)

lines = np.arange(num_samples)
np.random.shuffle(lines)

train_lines = lines[:train_cutoff]
test_lines = lines[train_cutoff:]

In [8]:
train_fp = "temp/retrieval_train"
test_fp = "temp/retrieval_test"

##### Train split for .desc and .code

In [9]:
with open(train_fp + ".desc", "w") as out:
    for l in train_lines:
        src = linecache.getline(django_fp + ".desc", l)
        out.write(src)

In [10]:
with open(train_fp + ".code", "w") as out:
    for l in train_lines:
        src = linecache.getline(django_fp + ".code", l)
        out.write(src)

##### Test split for .desc and .code

In [11]:
with open(test_fp + ".desc", "w") as out:
    for l in test_lines:
        src = linecache.getline(django_fp + ".desc", l)
        out.write(src)

In [12]:
with open(test_fp + ".code", "w") as out:
    for l in test_lines:
        src = linecache.getline(django_fp + ".code", l)
        out.write(src)

### Convert to TrecText format

In [13]:
with open(train_fp + ".desc", "r") as f, open("temp/train_desc.trectext", "w") as out:
    count = 0
    while True:
        line = f.readline()
        
        if not line :
            break
            
        out.write("<DOC>\n  <DOCNO>{}</DOCNO>\n  <TEXT>\n{}  </TEXT>\n</DOC>\n".format(count, line))
        count += 1

### Create the index with indri
To create an index we need to supply Indri with a parameter file specifying how to handle each document. Indri will then generate an index folder with is fast to query.

In [14]:
with open("temp/IndriBuildIndex.conf", "w") as out:
    conf = """
<parameters>
<index>temp/django_index/</index>
<memory>1024M</memory>
<storeDocs>true</storeDocs>
<corpus><path>temp/train_desc.trectext</path><class>trectext</class></corpus>
<stemmer><name>krovetz</name></stemmer>
</parameters>"""
    
    out.write(conf)
    

In [15]:
!IndriBuildIndex temp/IndriBuildIndex.conf

kstem_add_table_entry: Duplicate word emeritus will be ignored.
kstem_add_table_entry: Duplicate word emeritus will be ignored.
0:00: Opened repository temp/django_index/
0:00: Opened temp/train_desc.trectext
0:00: Documents parsed: 16923 Documents indexed: 0
0:00: Closed temp/train_desc.trectext
0:00: Closing index
0:00: Finished


In [16]:
index = pyndri.Index("temp/django_index/")
env = pyndri.TFIDFQueryEnvironment(index, k1=1.2, b=0.75)

## Query example

In [17]:
results = env.query('error handler', results_requested=5)

In [18]:
show_sample(train_fp, src_ext=".desc", tgt_ext=".code", lines=[doc[0] for doc in results])

LINE: 1718 
SOURCE:      define the method resolve_error_handler with arguments self and view_type.
 
TARGET:        def resolve_error_handler ( self , view_type ) :


LINE: 15350 
SOURCE:      for every handler in handlers,
 
TARGET:                               for handler in handlers :


LINE: 10247 
SOURCE:      for every handler in handlers,
 
TARGET:          for handler in handlers :


LINE: 6919 
SOURCE:      substitute self._upload_handlers for handlers.
 
TARGET:      handlers = self . _upload_handlers


LINE: 2756 
SOURCE:      substitute upload_handlers for self._upload_handlers.
 
TARGET:       self . _upload_handlers = upload_handlers




In [19]:
top_code = [linecache.getline(django_fp + ".code", doc[0]) for doc in results]

We need to remove punctuation from input query strings so that Indri accepts them.

In [20]:
pattern = re.compile(r'[^\w\s]')

In [23]:
linecache.clearcache()
with open(test_fp + ".desc", "r") as f, open("temp/retrieval_predictions" + ".code", "w") as out:
    lines = f.readlines()
    l = 1
    for line in lines:
        result = env.query(pattern.sub(" ", line), results_requested=1)
        if result != ():
            out.write(linecache.getline(train_fp + ".code",result[0][0]))
        else:
            out.write("\n")
            print("LINE: {} \n{}{}QUERY:{}    {}\nSANITIZED QUERY      {}\n{}{}PRED DESCRIPTION{}:   ######### NO PREDICTION ##########\n".format(
                l, 
                fg(24), 
                bg(85),
                attr(0),
                line,
                fg(24), 
                bg(217),
                attr(0),
                line.translate(str.maketrans('', '', string.punctuation))))
        if l % 500 == 1:
            print("LINE: {} \n{}{}QUERY:{}    {}\n{}{}PRED DESCRIPTION:{}    {} \n{}{}PRED CODE:{}     {}\n{}{}TRUTH:{}    {}\n".format(
                l, 
                fg(24), 
                bg(85),
                attr(0),
                line, 
                fg(24), 
                bg(153),
                attr(0),
                linecache.getline(train_fp + ".desc", result[0][0]), 
                fg(24), 
                bg(153),
                attr(0),
                linecache.getline(train_fp + ".code", result[0][0]),
                fg(24), 
                bg(85),
                attr(0),
                linecache.getline(test_fp + ".code", l)))
        l += 1

LINE: 1 
[38;5;24m[48;5;85mQUERY:[0m      raise a ValidationError exception with 2 arguments: return value of the function _ called with an argument string 'Enter a valid IPv6 address.', and code set to string 'invalid'.

[38;5;24m[48;5;153mPRED DESCRIPTION:[0m      raise a ValidationError exception with 2 arguments: return value of the function _ called with an argument string 'Enter a valid IPv4 or IPv6 address.', and code set to string 'invalid'.
 
[38;5;24m[48;5;153mPRED CODE:[0m                   raise ValidationError ( _ ( 'Enter a valid IPv4 or IPv6 address.' ) , code = 'invalid' )

[38;5;24m[48;5;85mTRUTH:[0m              raise ValidationError ( _ ( 'Enter a valid IPv6 address.' ) , code = 'invalid' )


LINE: 501 
[38;5;24m[48;5;85mQUERY:[0m      convert max_entries into a string, substitute it for self._max_entries.

[38;5;24m[48;5;153mPRED DESCRIPTION:[0m      convert self into a string and return it.
 
[38;5;24m[48;5;153mPRED CODE:[0m               retur

### Calculating BLEU Score
We use a librarry from Google's tensor2tensor library. This way we ensure the methods are correct by being actively maintained. The method takes two files and compares them line by line.

The only disadvantage is that it doesn't work with TF2.0 yet. Curently using TF1.14.0

In [21]:
bleu = 100 * bleu_hook.bleu_wrapper("temp/retrieval_test.code", "temp/retrieval_predictions.code",
                                          case_sensitive=False)
print("BLEU_uncased = %6.2f" % bleu)

W0811 23:07:14.371142 140469340874496 deprecation_wrapper.py:119] From /usr/local/lib/python3.5/dist-packages/tensor2tensor/utils/bleu_hook.py:205: The name tf.gfile.Open is deprecated. Please use tf.io.gfile.GFile instead.



BLEU_uncased =  37.13


## Multiprocessing
In order to make fast optimisations we will use a multiprocessing library to fully utilise our hardware.

In [22]:
uid = 0

In [23]:
base_task = {
        "job_id":0,
        "train_fp":"temp/retrieval_train",
        "test_fp":"temp/retrieval_test",
        "src_ext":".desc",
        "tgt_ext":".code",
        "hyps":{
            "b":0.75,
            "k1":1.2
        },
        "env":pyndri.TFIDFQueryEnvironment(index, k1=1.2, b=0.75),
        "folds":1,
        "fold":1
    }

In [24]:
def TFIDF_task_generator(task, k1s=[1.0, 1.5, 2.0], bs=[0.0,0.25,0.5,0.75,1.0]):
    global uid
    tasks = [] 
    for k1 in k1s:
        for b in bs:
            new_task = dict(task)
            new_task["hyps"] = {"b":b, "k1":k1}
            new_task["env"] = pyndri.TFIDFQueryEnvironment(index, k1=k1, b=b)
            new_task["job_id"] = uid
            uid += 1
            tasks.append(new_task)
    return tasks

In [25]:
tasks = []

In [26]:
tasks += TFIDF_task_generator(base_task)

In [27]:
tasks

[{'env': <pyndri.TFIDFQueryEnvironment at 0x7fc169bf0900>,
  'fold': 1,
  'folds': 1,
  'hyps': {'b': 0.0, 'k1': 1.0},
  'job_id': 0,
  'src_ext': '.desc',
  'test_fp': 'temp/retrieval_test',
  'tgt_ext': '.code',
  'train_fp': 'temp/retrieval_train'},
 {'env': <pyndri.TFIDFQueryEnvironment at 0x7fc142e1f168>,
  'fold': 1,
  'folds': 1,
  'hyps': {'b': 0.25, 'k1': 1.0},
  'job_id': 1,
  'src_ext': '.desc',
  'test_fp': 'temp/retrieval_test',
  'tgt_ext': '.code',
  'train_fp': 'temp/retrieval_train'},
 {'env': <pyndri.TFIDFQueryEnvironment at 0x7fc142e1f0d8>,
  'fold': 1,
  'folds': 1,
  'hyps': {'b': 0.5, 'k1': 1.0},
  'job_id': 2,
  'src_ext': '.desc',
  'test_fp': 'temp/retrieval_test',
  'tgt_ext': '.code',
  'train_fp': 'temp/retrieval_train'},
 {'env': <pyndri.TFIDFQueryEnvironment at 0x7fc142e1f1f8>,
  'fold': 1,
  'folds': 1,
  'hyps': {'b': 0.75, 'k1': 1.0},
  'job_id': 3,
  'src_ext': '.desc',
  'test_fp': 'temp/retrieval_test',
  'tgt_ext': '.code',
  'train_fp': 'temp/retri

In [28]:
def train_eval(task):
    print(task["hyps"], task["job_id"])
    linecache.clearcache()
    env = task["env"]
    
    out_file_name = "temp/retrieval_predictions_fold:{}-{}_k1_{}_b_{}".format(
                                                                            task["fold"],
                                                                            task["folds"],
                                                                            task["hyps"]["k1"],
                                                                            task["hyps"]["b"])
    with open(task["test_fp"] + ".desc", "r") as f, open(out_file_name + ".code", "w") as out:
        lines = f.readlines()
        for line in lines:
            result = env.query(pattern.sub(" ", line), results_requested=1)
            if result != ():
                out.write(linecache.getline(task["train_fp"] + ".code",result[0][0]))
            else:
                out.write("\n")
                
    bleu = 100 * bleu_hook.bleu_wrapper(task["test_fp"] + ".code", out_file_name + ".code",
                                          case_sensitive=False)
                
    task["BLEU"] = bleu
    measures.append(task)

In [29]:
with Manager() as manager:
    measures = manager.list()
    processes = []
    for task in tasks:
        p = Process(target=train_eval, args=(task,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
        measures = list(measures)

{'k1': 1.0, 'b': 0.0} 0
{'k1': 1.0, 'b': 0.5} 2
{'k1': 1.0, 'b': 0.75} 3
{'k1': 1.0, 'b': 0.25} 1
{'k1': 1.0, 'b': 1.0} 4
{'k1': 1.5, 'b': 0.0} 5
{'k1': 1.5, 'b': 0.25} 6
{'k1': 1.5, 'b': 0.75} 8
{'k1': 1.5, 'b': 1.0} 9
{'k1': 1.5, 'b': 0.5} 7
{'k1': 2.0, 'b': 0.0} 10
{'k1': 2.0, 'b': 0.25} 11
{'k1': 2.0, 'b': 0.5} 12
{'k1': 2.0, 'b': 0.75} 13
{'k1': 2.0, 'b': 1.0} 14


In [30]:
[m["job_id"] for m in measures]

[9, 7, 14, 3, 2, 4, 1, 13, 8, 10, 5, 6, 11, 12, 0]

In [34]:
[(m["hyps"], m["BLEU"]) for m in measures]

[({'b': 1.0, 'k1': 1.5}, 32.57192671298981),
 ({'b': 0.5, 'k1': 1.5}, 33.298951387405396),
 ({'b': 1.0, 'k1': 2.0}, 30.727407336235046),
 ({'b': 0.75, 'k1': 1.0}, 37.75593340396881),
 ({'b': 0.5, 'k1': 1.0}, 33.298060297966),
 ({'b': 1.0, 'k1': 1.0}, 35.48528254032135),
 ({'b': 0.25, 'k1': 1.0}, 27.687522768974304),
 ({'b': 0.75, 'k1': 2.0}, 35.04375219345093),
 ({'b': 0.75, 'k1': 1.5}, 36.403024196624756),
 ({'b': 0.0, 'k1': 2.0}, 20.81363797187805),
 ({'b': 0.0, 'k1': 1.5}, 21.450144052505493),
 ({'b': 0.25, 'k1': 1.5}, 27.032572031021118),
 ({'b': 0.25, 'k1': 2.0}, 26.073500514030457),
 ({'b': 0.5, 'k1': 2.0}, 32.88987576961517),
 ({'b': 0.0, 'k1': 1.0}, 22.978107631206512)]

In [32]:
len(measures)

15