                 ____       __       _                 __   ______            _ __      __     __
                / __ \___  / /______(_)__ _   ______ _/ /  / ____/___  ____  (_) /___  / /_   / /
               / /_/ / _ \/ __/ ___/ / _ \ | / / __ `/ /  / /   / __ \/ __ \/ / / __ \/ __/  / / 
              / _, _/  __/ /_/ /  / /  __/ |/ / /_/ / /  / /___/ /_/ / /_/ / / / /_/ / /_   /_/  
             /_/ |_|\___/\__/_/  /_/\___/|___/\__,_/_/   \____/\____/ .___/_/_/\____/\__/  (_)   
                                                                   /_/                           
                                       (learning every nanosecond)

## Getting code
The goal of this algorithm is to retrieve a relevant code snippet given an English description and a series of already defined code/description pairs.
![ret](../images/retrievalHighLevel.png)

In [59]:
import linecache
import pyndri
import os
import sys
import re
import numpy as np
np.random.seed(0)
import re, string, timeit
from colored import fg, bg, attr
import warnings
warnings.simplefilter('ignore')
from tensor2tensor.utils import bleu_hook
from multiprocessing import Process, Manager

## Indri 
Indri is the retrieval engine that I am currently using since it has a nice interface with python and has some of the algorithms I need.

Indri takes files in an XML format. Sentence pairs are usually stored line by line in a file. So we will need to convert from single line to formatted XML.

### Datasets 
We will currently make a full explanation for only one dataset: Django. This is because it is relatively small (18k sentences) and clean. Further descriptions and analysis are found in other notebooks in this directory.

In [25]:
def show_sample(fp, src_ext=".src", tgt_ext=".tgt", lines=[3,21,80,99]):
    linecache.clearcache()
    for l in lines:
        print("LINE: {} \nSOURCE:    {} \nTARGET:     {}\n".format(l, 
                                                                   linecache.getline(fp+src_ext, l), 
                                                                   linecache.getline(fp+tgt_ext, l)))

In [26]:
django_fp = "../datasets/django/all"
show_sample(django_fp, src_ext=".desc", tgt_ext=".code", lines=[13,14])

LINE: 13 
SOURCE:      define the function get_cache with backend and dictionary pair of elements kwargs as arguments.
 
TARGET:         def get_cache ( backend , ** kwargs ) :


LINE: 14 
 




In [6]:
!head -5 ../datasets/django/all.desc 

  from threading import local into default name space.
  from django.conf import settings into default name space.
  from django.core import signals into default name space.


In [7]:
!head -5 ../datasets/django/all.code

 from threading import local
  from django . conf import settings
 from django . core import signals


In [40]:
dirName = "temp"
 
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " Created ") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  temp  Created 


### Dataset and train / test split
Copy the full dataset to the temp folder. We then split the data into a training and testing set at around 90% / 10%

In [41]:
train_ratio = 0.9 # this means 90% of the data will be used for training, thus 10% for testing
num_samples = sum(1 for line in open(django_fp + ".desc"))
train_cutoff = int(num_samples * train_ratio)

lines = np.arange(num_samples)
np.random.shuffle(lines)

train_lines = lines[:train_cutoff]
test_lines = lines[train_cutoff:]

In [42]:
train_fp = "temp/retrieval_train"
test_fp = "temp/retrieval_test"

##### Train split for .desc and .code

In [43]:
with open(train_fp + ".desc", "w") as out:
    for l in train_lines:
        src = linecache.getline(django_fp + ".desc", l)
        out.write(src)

In [44]:
with open(train_fp + ".code", "w") as out:
    for l in train_lines:
        src = linecache.getline(django_fp + ".code", l)
        out.write(src)

##### Test split for .desc and .code

In [45]:
with open(test_fp + ".desc", "w") as out:
    for l in test_lines:
        src = linecache.getline(django_fp + ".desc", l)
        out.write(src)

In [46]:
with open(test_fp + ".code", "w") as out:
    for l in test_lines:
        src = linecache.getline(django_fp + ".code", l)
        out.write(src)

### Convert to TrecText format

In [47]:
with open(train_fp + ".desc", "r") as f, open("temp/train_desc.trectext", "w") as out:
    count = 0
    while True:
        line = f.readline()
        
        if not line :
            break
            
        out.write("<DOC>\n  <DOCNO>{}</DOCNO>\n  <TEXT>\n{}  </TEXT>\n</DOC>\n".format(count, line))
        count += 1

### Create the index with indri
To create an index we need to supply Indri with a parameter file specifying how to handle each document. Indri will then generate an index folder with is fast to query.

In [48]:
with open("temp/IndriBuildIndex.conf", "w") as out:
    conf = """
<parameters>
<index>temp/django_index/</index>
<memory>1024M</memory>
<storeDocs>true</storeDocs>
<corpus><path>temp/train_desc.trectext</path><class>trectext</class></corpus>
<stemmer><name>krovetz</name></stemmer>
</parameters>"""
    
    out.write(conf)
    

In [49]:
!IndriBuildIndex temp/IndriBuildIndex.conf

kstem_add_table_entry: Duplicate word emeritus will be ignored.
0:00: Created repository temp/django_index/
0:00: Opened temp/train_desc.trectext
0:06: Documents parsed: 9401 Documents indexed: 9401
0:06: Closed temp/train_desc.trectext
0:06: Closing index
0:06: Finished


In [50]:
index = pyndri.Index("temp/django_index/")
env = pyndri.TFIDFQueryEnvironment(index, k1=1.2, b=0.75)

## Query example

In [52]:
results = env.query('error handler', results_requested=5)

In [53]:
show_sample(train_fp, src_ext=".desc", tgt_ext=".code", lines=[doc[0] for doc in results])

LINE: 1718 
SOURCE:      define the method resolve_error_handler with arguments self and view_type.
 
TARGET:        def resolve_error_handler ( self , view_type ) :


LINE: 6919 
SOURCE:      substitute self._upload_handlers for handlers.
 
TARGET:      handlers = self . _upload_handlers


LINE: 2756 
SOURCE:      substitute upload_handlers for self._upload_handlers.
 
TARGET:       self . _upload_handlers = upload_handlers


LINE: 2253 
SOURCE:      for every handler in self._upload_handlers,
 
TARGET:                  for handler in self . _upload_handlers :


LINE: 1925 
SOURCE:      substitute _upload_handlers for self.__upload_handlers.
 
TARGET:      self . _upload_handlers = upload_handlers




In [54]:
top_code = [linecache.getline(django_fp + ".code", doc[0]) for doc in results]

We need to remove punctuation from input query strings so that Indri accepts them.

In [75]:
pattern = re.compile(r'[^\w\s]')

In [56]:
linecache.clearcache()
with open(test_fp + ".desc", "r") as f, open("temp/retrieval_predictions" + ".code", "w") as out:
    lines = f.readlines()
    l = 1
    for line in lines:
        result = env.query(p.sub(" ", line), results_requested=1)
        if result != ():
            out.write(linecache.getline(train_fp + ".code",result[0][0]))
        else:
            out.write("\n")
            print("LINE: {} \n{}{}QUERY:{}    {}\nSANITIZED QUERY      {}\n{}{}PRED DESCRIPTION{}:   ######### NO PREDICTION ##########\n".format(
                l, 
                fg(24), 
                bg(85),
                attr(0),
                line,
                fg(24), 
                bg(217),
                attr(0),
                line.translate(str.maketrans('', '', string.punctuation))))
        if l % 500 == 1:
            print("LINE: {} \n{}{}QUERY:{}    {}\n{}{}PRED DESCRIPTION:{}    {} \n{}{}PRED CODE:{}     {}\n{}{}TRUTH:{}    {}\n".format(
                l, 
                fg(24), 
                bg(85),
                attr(0),
                line, 
                fg(24), 
                bg(153),
                attr(0),
                linecache.getline(train_fp + ".desc", result[0][0]), 
                fg(24), 
                bg(153),
                attr(0),
                linecache.getline(train_fp + ".code", result[0][0]),
                fg(24), 
                bg(85),
                attr(0),
                linecache.getline(test_fp + ".code", l)))
        l += 1

LINE: 1 
[38;5;24m[48;5;85mQUERY:[0m      if not,

[38;5;24m[48;5;153mPRED DESCRIPTION:[0m      if not,
 
[38;5;24m[48;5;153mPRED CODE:[0m        else :

[38;5;24m[48;5;85mTRUTH:[0m      else :


LINE: 501 
[38;5;24m[48;5;85mQUERY:[0m      define the method __init__ with, self, server, params, library, value_not_found_exception as arguments.

[38;5;24m[48;5;153mPRED DESCRIPTION:[0m      call method __init__ from the base class of the class PyLibMCCache with arguments: server, params,
 
[38;5;24m[48;5;153mPRED CODE:[0m      super ( PyLibMCCache , self ) . __init__ ( server , params ,  library = pylibmc ,  value_not_found_exception = pylibmc . NotFound )

[38;5;24m[48;5;85mTRUTH:[0m          def __init__ ( self , server , params , library , value_not_found_exception ) :


LINE: 1001 
[38;5;24m[48;5;85mQUERY:[0m      call the method self._css.keys, sort the result and substitute it for media.

[38;5;24m[48;5;153mPRED DESCRIPTION:[0m      sort elements of kwds

### Calculating BLEU Score
We use a librarry from Google's tensor2tensor library. This way we ensure the methods are correct by being actively maintained. The method takes two files and compares them line by line.

The only disadvantage is that it doesn't work with TF2.0 yet. Curently using TF1.14.0

In [57]:
bleu = 100 * bleu_hook.bleu_wrapper("temp/retrieval_test.code", "temp/retrieval_predictions.code",
                                          case_sensitive=False)
print("BLEU_uncased = %6.2f" % bleu)

BLEU_uncased =  33.66


## Multiprocessing
In order to make fast optimisations we will use a multiprocessing library to fully utilise our hardware.

In [79]:
tasks = [
    {
        "job_id":0,
        "train_fp":"temp/retrieval_train",
        "test_fp":"temp/retrieval_test",
        "src_ext":".desc",
        "tgt_ext":".code",
        "hyps":{
            "b":0.75,
            "k1":1.2
        },
        "index":index,
        "folds":1,
        "fold":1
    },
    {
        "job_id":0,
        "train_fp":"temp/retrieval_train",
        "test_fp":"temp/retrieval_test",
        "src_ext":".desc",
        "tgt_ext":".code",
        "hyps":{
            "b":0.75,
            "k1":1.3
        },
        "index":index,
        "folds":1,
        "fold":1
    },
    {
        "job_id":0,
        "train_fp":"temp/retrieval_train",
        "test_fp":"temp/retrieval_test",
        "src_ext":".desc",
        "tgt_ext":".code",
        "hyps":{
            "b":0.75,
            "k1":1.4
        },
        "index":index,
        "folds":1,
        "fold":1
    }
]

In [80]:
def train_eval(task):
    linecache.clearcache()
    env = pyndri.TFIDFQueryEnvironment(task["index"], k1=task["hyps"]["k1"], b=task["hyps"]["b"])
    
    out_file_name = "temp/retrieval_predictions_fold:{}-{}_k1:{}_b:{}".format(
                                                                            task["fold"],
                                                                            task["folds"],
                                                                            task["hyps"]["k1"],
                                                                            task["hyps"]["b"])
    with open(task["test_fp"] + ".desc", "r") as f, open(out_file_name + ".code", "w") as out:
        lines = f.readlines()
        for line in lines:
            result = env.query(pattern.sub(" ", line), results_requested=1)
            if result != ():
                out.write(linecache.getline(task["train_fp"] + ".code",result[0][0]))
            else:
                out.write("\n")
                
    bleu = 100 * bleu_hook.bleu_wrapper(task["test_fp"] + ".code", out_file_name + ".code",
                                          case_sensitive=False)
                
    task["BLEU"] = bleu
    measures.append(task)

In [81]:
with Manager() as manager:
    measures = manager.list()
    processes = []
    for task in tasks:
        p = Process(target=train_eval, args=(task,))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
        measures = list(measures)

In [82]:
measures

[{'BLEU': 33.36871564388275,
  'fold': 1,
  'folds': 1,
  'hyps': {'b': 0.75, 'k1': 1.3},
  'index': <pyndri.Index of 0 documents>,
  'job_id': 0,
  'src_ext': '.desc',
  'test_fp': 'temp/retrieval_test',
  'tgt_ext': '.code',
  'train_fp': 'temp/retrieval_train'},
 {'BLEU': 33.23887288570404,
  'fold': 1,
  'folds': 1,
  'hyps': {'b': 0.75, 'k1': 1.4},
  'index': <pyndri.Index of 0 documents>,
  'job_id': 0,
  'src_ext': '.desc',
  'test_fp': 'temp/retrieval_test',
  'tgt_ext': '.code',
  'train_fp': 'temp/retrieval_train'},
 {'BLEU': 33.663979172706604,
  'fold': 1,
  'folds': 1,
  'hyps': {'b': 0.75, 'k1': 1.2},
  'index': <pyndri.Index of 0 documents>,
  'job_id': 0,
  'src_ext': '.desc',
  'test_fp': 'temp/retrieval_test',
  'tgt_ext': '.code',
  'train_fp': 'temp/retrieval_train'}]