                 ____       __       _                 __   ______            _ __      __     __
                / __ \___  / /______(_)__ _   ______ _/ /  / ____/___  ____  (_) /___  / /_   / /
               / /_/ / _ \/ __/ ___/ / _ \ | / / __ `/ /  / /   / __ \/ __ \/ / / __ \/ __/  / / 
              / _, _/  __/ /_/ /  / /  __/ |/ / /_/ / /  / /___/ /_/ / /_/ / / / /_/ / /_   /_/  
             /_/ |_|\___/\__/_/  /_/\___/|___/\__,_/_/   \____/\____/ .___/_/_/\____/\__/  (_)   
                                                                   /_/                           
                                       (learning every nanosecond)

## Getting code
The goal of this algorithm is to retrieve a relevant code snippet given an English description and a series of already defined code/description pairs.
![ret](../images/retrievalHighLevel.png)

In [32]:
import linecache
import pyndri
import os
import sys
import re
import numpy as np
np.random.seed(0)
import re, string, timeit
from colored import fg, bg, attr
import warnings
warnings.simplefilter('ignore')

## Indri 
Indri is the retrieval engine that I am currently using since it has a nice interface with python and has some of the algorithms I need.

Indri takes files in an XML format. Sentence pairs are usually stored line by line in a file. So we will need to convert from single line to formatted XML.

### Datasets 
We will currently make a full explanation for only one dataset: Django. This is because it is relatively small (18k sentences) and clean. Further descriptions and analysis are found in other notebooks in this directory.

In [25]:
def show_sample(fp, src_ext=".src", tgt_ext=".tgt", lines=[3,21,80,99]):
    linecache.clearcache()
    for l in lines:
        print("LINE: {} \nSOURCE:    {} \nTARGET:     {}\n".format(l, 
                                                                   linecache.getline(fp+src_ext, l), 
                                                                   linecache.getline(fp+tgt_ext, l)))

In [26]:
django_fp = "../datasets/django/all"
show_sample(django_fp, src_ext=".desc", tgt_ext=".code", lines=[13,14])

LINE: 13 
SOURCE:      define the function get_cache with backend and dictionary pair of elements kwargs as arguments.
 
TARGET:         def get_cache ( backend , ** kwargs ) :


LINE: 14 
 




In [6]:
!head -5 ../datasets/django/all.desc 

  from threading import local into default name space.
  from django.conf import settings into default name space.
  from django.core import signals into default name space.


In [7]:
!head -5 ../datasets/django/all.code

 from threading import local
  from django . conf import settings
 from django . core import signals


In [11]:
dirName = "temp"
 
try:
    # Create target Directory
    os.mkdir(dirName)
    print("Directory " , dirName ,  " Created ") 
except FileExistsError:
    print("Directory " , dirName ,  " already exists")

Directory  temp  Created 


### Dataset and train / test split
Copy the full dataset to the temp folder. We then split the data into a training and testing set at around 90% / 10%

In [12]:
train_ratio = 0.9 # this means 90% of the data will be used for training, thus 10% for testing
num_samples = sum(1 for line in open(django_fp + ".desc"))
train_cutoff = int(num_samples * train_ratio)

lines = np.arange(num_samples)
np.random.shuffle(lines)

train_lines = lines[:train_cutoff]
test_lines = lines[train_cutoff:]

In [13]:
train_fp = "temp/retrieval_train"
test_fp = "temp/retrieval_test"

##### Train split for .desc and .code

In [14]:
with open(train_fp + ".desc", "w") as out:
    for l in train_lines:
        src = linecache.getline(django_fp + ".desc", l)
        out.write(src)

In [15]:
with open(train_fp + ".code", "w") as out:
    for l in train_lines:
        src = linecache.getline(django_fp + ".code", l)
        out.write(src)

##### Test split for .desc and .code

In [16]:
with open(test_fp + ".desc", "w") as out:
    for l in test_lines:
        src = linecache.getline(django_fp + ".desc", l)
        out.write(src)

In [17]:
with open(test_fp + ".code", "w") as out:
    for l in test_lines:
        src = linecache.getline(django_fp + ".code", l)
        out.write(src)

### Convert to TrecText format

In [18]:
with open(train_fp + ".desc", "r") as f, open("temp/train_desc.trectext", "w") as out:
    count = 0
    while True:
        line = f.readline()
        
        if not line :
            break
            
        out.write("<DOC>\n  <DOCNO>{}</DOCNO>\n  <TEXT>\n{}  </TEXT>\n</DOC>\n".format(count, line))
        count += 1

### Create the index with indri
To create an index we need to supply Indri with a parameter file specifying how to handle each document. Indri will then generate an index folder with is fast to query.

In [19]:
with open("temp/IndriBuildIndex.conf", "w") as out:
    conf = """
<parameters>
<index>temp/django_index/</index>
<memory>1024M</memory>
<storeDocs>true</storeDocs>
<corpus><path>temp/train_desc.trectext</path><class>trectext</class></corpus>
<stemmer><name>krovetz</name></stemmer>
</parameters>"""
    
    out.write(conf)
    

In [20]:
!IndriBuildIndex temp/IndriBuildIndex.conf

kstem_add_table_entry: Duplicate word emeritus will be ignored.
0:00: Created repository temp/django_index/
0:00: Opened temp/train_desc.trectext
0:08: Documents parsed: 16923 Documents indexed: 16923
0:08: Closed temp/train_desc.trectext
0:08: Closing index
0:08: Finished


In [21]:
index = pyndri.Index("temp/django_index/")
env = pyndri.TFIDFQueryEnvironment(index)

## Query example

In [22]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [23]:
results = env.query('error handler', results_requested=5)

In [27]:
show_sample(train_fp, src_ext=".desc", tgt_ext=".code", lines=[doc[0] for doc in results])

LINE: 16012 
SOURCE:      for every handler in handlers,
 
TARGET:          for handler in handlers :


LINE: 10339 
SOURCE:      for every handler in handlers,
 
TARGET:                               for handler in handlers :


LINE: 8784 
SOURCE:      substitute self._upload_handlers for handlers.
 
TARGET:      handlers = self . _upload_handlers


LINE: 7760 
SOURCE:      substitute upload_handlers for self._upload_handlers.
 
TARGET:       self . _upload_handlers = upload_handlers


LINE: 4361 
SOURCE:      substitute _upload_handlers for self.__upload_handlers.
 
TARGET:      self . _upload_handlers = upload_handlers




In [28]:
top_code = [linecache.getline(django_fp + ".code", doc[0]) for doc in results]

We need to remove punctuation from input query strings so that Indri accepts them.

In [29]:
p = re.compile(r'[^\w\s]')

In [30]:
linecache.clearcache()
with open(test_fp + ".desc", "r") as f, open("temp/retrieval_predictions" + ".code", "w") as out:
    lines = f.readlines()
    l = 1
    for line in lines:
        result = env.query(p.sub(" ", line), results_requested=1)
        if result != ():
            out.write(linecache.getline(train_fp + ".code",result[0][0]))
        else:
            out.write("\n")
            print("LINE: {} \n{}{}QUERY:{}    {}\nSANITIZED QUERY      {}\n{}{}PRED DESCRIPTION{}:   ######### NO PREDICTION ##########\n".format(
                l, 
                fg(24), 
                bg(85),
                attr(0),
                line,
                fg(24), 
                bg(217),
                attr(0),
                line.translate(str.maketrans('', '', string.punctuation))))
        if l % 500 == 1:
            print("LINE: {} \n{}{}QUERY:{}    {}\n{}{}PRED DESCRIPTION:{}    {} \n{}{}PRED CODE:{}     {}\n{}{}TRUTH:{}    {}\n".format(
                l, 
                fg(24), 
                bg(85),
                attr(0),
                line, 
                fg(24), 
                bg(153),
                attr(0),
                linecache.getline(train_fp + ".desc", result[0][0]), 
                fg(24), 
                bg(153),
                attr(0),
                linecache.getline(train_fp + ".code", result[0][0]),
                fg(24), 
                bg(85),
                attr(0),
                linecache.getline(test_fp + ".code", l)))
        l += 1

LINE: 1 
[38;5;24m[48;5;85mQUERY:[0m      if p_pattern starts with a string '^',

[38;5;24m[48;5;153mPRED DESCRIPTION:[0m      if token_string starts with VARIABLE_TAG_START,
 
[38;5;24m[48;5;153mPRED CODE:[0m                   if token_string . startswith ( VARIABLE_TAG_START ) :

[38;5;24m[48;5;85mTRUTH:[0m     if p_pattern . startswith ( '^' ) :


LINE: 501 
[38;5;24m[48;5;85mQUERY:[0m      return value.

[38;5;24m[48;5;153mPRED DESCRIPTION:[0m      return value.
 
[38;5;24m[48;5;153mPRED CODE:[0m               return value

[38;5;24m[48;5;85mTRUTH:[0m      return value


LINE: 1001 
[38;5;24m[48;5;85mQUERY:[0m      define the method _isdst with 2 arguments self and dt.

[38;5;24m[48;5;153mPRED DESCRIPTION:[0m      tt is a tuple with 9 elements: dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second, result of the method dt.weekday,
 
[38;5;24m[48;5;153mPRED CODE:[0m               tt = ( dt . year , dt . month , dt . day ,  dt . hour , dt . minute 

In [37]:
from tensor2tensor.utils import bleu_hook

bleu = 100 * bleu_hook.bleu_wrapper("temp/retrieval_test.code", "temp/retrieval_predictions.code",
                                          case_sensitive=False)
print("BLEU_uncased = %6.2f" % bleu)

BLEU_uncased =  40.40
