In [36]:
import pdb
import json
import glob
import pickle
import tokenize
from io import StringIO
import numpy as np
import sys
from collections import Counter

In [37]:
def load_data(file_path):
    """Unpickle and return the object stored in the file at ``file_path``.

    The file is opened in binary mode and closed before returning.

    NOTE(review): ``pickle.load`` can execute arbitrary code while loading,
    so this should only be pointed at trusted files.
    """
    with open(file_path, 'rb') as pickle_file:
        return pickle.load(pickle_file)

# Filtering blank lines (`\n`) from code and intent

In [38]:
def filter_newline(data, mode='nl', casefolding=True):
    """Remove blank lines from each example and drop examples left empty.

    Every item of ``data`` is read by position: ``item[0]`` supplies the
    target lines, ``item[1]`` the source-code lines and ``item[2]`` the
    natural-language lines. A line is blank when it equals ``'\\n'`` or is
    empty after ``strip()``.

    Parameters:
        data: iterable of indexable items with at least three line lists.
        mode: accepted for compatibility; it is not read by this function.
        casefolding: when true, the kept natural-language lines are
            lower-cased.

    Returns:
        A list of ``(src_code, src_nl, tgt)`` tuples, containing only the
        examples where all three lists are non-empty after filtering.
    """
    def _non_blank(lines):
        # Keep exactly the lines the blank test does not reject.
        return [ln for ln in lines if not (ln == '\n' or ln.strip() == '')]

    kept_examples = []
    for example in data:
        target_lines = _non_blank(example[0])

        nl_lines = _non_blank(example[2])
        if casefolding:
            nl_lines = [ln.lower() for ln in nl_lines]

        code_lines = _non_blank(example[1])

        # Note the reordering: output is (code, nl, target).
        if len(code_lines) and len(nl_lines) and len(target_lines):
            kept_examples.append((code_lines, nl_lines, target_lines))

    return kept_examples

# Filtering lines with non-ASCII characters from code and intent

In [39]:
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty ``s`` yields True.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

In [49]:
def filter_ascii_data(data):
    """Drop non-ASCII lines from each example and discard emptied examples.

    Each item is read by position as ``(code_src, intent, code_tgt)``.
    Within every field, only the lines for which ``is_ascii`` returns True
    are kept. An example is kept only if all three fields still contain at
    least one line.

    Returns:
        A list of ``(code_src, intent, code_tgt)`` tuples.
    """
    def _ascii_only(lines):
        return [ln for ln in lines if is_ascii(ln)]

    surviving = []
    for example in data:
        target_code = _ascii_only(example[2])
        source_code = _ascii_only(example[0])
        nl_intent = _ascii_only(example[1])

        if source_code and nl_intent and target_code:
            surviving.append((source_code, nl_intent, target_code))
    return surviving

# Filtering examples to at most a given number of lines (both code and intent)

In [41]:
def lines_reduced_filter(length, data):
    """Keep the examples whose three fields each have 1..``length`` entries.

    Parameters:
        length: inclusive upper bound on the size of each field.
        data: iterable of items whose first three positions support ``len``.

    Returns:
        A list of the original items (not copies) that pass the size check.
    """
    return [
        example
        for example in data
        if all(0 < len(example[field]) <= length for field in range(3))
    ]

# Filtering based on Pandas functions

In [42]:
# Load the collection that contains_pandas_api iterates over; each entry is
# prefixed with '.' and searched for in a code line, so entries are
# presumably pandas function-name strings (verify against the pickle).
# The path is relative to the notebook's working directory.
with open('pickles/pandas_fns_names.pkl', 'rb') as fp:
    pandas_fns = pickle.load(fp)

In [43]:
def contains_pandas_api(codeline, api_names=None):
    """Report whether ``codeline`` contains ``'.' + name`` for any API name.

    Parameters:
        codeline: the line of code to search.
        api_names: optional iterable of name strings to look for. When
            omitted, the module-level ``pandas_fns`` collection is used, so
            existing single-argument callers behave as before.

    Returns:
        True if at least one ``'.' + name`` occurs in ``codeline``,
        otherwise False.

    Note:
        This is a plain substring test, not a token match: a name such as
        ``head`` also matches ``.header``, and any attribute access that
        shares a name with an entry matches regardless of the object.
    """
    if api_names is None:
        # Fall back to the notebook-level list loaded from the pickle.
        api_names = pandas_fns
    return any('.' + name in codeline for name in api_names)

In [44]:
def pandas_fn_name_filter(data):
    """Keep the examples whose third field has a line matching a pandas API.

    For each item, the lines in ``item[2]`` are checked with
    ``contains_pandas_api``; the item is kept as soon as one line matches.

    Returns:
        A list of the original items (not copies) that matched.
    """
    return [
        item
        for item in data
        # A generator lets any() stop at the first matching line instead of
        # scanning every line of the example first.
        if any(contains_pandas_api(line) for line in item[2])
    ]

# Chop every line to a maximum fixed length

In [45]:
def chop_data(data, length):
    """Truncate every line of every example to at most ``length`` characters.

    Each item is read by position as ``(code_src, intent, code_tgt)``; every
    line in each field is sliced with ``line[:length]``. No example is
    dropped and the input is not modified.

    Returns:
        A list of ``(code_src, intent, code_tgt)`` tuples of truncated lines.
    """
    def _truncate(lines):
        return [ln[:length] for ln in lines]

    return [
        (_truncate(example[0]), _truncate(example[1]), _truncate(example[2]))
        for example in data
    ]

In [46]:
# Load the raw dataset and strip blank lines from every example.
# NOTE(review): the path is an absolute, user-specific location; the notebook
# will not run elsewhere without editing it.
# NOTE(review): filter_newline never reads its `mode` argument, so
# mode='nl' has no effect here.
pandas_data = load_data('/home/anushap/Code-Generation-Old/pandas_context_dataset_5years_no_dups.pkl')
print (len(pandas_data), ' is original raw data')
line_stripped_data = filter_newline(pandas_data, mode='nl')

126084  is original raw data


In [47]:
# Inspect the first cleaned example, a (src_code, src_nl, tgt) tuple.
# NOTE(review): the saved output of this cell shows what looks like an API
# key taken from the dataset; consider clearing outputs before sharing.
line_stripped_data[0]

(["API_KEY = 'hEerZVQ4FJm8y725RJsB'"],
 ["keep in mind that the json responses you will be getting from the api map almost one-to-one to python's dictionaries. unfortunately, they can be very nested, so make sure you read up on indexing dictionaries in the documentation provided above."],
 ['import requests\n'])

In [50]:
# Remove every line that contains a non-ASCII character; examples left with
# an empty field are dropped.
ascii_filtered_data = filter_ascii_data(line_stripped_data)
print (len(ascii_filtered_data), ' is after ascii filtering')

111728  is after ascii filtering


In [51]:
# Inspect the second example after ASCII filtering: (code_src, intent, code_tgt).
ascii_filtered_data[1]

(['import requests\n'],
 ["keep in mind that the json responses you will be getting from the api map almost one-to-one to python's dictionaries. unfortunately, they can be very nested, so make sure you read up on indexing dictionaries in the documentation provided above."],
 ['r= requests.get("https://www.quandl.com/api/v3/datasets/FSE/AFX_X/data.json")\n',
  'json_data = r.json()\n',
  'json_data["dataset_data"]["data"][0]'])

In [31]:
#lines_reduced_data = lines_reduced_filter(5, ascii_filtered_data)
#print (len(lines_reduced_data), ' is after num of lines reduced')

17170  is after num of lines reduced


In [52]:
# Keep only the examples whose target code (third field) has at least one
# line that matches an entry of pandas_fns.
pandas_fn_filtered_data = pandas_fn_name_filter(ascii_filtered_data)
print (len(pandas_fn_filtered_data), ' is after filtering for pandas fns')

95298  is after filtering for pandas fns


In [53]:
# Inspect the first example that passed the pandas-API filter.
# NOTE(review): the saved output is a `requests` snippet with no visible
# pandas call, which suggests the substring match in contains_pandas_api is
# permissive - TODO confirm the filter is as strict as intended.
pandas_fn_filtered_data[0]

(['import requests\n'],
 ["keep in mind that the json responses you will be getting from the api map almost one-to-one to python's dictionaries. unfortunately, they can be very nested, so make sure you read up on indexing dictionaries in the documentation provided above."],
 ['r= requests.get("https://www.quandl.com/api/v3/datasets/FSE/AFX_X/data.json")\n',
  'json_data = r.json()\n',
  'json_data["dataset_data"]["data"][0]'])

In [None]:
#chopped_data = chop_data(pandas_fn_filtered_data, 150)
#print (len(chopped_data), ' is after chopping every line for target size (char)')

In [54]:
# Derive two paired datasets from the (code_src, intent, code_tgt) examples:
# code_src -> code_tgt and intent -> code_tgt.
pandas_fn_filtered_data_code2code = [
    (example[0], example[2]) for example in pandas_fn_filtered_data
]
pandas_fn_filtered_data_nl2code = [
    (example[1], example[2]) for example in pandas_fn_filtered_data
]

In [55]:
# Persist the code_src -> code_tgt pairs. The file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes,
# rather than being left to garbage collection.
# NOTE(review): absolute, user-specific output path.
with open('/home/anushap/Code-Generation-Old/pandas_context_dataset_5years_no_dups_filtered_code2code.pkl', 'wb') as code2code_file:
    pickle.dump(pandas_fn_filtered_data_code2code, code2code_file)

In [56]:
# Persist the intent -> code_tgt pairs. The file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes,
# rather than being left to garbage collection.
# NOTE(review): absolute, user-specific output path.
with open('/home/anushap/Code-Generation-Old/pandas_context_dataset_5years_no_dups_filtered_nl2code.pkl', 'wb') as nl2code_file:
    pickle.dump(pandas_fn_filtered_data_nl2code, nl2code_file)