In [1]:
import pdb
import json
import glob
import pickle
import tokenize
from io import StringIO
import numpy as np
import sys
from collections import Counter

In [2]:
def load_data(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and closed when the ``with`` block exits.
    NOTE: unpickling can execute arbitrary code; only load trusted files.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# Filtering blank lines (`\n`) from code and intent

In [3]:
def filter_newline(data, mode='nl', casefolding=True):
    """Remove blank lines from each example and drop examples with an empty field.

    Every item of ``data`` is read positionally: ``item[0]`` supplies the
    target lines, ``item[1]`` the source-code lines and ``item[2]`` the
    natural-language lines. A line is blank when it is empty after
    ``str.strip()``.

    Args:
        data: iterable of indexable items with at least three fields, each an
            iterable of strings.
        mode: accepted for compatibility; this function never reads it.
        casefolding: when true, kept natural-language lines are lower-cased.

    Returns:
        A list of ``(src_code, src_nl, tgt)`` tuples (note the field order
        differs from the input). Only examples where all three lists are
        non-empty after filtering are included.
    """
    def non_blank(lines):
        # Whitespace-only lines (including a bare '\n') strip down to ''.
        return [ln for ln in lines if ln.strip() != '']

    kept = []
    for example in data:
        target = non_blank(example[0])

        nl_lines = non_blank(example[2])
        if casefolding:
            nl_lines = [ln.lower() for ln in nl_lines]

        code_lines = non_blank(example[1])

        if code_lines and nl_lines and target:
            kept.append((code_lines, nl_lines, target))

    return kept

# Filtering non ascii characters from code and intent

In [4]:
def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128.

    An empty ``s`` yields True.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

In [5]:
def filter_ascii_data(data):
    """Keep only all-ASCII lines in each example; drop examples with an empty field.

    Lines are filtered individually with ``is_ascii``, so an example can lose
    some of its lines and still be kept.

    Args:
        data: iterable of indexable items; ``item[0]`` is read as source code,
            ``item[1]`` as intent and ``item[2]`` as target code.

    Returns:
        A list of ``(code_src, intent, code_tgt)`` tuples for the examples in
        which every field still has at least one line.
    """
    def ascii_only(lines):
        return [ln for ln in lines if is_ascii(ln)]

    kept = []
    for example in data:
        code_tgt = ascii_only(example[2])
        code_src = ascii_only(example[0])
        intent = ascii_only(example[1])

        if code_src and intent and code_tgt:
            kept.append((code_src, intent, code_tgt))
    return kept

# Keeping only examples with at most N lines in each field (source code, intent, and target code); N = 5 in this run

In [6]:
def lines_reduced_filter(length, data):
    """Return the items whose first three fields each hold 1..``length`` entries.

    Args:
        length: inclusive upper bound on ``len()`` of each field.
        data: iterable of indexable items with at least three sized fields.

    Returns:
        A list of the qualifying items, unchanged and in their original order.
    """
    return [
        example
        for example in data
        if all(0 < len(example[field]) <= length for field in (0, 1, 2))
    ]

# Filtering based on Pandas functions

In [7]:
# Load the collection of names that contains_pandas_api() searches for.
# The path is relative to the notebook's working directory.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('pickles/pandas_fns_names.pkl', 'rb') as fp:
    pandas_fns = pickle.load(fp)

In [8]:
def contains_pandas_api(codeline):
    """Return True if ``codeline`` contains ``'.' + name`` for any name in ``pandas_fns``.

    This is a plain substring test, so a name also matches when it is the
    prefix of a longer attribute. ``pandas_fns`` is read from module scope.
    """
    return any('.' + name in codeline for name in pandas_fns)

In [9]:
def pandas_fn_name_filter(data):
    """Keep the items whose third field has at least one line matching ``contains_pandas_api``.

    Args:
        data: iterable of indexable items; ``item[2]`` is an iterable of code lines.

    Returns:
        A list of the matching items, unchanged and in their original order.
    """
    kept = []
    for example in data:
        # Every line is checked (no short-circuit), as a list is built first.
        matches = [contains_pandas_api(line) for line in example[2]]
        if any(matches):
            kept.append(example)
    return kept

# Chop every line to a maximum fixed length

In [10]:
def chop_data(data, length):
    """Truncate every line of every example to at most ``length`` characters.

    Args:
        data: iterable of indexable items with three fields, each an iterable
            of sliceable lines.
        length: slice bound applied to each line as ``line[:length]``.

    Returns:
        A new list of 3-tuples of lists, field order unchanged; the input is
        not modified.
    """
    def truncate(lines):
        return [line[:length] for line in lines]

    return [
        (truncate(example[0]), truncate(example[1]), truncate(example[2]))
        for example in data
    ]

In [11]:
# Load the raw dataset (absolute, machine-specific path) and report its size.
pandas_data = load_data('/home/anushap/Code-Generation-Old/pandas_context_dataset_5years_no_dups_FINAL.pkl')
print (len(pandas_data), ' is original raw data')
# Drop blank lines from every example; filter_newline accepts `mode` but does not read it.
line_stripped_data = filter_newline(pandas_data, mode='nl')

112445  is original raw data


In [12]:
# Show the first example after blank-line filtering.
# NOTE(review): the recorded output contains an API key string from the dataset;
# consider clearing this output before sharing the notebook.
line_stripped_data[0]

(["API_KEY = 'hEerZVQ4FJm8y725RJsB'"],
 ["keep in mind that the json responses you will be getting from the api map almost one-to-one to python's dictionaries. unfortunately, they can be very nested, so make sure you read up on indexing dictionaries in the documentation provided above."],
 ['import requests\n'])

In [13]:
# Remove non-ASCII lines and drop examples left with an empty field.
ascii_filtered_data = filter_ascii_data(line_stripped_data)
print (len(ascii_filtered_data), ' is after ascii filtering')

103040  is after ascii filtering


In [14]:
# Show the second example after ASCII filtering.
ascii_filtered_data[1]

(['import requests\n'],
 ["keep in mind that the json responses you will be getting from the api map almost one-to-one to python's dictionaries. unfortunately, they can be very nested, so make sure you read up on indexing dictionaries in the documentation provided above."],
 ['r= requests.get("https://www.quandl.com/api/v3/datasets/FSE/AFX_X/data.json")\n',
  'json_data = r.json()\n',
  'json_data["dataset_data"]["data"][0]'])

In [15]:
# NOTE(review): mean and median are not used in any of the visible cells.
from statistics import mean, median

# Number of lines in each field of example 1, then the distribution of
# per-example line counts for each of the three fields.
print(len(ascii_filtered_data[1][0]), len(ascii_filtered_data[1][1]), len(ascii_filtered_data[1][2]))
print(Counter(len(i[0]) for i in ascii_filtered_data))
print(Counter(len(i[1]) for i in ascii_filtered_data))
print(Counter(len(i[2]) for i in ascii_filtered_data))

# Largest line count seen in each field.
print(max(len(i[0]) for i in ascii_filtered_data))
print(max(len(i[1]) for i in ascii_filtered_data))
print(max(len(i[2]) for i in ascii_filtered_data))

1 1 3
Counter({1: 43713, 2: 18106, 3: 9368, 4: 6210, 5: 4318, 6: 3484, 7: 2506, 8: 1957, 9: 1625, 10: 1369, 11: 990, 12: 913, 13: 760, 14: 671, 15: 557, 16: 513, 17: 421, 18: 360, 19: 336, 20: 305, 21: 297, 22: 225, 23: 188, 25: 171, 24: 167, 26: 157, 33: 130, 28: 127, 27: 126, 32: 94, 29: 90, 34: 86, 31: 79, 30: 68, 35: 66, 36: 65, 37: 59, 39: 55, 40: 48, 44: 47, 41: 45, 46: 44, 42: 42, 38: 41, 43: 38, 47: 36, 45: 34, 58: 32, 54: 24, 50: 24, 59: 24, 51: 24, 53: 24, 56: 22, 60: 22, 66: 21, 61: 21, 62: 20, 84: 19, 68: 19, 52: 19, 48: 18, 49: 18, 130: 18, 55: 18, 76: 17, 77: 17, 57: 17, 155: 16, 63: 16, 132: 15, 69: 15, 71: 14, 70: 14, 163: 14, 82: 14, 88: 14, 75: 14, 72: 14, 118: 14, 65: 13, 86: 13, 218: 13, 102: 13, 94: 12, 87: 12, 64: 12, 85: 12, 184: 11, 177: 11, 99: 11, 74: 11, 153: 11, 89: 11, 98: 11, 166: 10, 133: 10, 73: 10, 90: 10, 207: 10, 154: 10, 156: 10, 126: 10, 139: 10, 197: 9, 150: 9, 122: 9, 93: 9, 81: 9, 128: 9, 148: 9, 158: 9, 116: 9, 79: 9, 112: 9, 107: 9, 80: 9, 190:

In [16]:
# Keep only examples whose three fields each have between 1 and 5 lines.
lines_reduced_data = lines_reduced_filter(5, ascii_filtered_data)
print (len(lines_reduced_data), ' is after num of lines reduced')

65690  is after num of lines reduced


In [17]:
# Rebinds ascii_filtered_data to the line-count-filtered list. Every later cell
# that reads ascii_filtered_data therefore sees lines_reduced_data, not the
# output of filter_ascii_data; re-running earlier cells after this one will
# show different results.
ascii_filtered_data = lines_reduced_data
# Same statistics as before, now over the reduced data.
print(len(ascii_filtered_data[1][0]), len(ascii_filtered_data[1][1]), len(ascii_filtered_data[1][2]))
print(Counter(len(i[0]) for i in ascii_filtered_data))
print(Counter(len(i[1]) for i in ascii_filtered_data))
print(Counter(len(i[2]) for i in ascii_filtered_data))

print(max(len(i[0]) for i in ascii_filtered_data))
print(max(len(i[1]) for i in ascii_filtered_data))
print(max(len(i[2]) for i in ascii_filtered_data))
print(len(ascii_filtered_data))

1 1 3
Counter({1: 36532, 2: 14498, 3: 7145, 4: 4456, 5: 3059})
Counter({1: 48880, 2: 10273, 3: 3186, 4: 1911, 5: 1440})
Counter({1: 35908, 2: 14674, 3: 7461, 4: 4521, 5: 3126})
5
5
5
65690


In [18]:
# Keep examples where at least one line of the third field matches a name in
# pandas_fns. ascii_filtered_data is the line-count-filtered list at this point.
pandas_fn_filtered_data = pandas_fn_name_filter(ascii_filtered_data)
print (len(pandas_fn_filtered_data), ' is after filtering for pandas fns')

54915  is after filtering for pandas fns


In [19]:
# Show the first example that passed the pandas-name filter.
pandas_fn_filtered_data[0]

(['import requests\n'],
 ["keep in mind that the json responses you will be getting from the api map almost one-to-one to python's dictionaries. unfortunately, they can be very nested, so make sure you read up on indexing dictionaries in the documentation provided above."],
 ['r= requests.get("https://www.quandl.com/api/v3/datasets/FSE/AFX_X/data.json")\n',
  'json_data = r.json()\n',
  'json_data["dataset_data"]["data"][0]'])

In [20]:
# Truncate every line to at most 150 characters.
# NOTE(review): chopped_data is not referenced by any later visible cell; the
# de-duplication and the saved pickles are built from pandas_fn_filtered_data.
# Confirm whether the truncated data was meant to be used downstream.
chopped_data = chop_data(pandas_fn_filtered_data, 150)
print (len(chopped_data), ' is after chopping every line for target size (char)')

54915  is after chopping every line for target size (char)


In [22]:
# Show the second pandas-filtered example (this is the un-chopped data).
pandas_fn_filtered_data[1]

(["first_result.find('strong').text[0:-1]"],
 ['### we do need to know that an escape sequence represents a single character'],
 ["first_result.find('strong').text[0:-1] + ', 2017'  "])

In [27]:
# One string per example: the lines of each field are joined with single
# spaces, then the per-field strings are joined with single spaces. Used
# below to count distinct examples.
new_data = [
    ' '.join(' '.join(field) for field in example)
    for example in pandas_fn_filtered_data
]

In [28]:
# Number of examples (one joined string each).
len(new_data)

54915

In [45]:
# Number of distinct joined strings; lower than len(new_data) when duplicates exist.
len(set(new_data))

54848

In [31]:
print(len(set(new_data)))
# `data` is another name for the pandas-filtered examples (not chopped_data);
# the de-duplication cells below read from it.
data = pandas_fn_filtered_data

54848


In [32]:
# De-duplication keys, one string per example:
#   new_data1 joins fields 0 and 2 (source code + target code)
#   new_data2 joins fields 0 and 1 (source code + intent)
# Lines within a field and the two fields are all joined with single spaces.
new_data1 = [
    ' '.join(' '.join(example[k]) for k in (0, 2))
    for example in data
]
new_data2 = [
    ' '.join(' '.join(example[k]) for k in (0, 1))
    for example in data
]

In [33]:
# Both key lists have one entry per example.
print(len(new_data1))
print(len(new_data2))

54915
54915


In [34]:
# Number of distinct keys under each of the two key definitions.
print(len(set(new_data1)))
print(len(set(new_data2)))

54774
51995


In [35]:
# Position of the first occurrence of every distinct string in new_data1.
# A single pass with a dict replaces calling list.index() once per distinct
# string (each call is a linear scan, so the loop was quadratic). Dict
# insertion order also makes the ordering reproducible across runs, unlike
# iterating over a set of strings.
first_seen1 = {}
for position, text in enumerate(new_data1):
    first_seen1.setdefault(text, position)  # keeps the earliest position only

new_data_set1 = list(first_seen1)        # distinct strings, in first-seen order
index1 = list(first_seen1.values())      # index1[k] == new_data1.index(new_data_set1[k])

In [36]:
# Position of the first occurrence of every distinct string in new_data2.
# A single pass with a dict replaces calling list.index() once per distinct
# string (each call is a linear scan, so the loop was quadratic). Dict
# insertion order also makes the ordering reproducible across runs, unlike
# iterating over a set of strings.
first_seen2 = {}
for position, text in enumerate(new_data2):
    first_seen2.setdefault(text, position)  # keeps the earliest position only

new_data_set2 = list(first_seen2)        # distinct strings, in first-seen order
index2 = list(first_seen2.values())      # index2[k] == new_data2.index(new_data_set2[k])

In [37]:
# Number of first-occurrence positions under each key (index2 first, then index1).
print(len(index2),len(index1))

51995 54774


In [38]:
# Positions that are a first occurrence under BOTH keys: an example is kept only
# if no earlier example shares its (field 0 + field 2) string and none shares
# its (field 0 + field 1) string. The list order is whatever the set yields.
index = list(set(index1).intersection(set(index2)))

In [39]:
# Range check on the kept positions (largest, then smallest).
print(max(index),min(index))

54914 0


In [40]:
# Number of examples that survive de-duplication.
len(index)

51928

In [41]:
# Select the surviving examples from `data`, in the order given by `index`.
final_data = [data[position] for position in index]

In [42]:
# Project each de-duplicated (code_src, intent, code_tgt) example into the two
# task-specific pair formats: source code -> target code, and intent -> target code.
pandas_fn_filtered_data_code2code = [
    (code_src, code_tgt) for code_src, _intent, code_tgt in final_data
]
pandas_fn_filtered_data_nl2code = [
    (intent, code_tgt) for _code_src, intent, code_tgt in final_data
]

In [43]:
# Save the (source code, target code) pairs. The original passed a bare open()
# to pickle.dump and never closed it; the `with` block closes the file, so the
# data is flushed to disk before the cell finishes.
with open('/home/anushap/Code-Generation-Old/pandas_context_dataset_5years_no_dups_filtered_code2code_FINAL.pkl', 'wb') as fp:
    pickle.dump(pandas_fn_filtered_data_code2code, fp)

In [44]:
# Save the (intent, target code) pairs. The original passed a bare open() to
# pickle.dump and never closed it; the `with` block closes the file, so the
# data is flushed to disk before the cell finishes.
with open('/home/anushap/Code-Generation-Old/pandas_context_dataset_5years_no_dups_filtered_nl2code_FINAL.pkl', 'wb') as fp:
    pickle.dump(pandas_fn_filtered_data_nl2code, fp)