In [1]:
import pdb
import json
import glob
import pickle
import tokenize
from io import StringIO
import numpy as np
import sys
from collections import Counter

In [2]:
def load_data(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and closed before returning.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)

# Filtering newline-only and blank lines from code and intent

In [3]:
def filter_newline(data, mode='nl', casefolding=True):
    """Drop blank lines from each example and return ``(src, tgt)`` pairs.

    For every ``item`` in ``data``:

    * ``tgt`` holds the non-blank lines of ``item[0]``.
    * ``src`` holds the non-blank lines of ``item[2]`` when ``mode == 'nl'``
      (lower-cased when ``casefolding`` is truthy), or the non-blank lines
      of ``item[1]`` when ``mode == 'code'`` (never lower-cased).
    * Any other ``mode`` leaves ``src`` empty.

    A line counts as blank when nothing is left after ``str.strip()``.
    """
    def _non_blank(lines):
        # Whitespace-only lines (including a bare newline) are discarded.
        return [entry for entry in lines if entry.strip() != '']

    pairs = []
    for example in data:
        target = _non_blank(example[0])
        source = []

        if mode == 'nl':
            source = _non_blank(example[2])
            if casefolding:
                source = [text.lower() for text in source]

        if mode == 'code':
            source = _non_blank(example[1])

        pairs.append((source, target))
    return pairs

# Filtering lines with non-ASCII characters from code and intent

In [4]:
def is_ascii(s):
    """Return True when every character in ``s`` has a code point below 128.

    An empty ``s`` yields True.
    """
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

In [5]:
def filter_ascii_data(data):
    """Keep only the ASCII lines on both sides of every pair.

    For each ``item`` the lines of ``item[0]`` and ``item[1]`` that pass
    ``is_ascii`` are kept, preserving their order and position in the pair.
    A pair is dropped entirely when either side ends up empty.
    """
    kept = []
    for item in data:
        first = [line for line in item[0] if is_ascii(line)]
        second = [line for line in item[1] if is_ascii(line)]

        # Both sides must still have at least one line.
        if first and second:
            kept.append((first, second))
    return kept

# Keeping only examples with at most 3 lines (both code and intent)

In [6]:
def lines_reduced_filter(length, data, tgt_length=None):
    """Keep items whose two line lists are both non-empty and short enough.

    Args:
        length: Maximum number of lines allowed in ``item[0]``.
        data: Iterable of pairs whose elements support ``len()``.
        tgt_length: Maximum number of lines allowed in ``item[1]``.
            Defaults to ``length``. This bound used to be hard-coded to 3,
            so it silently ignored ``length`` for the second element.

    Returns:
        A list of the items that pass both bounds, in their original order.
    """
    if tgt_length is None:
        tgt_length = length

    return [
        item
        for item in data
        if 0 < len(item[0]) <= length and 0 < len(item[1]) <= tgt_length
    ]

# Filtering based on Pandas functions

In [7]:
# Load the collection that contains_pandas_api iterates over
# (presumably pandas function names, judging by the file name -- TODO confirm).
# NOTE: unpickling can execute arbitrary code; the file must be trusted.
with open('pickles/pandas_fns_names.pkl', 'rb') as names_file:
    pandas_fns = pickle.load(names_file)

In [8]:
def contains_pandas_api(codeline):
    """Return True if ``codeline`` contains ``'.' + name`` for any entry of ``pandas_fns``.

    This is a plain substring test, so ``'.head'`` also matches a line
    containing ``'.header'``.
    """
    return any(('.' + name) in codeline for name in pandas_fns)

In [9]:
def pandas_fn_name_filter(data):
    """Keep the items where at least one line of ``item[0]`` passes ``contains_pandas_api``.

    Items are returned unchanged and in their original order.

    NOTE(review): in this notebook's pipeline ``filter_newline`` returns
    ``(src, tgt)`` pairs, so ``item[0]`` appears to be the ``src`` side
    rather than the code lines -- confirm this is the element meant to be
    inspected.
    """
    kept = []
    for item in data:
        # Every line is checked before the flags are combined.
        flags = [contains_pandas_api(line) for line in item[0]]
        if any(flags):
            kept.append(item)
    return kept

# Chop every line to a fixed maximum length

In [10]:
def chop_data(data, length):
    """Truncate every line of every pair to at most ``length`` characters.

    Each ``item`` contributes one ``(src, tgt)`` tuple built from
    ``item[0]`` and ``item[1]``; lines are sliced with ``line[:length]``,
    so shorter lines pass through unchanged.
    """
    return [
        (
            [line[:length] for line in item[0]],
            [line[:length] for line in item[1]],
        )
        for item in data
    ]

In [11]:
# Load the raw pickled dataset, report its size, then drop blank lines
# using the natural-language ('nl') mode of filter_newline.
# NOTE(review): this is an absolute, user-specific path; the notebook will
# not run on another machine without editing it.
pandas_data = load_data(
    '/home/anushap/Code-Generation-Old/pandas_context_dataset_5years_no_dups.pkl'
)
print(len(pandas_data), ' is original raw data')
line_stripped_data = filter_newline(pandas_data, 'nl')

33205  is original raw data


In [12]:
# Remove non-ASCII lines (and pairs left empty by that), then report the count.
ascii_filtered_data = filter_ascii_data(data=line_stripped_data)
print(len(ascii_filtered_data), ' is after ascii filtering')

30035  is after ascii filtering


In [13]:
# Keep only pairs where each side has between 1 and 3 lines, then report the count.
lines_reduced_data = lines_reduced_filter(length=3, data=ascii_filtered_data)
print(len(lines_reduced_data), ' is after num of lines reduced')

17170  is after num of lines reduced


In [14]:
# Keep only pairs that match an entry of pandas_fns, then report the count.
pandas_fn_filtered_data = pandas_fn_name_filter(data=lines_reduced_data)
print(len(pandas_fn_filtered_data), ' is after filtering for pandas fns')

6695  is after filtering for pandas fns


In [None]:
# Truncate every line to at most 150 characters; the number of pairs is unchanged.
chopped_data = chop_data(pandas_fn_filtered_data, length=150)
print(len(chopped_data), ' is after chopping every line for target size (char)')

In [None]:
with open('pickles/')