# Data cleaning and preproccessing 


### To acquire the data set

# https://github.com/github/CodeSearchNet

In [1]:
#importing libraries

import astor
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
import ast
import glob
import re
from pathlib import Path


In [2]:
df = pd.concat([pd.read_json(f'python_train_{i}.jsonl',lines=True) for i in range(14)])

In [3]:
df.head()

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition
0,ageitgey/face_recognition,examples/face_recognition_knn.py,train,"def train(train_dir, model_save_path=None, n_n...",python,"def train(train_dir, model_save_path=None, n_n...","[def, train, (, train_dir, ,, model_save_path,...",Trains a k-nearest neighbors classifier for fa...,"[Trains, a, k, -, nearest, neighbors, classifi...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train
1,ageitgey/face_recognition,examples/face_recognition_knn.py,predict,"def predict(X_img_path, knn_clf=None, model_pa...",python,"def predict(X_img_path, knn_clf=None, model_pa...","[def, predict, (, X_img_path, ,, knn_clf, =, N...",Recognizes faces in given image using a traine...,"[Recognizes, faces, in, given, image, using, a...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train
2,ageitgey/face_recognition,examples/face_recognition_knn.py,show_prediction_labels_on_image,"def show_prediction_labels_on_image(img_path, ...",python,"def show_prediction_labels_on_image(img_path, ...","[def, show_prediction_labels_on_image, (, img_...",Shows the face recognition results visually.\n...,"[Shows, the, face, recognition, results, visua...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train
3,ageitgey/face_recognition,face_recognition/api.py,_rect_to_css,"def _rect_to_css(rect):\n """"""\n Convert ...",python,"def _rect_to_css(rect):\n """"""\n Convert ...","[def, _rect_to_css, (, rect, ), :, return, rec...",Convert a dlib 'rect' object to a plain tuple ...,"[Convert, a, dlib, rect, object, to, a, plain,...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train
4,ageitgey/face_recognition,face_recognition/api.py,_trim_css_to_bounds,"def _trim_css_to_bounds(css, image_shape):\n ...",python,"def _trim_css_to_bounds(css, image_shape):\n ...","[def, _trim_css_to_bounds, (, css, ,, image_sh...","Make sure a tuple in (top, right, bottom, left...","[Make, sure, a, tuple, in, (, top, right, bott...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train


In [4]:
df.shape

(412178, 12)

# We tokenize our data to make them easier for our model to process

In [5]:
def tokenize_docstring(text):
    """Gets filetered docstring tokens which help describe the function"""
    
    # Remove decorators and other parameter signatures in the docstring
    before_keyword, keyword, after_keyword = text.partition(':')
    before_keyword, keyword, after_keyword = before_keyword.partition('@param')
    before_keyword, keyword, after_keyword = before_keyword.partition('param')
    before_keyword, keyword, after_keyword = before_keyword.partition('@brief')
    
    if(after_keyword):    
        words = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize(after_keyword)
    else:
        before_keyword, keyword, after_keyword = before_keyword.partition('@')
        words = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize(before_keyword)
        
    # Convert all docstrings to lowercase
    new_words= [word.lower() for word in words if word.isalnum()]
    
    return new_words


def tokenize_code(text):
    """Gets filetered fucntion tokens"""
    
    # Remove decorators and function signatures till the def token
    keyword = 'def '
    before_keyword, keyword, after_keyword = text.partition(keyword)
    words = RegexpTokenizer(r'[a-zA-Z0-9]+').tokenize(after_keyword)
    
    # Convert function tokens to lowercase and remove single alphabet variables
    new_words= [word.lower() for word in words if (word.isalpha() and len(word)>1) or (word.isnumeric())]
    return new_words


def get_function_docstring_pairs(blob):
    "Extracts (function/method, docstring) pairs from a given code blob."
    
    pairs = []
    try:
        module = ast.parse(blob) # Converts the python code into an abstract syntx tree
        classes = [node for node in module.body if isinstance(node, ast.ClassDef)] # Retrieves classes from source code
        functions = [node for node in module.body if isinstance(node, ast.FunctionDef)] # Retrieves functions from the source code
        for _class in classes:
            functions.extend([node for node in _class.body if isinstance(node, ast.FunctionDef)]) # Retrieves functions from the classes extracted

        for f in functions:
            source = astor.to_source(f) # Convert the functions extracted into ast format so as to remove comments
            docstring = ast.get_docstring(f) if ast.get_docstring(f) else '' # Get docstring from fucntion definition if present
            function = source.replace(ast.get_docstring(f, clean=False), '') if docstring else source # function definition without any comments

            pairs.append((f.name,
                          f.lineno,
                          source,
                          ' '.join(tokenize_code(function)),
                          ' '.join(tokenize_docstring(docstring.split('\n\n')[0]))
                         ))
    except (AssertionError, MemoryError, SyntaxError, UnicodeEncodeError):
        pass
    return pairs


def get_function_docstring_pairs_list(blob_list):
    """apply the function `get_function_docstring_pairs` on a list of blobs"""
    return [get_function_docstring_pairs(b) for b in blob_list]

In [10]:
func_code = df.original_string.tolist()

In [11]:
func_doc = get_function_docstring_pairs_list(function_code)

In [12]:
len(func_doc)

412178

In [13]:
df['pairs'] = func_doc
df.head()

Unnamed: 0,repo,path,func_name,original_string,language,code,code_tokens,docstring,docstring_tokens,sha,url,partition,pairs
0,ageitgey/face_recognition,examples/face_recognition_knn.py,train,"def train(train_dir, model_save_path=None, n_n...",python,"def train(train_dir, model_save_path=None, n_n...","[def, train, (, train_dir, ,, model_save_path,...",Trains a k-nearest neighbors classifier for fa...,"[Trains, a, k, -, nearest, neighbors, classifi...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train,"[(train, 1, def train(train_dir, model_save_pa..."
1,ageitgey/face_recognition,examples/face_recognition_knn.py,predict,"def predict(X_img_path, knn_clf=None, model_pa...",python,"def predict(X_img_path, knn_clf=None, model_pa...","[def, predict, (, X_img_path, ,, knn_clf, =, N...",Recognizes faces in given image using a traine...,"[Recognizes, faces, in, given, image, using, a...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train,"[(predict, 1, def predict(X_img_path, knn_clf=..."
2,ageitgey/face_recognition,examples/face_recognition_knn.py,show_prediction_labels_on_image,"def show_prediction_labels_on_image(img_path, ...",python,"def show_prediction_labels_on_image(img_path, ...","[def, show_prediction_labels_on_image, (, img_...",Shows the face recognition results visually.\n...,"[Shows, the, face, recognition, results, visua...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train,"[(show_prediction_labels_on_image, 1, def show..."
3,ageitgey/face_recognition,face_recognition/api.py,_rect_to_css,"def _rect_to_css(rect):\n """"""\n Convert ...",python,"def _rect_to_css(rect):\n """"""\n Convert ...","[def, _rect_to_css, (, rect, ), :, return, rec...",Convert a dlib 'rect' object to a plain tuple ...,"[Convert, a, dlib, rect, object, to, a, plain,...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train,"[(_rect_to_css, 1, def _rect_to_css(rect):\n ..."
4,ageitgey/face_recognition,face_recognition/api.py,_trim_css_to_bounds,"def _trim_css_to_bounds(css, image_shape):\n ...",python,"def _trim_css_to_bounds(css, image_shape):\n ...","[def, _trim_css_to_bounds, (, css, ,, image_sh...","Make sure a tuple in (top, right, bottom, left...","[Make, sure, a, tuple, in, (, top, right, bott...",c96b010c02f15e8eeb0f71308c641179ac1f19bb,https://github.com/ageitgey/face_recognition/b...,train,"[(_trim_css_to_bounds, 1, def _trim_css_to_bou..."


In [14]:
%%time 
df = df.set_index(['repo', 'path'])['pairs'].apply(pd.Series).stack()
df = df.reset_index()
df.columns = ['repo','path','_','pair']

CPU times: user 1min 51s, sys: 9.83 s, total: 2min 1s
Wall time: 2min 35s


In [18]:
df.tail(200)

Unnamed: 0,repo,path,_,pair
403261,mbarakaja/braulio,braulio/cli.py,0,"(changelog_file_option_validator, 1, def chang..."
403262,mbarakaja/braulio,braulio/cli.py,0,"(message_option_validator, 1, def message_opti..."
403263,mbarakaja/braulio,braulio/cli.py,0,"(current_version_option_validator, 1, def curr..."
403264,mbarakaja/braulio,braulio/cli.py,0,"(label_pattern_option_validator, 1, def label_..."
403265,mbarakaja/braulio,braulio/cli.py,0,"(release, 1, def release(ctx, bump, bump_type,..."
...,...,...,...,...
403456,pjuren/pyokit,src/pyokit/datastruct/read.py,0,"(getRelativeQualityScore, 1, def getRelativeQu..."
403457,pjuren/pyokit,src/pyokit/datastruct/read.py,0,"(reverse_complement, 1, def reverse_complement..."
403458,pjuren/pyokit,src/pyokit/datastruct/read.py,0,"(split, 1, def split(self, point=None):\n ""..."
403459,pjuren/pyokit,src/pyokit/datastruct/read.py,0,"(merge, 1, def merge(self, other, forceMerge=F..."


# We restructure the dataset according to our needs

In [19]:
%%time
df['function_name'] = df['pair'].apply(lambda p: p[0])
df['lineno'] = df['pair'].apply(lambda p: p[1])
df['original_function'] = df['pair'].apply(lambda p: p[2])
df['function_tokens'] = df['pair'].apply(lambda p: p[3])
df['docstring_tokens'] = df['pair'].apply(lambda p: p[4])
df = df[['repo', 'path', 'function_name', 'lineno', 'original_function', 'function_tokens', 'docstring_tokens']]
df['url'] = df[['repo', 'path', 'lineno']].apply(lambda x: 'https://github.com/{}/blob/master/{}#L{}'.format(x[0], x[1], x[2]), axis=1)

CPU times: user 24.7 s, sys: 49.9 ms, total: 24.7 s
Wall time: 24.9 s


In [20]:
df.head()

Unnamed: 0,repo,path,function_name,lineno,original_function,function_tokens,docstring_tokens,url
0,ageitgey/face_recognition,examples/face_recognition_knn.py,train,1,"def train(train_dir, model_save_path=None, n_n...",train train dir model save path none neighbors...,trains a k nearest neighbors classifier for fa...,https://github.com/ageitgey/face_recognition/b...
1,ageitgey/face_recognition,examples/face_recognition_knn.py,predict,1,"def predict(X_img_path, knn_clf=None, model_pa...",predict img path knn clf none model path none ...,recognizes faces in given image using a traine...,https://github.com/ageitgey/face_recognition/b...
2,ageitgey/face_recognition,examples/face_recognition_knn.py,show_prediction_labels_on_image,1,"def show_prediction_labels_on_image(img_path, ...",show prediction labels on image img path predi...,shows the face recognition results visually,https://github.com/ageitgey/face_recognition/b...
3,ageitgey/face_recognition,face_recognition/api.py,_rect_to_css,1,"def _rect_to_css(rect):\n """"""\n Convert ...",rect to css rect return rect top rect right re...,convert a dlib rect object to a plain tuple in...,https://github.com/ageitgey/face_recognition/b...
4,ageitgey/face_recognition,face_recognition/api.py,_trim_css_to_bounds,1,"def _trim_css_to_bounds(css, image_shape):\n ...",trim css to bounds css image shape return max ...,make sure a tuple in top right bottom left ord...,https://github.com/ageitgey/face_recognition/b...


In [21]:
%%time
before_dedup = len(df)
df = df.drop_duplicates(['original_function'])
after_dedup = len(df)

print(f'Removed {before_dedup - after_dedup:,} duplicate rows')

Removed 10 duplicate rows
CPU times: user 1.19 s, sys: 4.49 ms, total: 1.19 s
Wall time: 1.26 s


In [22]:
%%time
before_dedup = len(df)
df = df.drop_duplicates(['function_tokens'])
after_dedup = len(df)

print(f'Removed {before_dedup - after_dedup:,} duplicate rows')

Removed 557 duplicate rows
CPU times: user 599 ms, sys: 157 µs, total: 599 ms
Wall time: 614 ms


In [23]:
df.shape

(402894, 8)

In [24]:
def listlen(x):
    if not isinstance(x, list):
        return 0
    return len(x)

with_docstrings = df[df.docstring_tokens.str.split().apply(listlen) >= 3]
without_docstrings = df[df.docstring_tokens.str.split().apply(listlen) < 3]
print('Number of Function Snippets with docstring',len(with_docstrings))
print('Number of Function Snippets without docstring',len(without_docstrings))

Number of Function Snippets with docstring 375557
Number of Function Snippets without docstring 27337


In [25]:
with_docstrings.to_csv('processed_full.csv')

In [29]:
with_docstrings.tail()

Unnamed: 0,repo,path,function_name,lineno,original_function,function_tokens,docstring_tokens,url
403455,pjuren/pyokit,src/pyokit/datastruct/read.py,trimLeft,1,"def trimLeft(self, amount):\n """"""\n Tr...",trimleft self amount if amount 0 return self s...,trim this fastqsequence in place by removing a...,https://github.com/pjuren/pyokit/blob/master/s...
403456,pjuren/pyokit,src/pyokit/datastruct/read.py,getRelativeQualityScore,1,"def getRelativeQualityScore(self, i, score_typ...",getrelativequalityscore self score type illumi...,get the realtive quality score i e the phred q...,https://github.com/pjuren/pyokit/blob/master/s...
403457,pjuren/pyokit,src/pyokit/datastruct/read.py,reverse_complement,1,"def reverse_complement(self, is_RNA=None):\n ...",reverse complement self is rna none sequence r...,reverse complement this read in place,https://github.com/pjuren/pyokit/blob/master/s...
403458,pjuren/pyokit,src/pyokit/datastruct/read.py,split,1,"def split(self, point=None):\n """"""\n Spl...",split self point none if point is none point l...,split this read into two halves original seque...,https://github.com/pjuren/pyokit/blob/master/s...
403459,pjuren/pyokit,src/pyokit/datastruct/read.py,merge,1,"def merge(self, other, forceMerge=False):\n ...",merge self other forcemerge false if self sequ...,merge two reads by concatenating their sequenc...,https://github.com/pjuren/pyokit/blob/master/s...


In [31]:
# Number of unique repositories
print("Number of repositories -",len(list(set(with_docstrings['repo']))))

Number of repositories - 11991


In [32]:
# Getting the count of function tokens per function
with_docstrings['function_tokens_count'] = [len(item.split()) for item in list(with_docstrings['function_tokens'].values)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [34]:
#Grouping entries by thie repository name
grouped = with_docstrings.groupby('repo')

In [35]:
# train, valid, test splits
train, valid = train_test_split(list(grouped), train_size=0.9, random_state=8081)
train, test = train_test_split(train, train_size=0.9, random_state=8081)

In [36]:
train = pd.concat([d for _, d in train]).reset_index(drop=True)
valid = pd.concat([d for _, d in valid]).reset_index(drop=True)
test = pd.concat([d for _, d in test]).reset_index(drop=True)

In [37]:
print(f'train set num rows {train.shape[0]:,}')
print(f'valid set num rows {valid.shape[0]:,}')
print(f'test set num rows {test.shape[0]:,}')
print(f'without docstring rows {without_docstrings.shape[0]:,}')

train set num rows 303,256
valid set num rows 37,783
test set num rows 34,518
without docstring rows 27,337


In [38]:
train.head()

Unnamed: 0,repo,path,function_name,lineno,original_function,function_tokens,docstring_tokens,url,function_tokens_count
0,kennethreitz/env,env.py,lower_dict,1,"def lower_dict(d):\n """"""Lower cases string ...",lower dict for in items try lower except attri...,lower cases string keys in given dict,https://github.com/kennethreitz/env/blob/maste...,10
1,kennethreitz/env,env.py,urlparse,1,"def urlparse(d, keys=None):\n """"""Returns a ...",urlparse keys none copy if keys is none keys k...,returns a copy of the given dictionary with ur...,https://github.com/kennethreitz/env/blob/maste...,18
2,kennethreitz/env,env.py,prefix,1,"def prefix(prefix):\n """"""Returns a dictiona...",prefix prefix lower dict environ copy prefix p...,returns a dictionary of all environment variab...,https://github.com/kennethreitz/env/blob/maste...,22
3,kennethreitz/env,env.py,map,1,"def map(**kwargs):\n """"""Returns a dictionar...",map kwargs lower dict environ copy for in kwar...,returns a dictionary of the given keyword argu...,https://github.com/kennethreitz/env/blob/maste...,13
4,aloetesting/aloe_webdriver,aloe_webdriver/util.py,string_literal,1,"def string_literal(content):\n """"""\n Cho...",string literal content if in content and in co...,choose a string literal that can wrap our string,https://github.com/aloetesting/aloe_webdriver/...,27


In [40]:
train.sort_values(by=['function_tokens_count'], inplace=True)
valid.sort_values(by=['function_tokens_count'], inplace=True)
test.sort_values(by=['function_tokens_count'], inplace=True)

In [42]:
train.to_csv('train_sorted.csv')
valid.to_csv('valid_sorted.csv')
test.to_csv('test_sorted.csv')

In [43]:
train.head()

Unnamed: 0,repo,path,function_name,lineno,original_function,function_tokens,docstring_tokens,url,function_tokens_count
89542,espressif/esptool,ecdsa/numbertheory.py,gcd2,1,"def gcd2(a, b):\n """"""Greatest common diviso...",while return,greatest common divisor using euclid s algorithm,https://github.com/espressif/esptool/blob/mast...,2
74655,gwastro/pycbc,pycbc/conversions.py,chi_a,1,"def chi_a(mass1, mass2, spin1z, spin2z):\n ...",chi return,returns the aligned mass weighted spin differe...,https://github.com/gwastro/pycbc/blob/master/p...,2
165400,SHDShim/pytheos,pytheos/eqn_bm3.py,bm3_k,1,"def bm3_k(p, v0, k0, k0p):\n """"""\n calcu...",return cal,calculate bulk modulus wrapper for cal k bm3 c...,https://github.com/SHDShim/pytheos/blob/master...,2
177346,olsoneric/pedemath,pedemath/vec2.py,cross_v2,1,"def cross_v2(vec1, vec2):\n """"""Return the c...",cross return,return the crossproduct of the two vectors as ...,https://github.com/olsoneric/pedemath/blob/mas...,2
285898,asweigart/pyautogui,pyautogui/__init__.py,getPointOnLine,1,"def getPointOnLine(x1, y1, x2, y2, n):\n """"...",getpointonline return,returns the x y tuple of the point that has pr...,https://github.com/asweigart/pyautogui/blob/ma...,2


# Now that our data are set up for the task, we can start the vectorization process