# Step 1 : Data loading and preprocessing

In [1]:
%load_ext autoreload
%autoreload 2

import ast
import glob
import re
from pathlib import Path

import astor
import pandas as pd
import spacy
from tqdm import tqdm
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split

from general_utils import apply_parallel, flattenlist

EN = spacy.load('en')

In [3]:
%%time
# Read the data into a pandas dataframe, and parse out some meta-data

df = pd.concat([pd.read_csv(f'https://storage.googleapis.com/kubeflow-examples/code_search/raw_data/00000000000{i}.csv') \
                for i in range(10)])

df['nwo'] = df['repo_path'].apply(lambda r: r.split()[0])
df['path'] = df['repo_path'].apply(lambda r: r.split()[1])
df.drop(columns=['repo_path'], inplace=True)
df = df[['nwo', 'path', 'content']]
df.head()

CPU times: user 50.3 s, sys: 20.8 s, total: 1min 11s
Wall time: 4min 34s


Unnamed: 0,nwo,path,content
0,fnl/libfnl,src/fnl/nlp/dictionary.py,"""""""\n.. py:module:: fnl.text.dictionary\n :s..."
1,KivApple/mcu-info-util,mcu_info_util/linker_script.py,from six import iteritems\n\n\ndef generate(op...
2,Yelp/pyleus,examples/bandwith_monitoring/bandwith_monitori...,"from __future__ import absolute_import, divisi..."
3,jhuapl-boss/boss-manage,bin/bearer_token.py,#!/usr/bin/env python3\n\n# Copyright 2016 The...
4,djfroofy/beatlounge,bl/orchestra/base.py,from itertools import cycle\n\nfrom twisted.py...


In [4]:
# Inspect shape of the raw data
df.shape


(1241664, 3)

## Functions to parse data and tokenize¶ 

#### Our goal is to parse the python files into (code, docstring) pairs. Fortunately, the standard library in python comes with the wonderful ast module which helps us extract code from files as well as extract docstrings.We also use the astor library to strip the code of comments by doing a round trip of converting the code to an AST and then from AST back to code.

In [5]:
def tokenize_docstring(text):
    "Apply tokenization using spacy to docstrings."
    tokens = EN.tokenizer(text)
    return [token.text.lower() for token in tokens if not token.is_space]


def tokenize_code(text):
    "A very basic procedure for tokenizing code strings."
    return RegexpTokenizer(r'\w+').tokenize(text)


def get_function_docstring_pairs(blob):
    "Extract (function/method, docstring) pairs from a given code blob."
    pairs = []
    try:
        module = ast.parse(blob)
        classes = [node for node in module.body if isinstance(node, ast.ClassDef)]
        functions = [node for node in module.body if isinstance(node, ast.FunctionDef)]
        for _class in classes:
            functions.extend([node for node in _class.body if isinstance(node, ast.FunctionDef)])

        for f in functions:
            source = astor.to_source(f)
            docstring = ast.get_docstring(f) if ast.get_docstring(f) else ''
            function = source.replace(ast.get_docstring(f, clean=False), '') if docstring else source

            pairs.append((f.name,
                          f.lineno,
                          source,
                          ' '.join(tokenize_code(function)),
                          ' '.join(tokenize_docstring(docstring.split('\n\n')[0]))
                         ))
    except (AssertionError, MemoryError, SyntaxError, UnicodeEncodeError):
        pass
    return pairs


def get_function_docstring_pairs_list(blob_list):
    """apply the function `get_function_docstring_pairs` on a list of blobs"""
    return [get_function_docstring_pairs(b) for b in blob_list]

####  The below convience function apply_parallel parses the code in parallel using process based threading. Adjust the cpu_cores parameter accordingly to your system resources!


In [8]:
%%time
pairs = flattenlist(apply_parallel(get_function_docstring_pairs_list, df.content.tolist(), cpu_cores=32))

CPU times: user 37.2 s, sys: 31.2 s, total: 1min 8s
Wall time: 5min 38s


In [9]:
assert len(pairs) == df.shape[0], f'Row count mismatch. `df` has {df.shape[0]:,} rows; `pairs` has {len(pairs):,} rows.'
df['pairs'] = pairs
df.head()

Unnamed: 0,nwo,path,content,pairs
0,fnl/libfnl,src/fnl/nlp/dictionary.py,"""""""\n.. py:module:: fnl.text.dictionary\n :s...","[(__init__, 19, def __init__(self, *leafs, **e..."
1,KivApple/mcu-info-util,mcu_info_util/linker_script.py,from six import iteritems\n\n\ndef generate(op...,"[(generate, 4, def generate(options, filename=..."
2,Yelp/pyleus,examples/bandwith_monitoring/bandwith_monitori...,"from __future__ import absolute_import, divisi...","[(__init__, 18, def __init__(self, size):\n ..."
3,jhuapl-boss/boss-manage,bin/bearer_token.py,#!/usr/bin/env python3\n\n# Copyright 2016 The...,"[(request, 46, def request(url, params=None, h..."
4,djfroofy/beatlounge,bl/orchestra/base.py,from itertools import cycle\n\nfrom twisted.py...,"[(schedule, 149, def schedule(time, func, args..."


## Flatten code, docstring pairs and extract meta-data¶


Flatten (code, docstring) pairs


In [11]:
%%time
# flatten pairs
df = df.set_index(['nwo', 'path'])['pairs'].apply(pd.Series).stack()
df = df.reset_index()
df.columns = ['nwo', 'path', '_', 'pair']

CPU times: user 6min 15s, sys: 28.9 s, total: 6min 44s
Wall time: 6min 42s




Extract meta-data and format dataframe.

We have not optimized this code. Pull requests are welcome!


In [12]:
%%time
df['function_name'] = df['pair'].apply(lambda p: p[0])
df['lineno'] = df['pair'].apply(lambda p: p[1])
df['original_function'] = df['pair'].apply(lambda p: p[2])
df['function_tokens'] = df['pair'].apply(lambda p: p[3])
df['docstring_tokens'] = df['pair'].apply(lambda p: p[4])
df = df[['nwo', 'path', 'function_name', 'lineno', 'original_function', 'function_tokens', 'docstring_tokens']]
df['url'] = df[['nwo', 'path', 'lineno']].apply(lambda x: 'https://github.com/{}/blob/master/{}#L{}'.format(x[0], x[1], x[2]), axis=1)
df.head()

CPU times: user 4min 18s, sys: 8.87 s, total: 4min 27s
Wall time: 4min 27s


Unnamed: 0,nwo,path,function_name,lineno,original_function,function_tokens,docstring_tokens,url
0,fnl/libfnl,src/fnl/nlp/dictionary.py,__init__,19,"def __init__(self, *leafs, **edges):\n self...",def __init__ self leafs edges self edges edges...,,https://github.com/fnl/libfnl/blob/master/src/...
1,fnl/libfnl,src/fnl/nlp/dictionary.py,__eq__,23,"def __eq__(self, other):\n if isinstance(ot...",def __eq__ self other if isinstance other Node...,,https://github.com/fnl/libfnl/blob/master/src/...
2,fnl/libfnl,src/fnl/nlp/dictionary.py,__repr__,29,def __repr__(self):\n return 'Node<leafs={}...,def __repr__ self return Node leafs edges form...,,https://github.com/fnl/libfnl/blob/master/src/...
3,fnl/libfnl,src/fnl/nlp/dictionary.py,createOrGet,32,"def createOrGet(self, token):\n """"""\n\t\tCr...",def createOrGet self token if token in self ed...,create or get the node pointed to by ` token `...,https://github.com/fnl/libfnl/blob/master/src/...
4,fnl/libfnl,src/fnl/nlp/dictionary.py,setLeaf,47,"def setLeaf(self, key, order):\n """"""\n\t\tS...",def setLeaf self key order self leafs append o...,store the ` key ` as a leaf of this node at po...,https://github.com/fnl/libfnl/blob/master/src/...


### Remove Duplicates¶


In [13]:
%%time
# remove observations where the same function appears more than once
before_dedup = len(df)
df = df.drop_duplicates(['original_function', 'function_tokens'])
after_dedup = len(df)

print(f'Removed {before_dedup - after_dedup:,} duplicate rows')

Removed 1,196,159 duplicate rows
CPU times: user 23.4 s, sys: 2.61 s, total: 26 s
Wall time: 26 s


In [14]:
df.shape

(5396853, 8)

###  Separate function w/o docstrings


In [15]:
def listlen(x):
    if not isinstance(x, list):
        return 0
    return len(x)

# separate functions w/o docstrings
# docstrings should be at least 3 words in the docstring to be considered a valid docstring

with_docstrings = df[df.docstring_tokens.str.split().apply(listlen) >= 3]
without_docstrings = df[df.docstring_tokens.str.split().apply(listlen) < 3]

### Partition code by repository to minimize leakage between train, valid & test sets.¶ 

Rough assumption that each repository has its own style. We want to avoid having code from the same repository in the training set as well as the validation or holdout set.

In [16]:
grouped = with_docstrings.groupby('nwo')

In [17]:
# train, valid, test splits
train, test = train_test_split(list(grouped), train_size=0.87, shuffle=True, random_state=8081)
train, valid = train_test_split(train, train_size=0.82, random_state=8081)

In [18]:
train = pd.concat([d for _, d in train]).reset_index(drop=True)
valid = pd.concat([d for _, d in valid]).reset_index(drop=True)
test = pd.concat([d for _, d in test]).reset_index(drop=True)


In [19]:
print(f'train set num rows {train.shape[0]:,}')
print(f'valid set num rows {valid.shape[0]:,}')
print(f'test set num rows {test.shape[0]:,}')
print(f'without docstring rows {without_docstrings.shape[0]:,}')

train set num rows 997,815
valid set num rows 216,680
test set num rows 187,048
without docstring rows 3,995,310



Preview what the training set looks like. You can start to see how the data looks, the function tokens and docstring tokens are what will be fed downstream into the models. The other information is important for diagnostics and bookeeping.

In [20]:
train.head()

Unnamed: 0,nwo,path,function_name,lineno,original_function,function_tokens,docstring_tokens,url
0,pdyba/aioquiz,utils.py,safe_del_key,46,"def safe_del_key(data, unwanted_key):\n """"""...",def safe_del_key data unwanted_key if isinstan...,safe deleter of keys : param data : dict : par...,https://github.com/pdyba/aioquiz/blob/master/u...
1,pdyba/aioquiz,views/utils.py,user_required,16,"def user_required(access_level='any_user', msg...",def user_required access_level any_user msg NO...,no_user - anonymus any_user - loged user mento...,https://github.com/pdyba/aioquiz/blob/master/v...
2,Kulbear/endless-2048,tester/minimax_tester.py,test_one_game,33,"def test_one_game(self):\n """"""Go through on...",def test_one_game self game self create_one_ga...,"go through one game , played by a minimaxagent...",https://github.com/Kulbear/endless-2048/blob/m...
3,Kulbear/endless-2048,agent/minimax_agent.py,get_move,37,"def get_move(self, game):\n """"""Search the n...",def get_move self game available game moves_av...,search the next optimal move by the iterative ...,https://github.com/Kulbear/endless-2048/blob/m...
4,Kulbear/endless-2048,agent/minimax_agent.py,search,53,"def search(self, game, alpha, beta, depth, max...",def search self game alpha beta depth max_dept...,the implementation of the minimax search with ...,https://github.com/Kulbear/endless-2048/blob/m...


## Output each set to train/valid/test.function/docstrings/lineage files 

Original functions are also written to compressed json files. (Raw functions contain ,, \t, \n, etc., it is less error-prone using json format)

{train,valid,test}.lineage are files that contain a reference to the original location where the code was retrieved.

In [21]:
!mkdir data

In [22]:
mkdir ./data/processed_data

In [23]:
def write_to(df, filename, path='./data/processed_data/'):
    "Helper function to write processed files to disk."
    out = Path(path)
    out.mkdir(exist_ok=True)
    df.function_tokens.to_csv(out/'{}.function'.format(filename), index=False)
    df.original_function.to_json(out/'{}_original_function.json.gz'.format(filename), orient='values', compression='gzip')
    if filename != 'without_docstrings':
        df.docstring_tokens.to_csv(out/'{}.docstring'.format(filename), index=False)
    df.url.to_csv(out/'{}.lineage'.format(filename), index=False)


In [24]:
import os
if not os.path.exists('data/'):
    os.makedirs('data/')
# write to output files
write_to(train, 'train')
write_to(valid, 'valid')
write_to(test, 'test')
write_to(without_docstrings, 'without_docstrings')

In [25]:
!ls -lah ./data/processed_data/

total 2.6G
drwxrwxr-x 2 ritesh ritesh 4.0K Apr  4 05:22 .
drwxrwxr-x 3 ritesh ritesh 4.0K Apr  4 05:16 ..
-rw-rw-r-- 1 ritesh ritesh  16M Apr  4 05:19 test.docstring
-rw-rw-r-- 1 ritesh ritesh  55M Apr  4 05:19 test.function
-rw-rw-r-- 1 ritesh ritesh  18M Apr  4 05:19 test.lineage
-rw-rw-r-- 1 ritesh ritesh  25M Apr  4 05:19 test_original_function.json.gz
-rw-rw-r-- 1 ritesh ritesh  70M Apr  4 05:18 train.docstring
-rw-rw-r-- 1 ritesh ritesh 308M Apr  4 05:17 train.function
-rw-rw-r-- 1 ritesh ritesh  86M Apr  4 05:19 train.lineage
-rw-rw-r-- 1 ritesh ritesh 140M Apr  4 05:18 train_original_function.json.gz
-rw-rw-r-- 1 ritesh ritesh  16M Apr  4 05:19 valid.docstring
-rw-rw-r-- 1 ritesh ritesh  69M Apr  4 05:19 valid.function
-rw-rw-r-- 1 ritesh ritesh  19M Apr  4 05:19 valid.lineage
-rw-rw-r-- 1 ritesh ritesh  31M Apr  4 05:19 valid_original_function.json.gz
-rw-rw-r-- 1 ritesh ritesh 1.1G Apr  4 05:19 without_docstrings.function
-rw-rw-r-- 1 ritesh ritesh 344M Apr  4