In [21]:
import py_utils
import os

## Identify a Python file to tokenize

In [27]:
# Directory that contains the file to index. Set the DISCOVER_DIR environment
# variable to point the notebook at another checkout; when it is unset the
# original local path is used, so existing behaviour is unchanged.
discover_dir = os.environ.get(
    'DISCOVER_DIR',
    '/home/bhanu/Documents/projects/personal/learning/discover-container/discover',
)
# Full path of the Python file whose functions are tokenized below.
discover_nlp_path = os.path.join(discover_dir, 'scrape.py')

## Load the file into a variable

In [28]:
# Load the file at discover_nlp_path through py_utils.load_module; the result is
# the object that the later cells pass to py_utils.find_functions.
discover_mod = py_utils.load_module(discover_nlp_path)

## List all functions in the file

In [29]:
# `find_functions` lives in py_utils (the only helper module imported above);
# materialise its result so the notebook displays the full list.
list(py_utils.find_functions(discover_mod))

[<function module.name._func_calls(fn)>,
 <function module.name._method_calls(fn)>,
 <function module.name.annotate(func)>,
 <function module.name.camel_case_split(identifier)>,
 <function module.name.find_functions(module)>,
 <function module.name.find_modules(x)>,
 <function module.name.index_python_files(py_module)>,
 <function module.name.load_module(path)>,
 <function module.name.process_docstring(doc_string)>,
 <function module.name.process_funcs(functions)>,
 <function module.name.process_methods(methods)>,
 <function module.name.process_name(func_name)>,
 <function module.name.snake_case_split(identifier)>,
 <function module.name.tokenize(annotation)>]

## Determine function properties

Identify these for each function

- name
- docstring
- function calls
- method calls


In [30]:
# Print every annotation field returned by py_utils.annotate for each function.
for func in py_utils.find_functions(discover_mod):
    print("fn", func.__name__)
    annotation = py_utils.annotate(func)
    for field, value in annotation.items():
        print(f"\t {field!s}")
        print("\t " + "*" * 30)
        # Empty / falsy values are skipped, leaving only the field header.
        if value:
            print(f"\t {value!s}")
        print("\n")

fn _func_calls
	 doc
	 ******************************
	 Determine function calls within a function

    Args:
        fn (module): python module

    Returns:
        funcs (list): list of function calls within a function
    


	 name
	 ******************************
	 _func_calls


	 functions
	 ******************************
	 ['dis', 'Bytecode', 'append']


	 methods
	 ******************************
	 ['Bytecode', 'opname', 'append', 'argval']


fn _method_calls
	 doc
	 ******************************
	 Determine method calls within a function

    Args:
        fn (module): python module

    Returns:
        (list): list of method calls within a function
    


	 name
	 ******************************
	 _method_calls


	 functions
	 ******************************
	 ['re', 'findall', 'METHOD', 'getsource']


	 methods
	 ******************************
	 ['findall']


fn annotate
	 doc
	 ******************************
	 Annotate a function with doc string, functions, methods and name


In [31]:
# For each function, list what py_utils._method_calls reports for it.
for func in py_utils.find_functions(discover_mod):
    print(func)
    method_calls = py_utils._method_calls(func)
    for method_name in method_calls:
        print(f"\t {method_name!s}")
    print(20 * "*")

<function _func_calls at 0x7f8fdce39290>
	 Bytecode
	 opname
	 append
	 argval
********************
<function _method_calls at 0x7f8fdce39320>
	 findall
********************
<function annotate at 0x7f8fdce39200>
	 __doc__
	 __doc__
	 __name__
	 __class__
	 __class__
********************
<function camel_case_split at 0x7f901c0fd4d0>
	 finditer
	 group
********************
<function find_functions at 0x7f8fdce393b0>
	 __name__
********************
<function find_modules at 0x7f8fdce39440>
	 rglob
	 py
********************
<function index_python_files at 0x7f8fdce39560>
	 append
	 join
	 fit_transform
	 npz
	 pkl
********************
<function load_module at 0x7f8fdce394d0>
	 name
	 loader
	 exec_module
	 loader
	 get_code
	 name
********************
<function process_docstring at 0x7f8fdce36ef0>
	 findall
	 extend
********************
<function process_funcs at 0x7f8fdce39050>
	 extend
	 extend
********************
<function process_methods at 0x7f8fdce390e0>
	 extend
*******************

## Prepare document array

Join each function's tokens into a single space-separated string and collect these strings in a list, one item per function.

In [25]:
# One space-joined token string per function, in find_functions order.
documents = [
    ' '.join(py_utils.tokenize(py_utils.annotate(fn)))
    for fn in py_utils.find_functions(discover_mod)
]

documents

['determine function calls within a function args fn module python module returns funcs list list of function calls within a function  func calls dis bytecode append bytecode opname append argval',
 'determine method calls within a function args fn module python module returns list list of method calls within a function  method calls re findall method getsource findall',
 'annotate a function with doc string functions methods and name args func module python module returns d dict description annotate type  func calls type  method calls   doc     doc     name     class     class  ',
 'split camel case function names to tokens args identifier str identifier to split returns list lower case split tokens ex camel case camel case split re finditer finditer group',
 'determine method calls within a function args module module python module returns attr list list of method calls within a function find functions dir getattr callable getattr getattr   name  ',
 'get all python files given a pat

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF model with default settings on the per-function documents.
vectorizer = TfidfVectorizer()
# `matrix` is the resulting document-term matrix: one row per entry in `documents`.
matrix = vectorizer.fit_transform(documents)


In [35]:
# Display the sparse matrix summary (shape and number of stored elements).
matrix

<14x101 sparse matrix of type '<class 'numpy.float64'>'
	with 250 stored elements in Compressed Sparse Row format>

In [36]:
# Display the vectorizer's parameters (it was constructed with no arguments).
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

## Time to test!

In [37]:
# Free-text query to match against the function documents.
question = "lower case"

In [38]:
# Vectorize the query with the already-fitted vectorizer (transform only, no refit)
# so it shares the vocabulary and column layout of `matrix`.
ques_vec = vectorizer.transform([question])

In [39]:
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
# Cosine similarity between the query vector and every row of `matrix`:
# a 1 x n_documents array with one score per function document.
cs = cosine_similarity(ques_vec, matrix)
cs

array([[0.        , 0.        , 0.        , 0.3827608 , 0.        ,
        0.        , 0.        , 0.        , 0.08609757, 0.        ,
        0.09685866, 0.10500961, 0.44089331, 0.        ]])

In [43]:
# Index of the highest-scoring document. `documents` was built by iterating
# py_utils.find_functions(discover_mod), so this is also the position of the
# best-matching function in that listing.
cs.ravel().argmax()

12

In [50]:
# Print each function next to its position so the argmax index can be looked up.
funcs = py_utils.find_functions(discover_mod)

for position, candidate in enumerate(funcs):
    print(f"{position} {candidate!s}")

0 <function _func_calls at 0x7f8fdce39290>
1 <function _method_calls at 0x7f8fdce39320>
2 <function annotate at 0x7f8fdce39200>
3 <function camel_case_split at 0x7f901c0fd4d0>
4 <function find_functions at 0x7f8fdce393b0>
5 <function find_modules at 0x7f8fdce39440>
6 <function index_python_files at 0x7f8fdce39560>
7 <function load_module at 0x7f8fdce394d0>
8 <function process_docstring at 0x7f8fdce36ef0>
9 <function process_funcs at 0x7f8fdce39050>
10 <function process_methods at 0x7f8fdce390e0>
11 <function process_name at 0x7f8fdce36f80>
12 <function snake_case_split at 0x7f8fdce36e60>
13 <function tokenize at 0x7f8fdce39170>


## A few more tests

In [56]:
# Resolve the function list once: it is the same for every query, and a list
# can be indexed directly with the argmax position (rows of `matrix` were built
# in this same find_functions order).
funcs = list(py_utils.find_functions(discover_mod))

for q in ['annotate', 'camel', 'class', 'rglob']:
    print("Searching for", q, "\n", "*" * 20)
    # Vectorize the query with the fitted vocabulary and score it against
    # every function document.
    q_vec = vectorizer.transform([q])
    cs = cosine_similarity(q_vec, matrix)
    scores = cs.ravel()
    if not scores.any():
        # All scores are zero (e.g. no query term is in the vocabulary);
        # argmax would then misleadingly report the first function.
        print("No matching function found", "\n\n")
        continue
    fn_id = scores.argmax()
    print(funcs[fn_id], "\n\n")


Searching for annotate 
 ********************
<function annotate at 0x7f8fdce39200> 


Searching for camel 
 ********************
<function camel_case_split at 0x7f901c0fd4d0> 


Searching for class 
 ********************
<function annotate at 0x7f8fdce39200> 


Searching for rglob 
 ********************
<function find_modules at 0x7f8fdce39440> 


