In [2]:
import networkx as nx
import os
from networkx.algorithms.traversal.depth_first_search import dfs_tree
from tabulate import tabulate
import regex as re
from transformers import BartTokenizerFast
DATA_PATH = "../../data"

In [3]:
# Load the pre-built library tree from <DATA_PATH>/processed/top_10_lib_tree/tree.pickle.
# NOTE(review): read_gpickle unpickles the file, which can execute arbitrary
# code -- only load trees that come from a trusted source.
# NOTE(review): nx.read_gpickle was removed in networkx 3.0, so this cell
# presumably requires networkx 2.x -- confirm the pinned version.
top_10_tree_path = os.path.join(DATA_PATH,"processed","top_10_lib_tree","tree.pickle")
top_10_tree = nx.read_gpickle(top_10_tree_path)

In [4]:
len(top_10_tree.edges())

206083

In [5]:
len(top_10_tree.nodes())

206084

In [5]:
list(top_10_tree.predecessors('sklearn.decomposition.dict_learning'))

['sklearn.decomposition']

In [6]:
def get_nodes_with_in_degree(G,n):
    """Return the nodes of ``G`` whose in-degree is exactly ``n``.

    Args:
        G: graph exposing ``nodes()`` and ``in_degree(node)``.
        n: in-degree to select on.

    Returns:
        A list of matching nodes, in the iteration order of ``G.nodes()``.
    """
    return [candidate for candidate in G.nodes() if G.in_degree(candidate) == n]


## How can we get this down:
1. Remove functions that start with `_` (and descendants) -> Use the Python naming convention for private methods
2. Remove tests (and descendants)? How much of the tree do non-apis make up?
3. Strip out all functions that don't appear in the training set? 
    * What do we mean by appear? 

In [7]:
def remove_subtree(G,node):
    """Remove ``node`` and everything reachable from it out of ``G`` (in place).

    Reachability follows ``G.neighbors`` (successors on a directed graph).
    All doomed nodes are collected first and removed in a single pass, so:

    * a descendant shared by several branches (e.g. root->a, root->c, a->c)
      no longer aborts the removal half-way and leaves ``node`` behind;
    * cycles cannot cause unbounded recursion;
    * deep trees cannot hit the interpreter recursion limit.

    Args:
        G: graph to mutate.
        node: root of the subtree to delete.

    Raises:
        Whatever ``G.neighbors(node)`` raises when ``node`` is not in ``G``
        (``networkx.NetworkXError`` for networkx graphs).
    """
    doomed = {node}
    pending = [node]
    while pending:
        current = pending.pop()
        # Nothing is deleted while walking, so every node on the stack is
        # still present and its neighbour list is intact.
        for child in G.neighbors(current):
            if child not in doomed:
                doomed.add(child)
                pending.append(child)
    G.remove_nodes_from(doomed)

def conditional_remove_subtrees(G,root,fn):
    """Depth-first walk of ``G`` from ``root`` that prunes matching subtrees.

    If ``fn(G, root)`` is truthy, ``root`` and all of its descendant nodes
    are removed from ``G``. Otherwise the same check is applied recursively
    to each neighbour of ``root``. A ``root`` that is not in ``G`` is ignored.

    Args:
        G: graph to prune in place.
        root: node at which to start.
        fn: predicate called as ``fn(G, node)``.
    """
    if root not in G:
        return
    if not fn(G, root):
        # Snapshot the neighbours: the recursion may delete some of them.
        for successor in list(nx.neighbors(G, root)):
            conditional_remove_subtrees(G, successor, fn)
        return
    remove_subtree(G, root)

def is_private(G,node):
    """Tell whether the dotted name ``node`` has a component starting with ``_``.

    The match requires a literal ``._`` followed by at least one more
    character, so a leading underscore on the very first component does not
    count. ``G`` is unused; it is accepted so the function fits the
    ``fn(G, node)`` predicate signature.
    """
    found = re.search(r"\._(.+)", node)
    return found is not None

In [8]:
conditional_remove_subtrees(top_10_tree,"libraries",is_private)

In [9]:
top_10_tree.size()

65138

In [6]:
def profile_libraries(G,root ="libraries") :
    """Report the subtree size of every neighbour of ``root``.

    Args:
        G: graph to profile.
        root: node whose neighbours are profiled.

    Returns:
        A list of ``(name, node_count)`` tuples, largest first, where
        ``node_count`` is the number of nodes in ``dfs_tree(G, name)``.
    """
    library_sizes = []
    for library in nx.neighbors(G, root):
        subtree = dfs_tree(G, library)
        library_sizes.append((library, len(subtree.nodes)))
    library_sizes.sort(key=lambda entry: entry[1], reverse=True)
    return library_sizes

In [7]:
print(tabulate(profile_libraries(top_10_tree),tablefmt="pretty"))

+---------+-------+
| pandas  | 61192 |
|  torch  | 59097 |
|  numpy  | 44177 |
|  keras  | 22441 |
|  scipy  | 9672  |
| sklearn | 9504  |
+---------+-------+


Remove contrib and compiler modules:

In [12]:
def is_contrib(G,node):
    """Tell whether ``node`` contains ``.contrib.`` (a dot on both sides).

    A name that merely ends in ``.contrib`` does not match. ``G`` is unused;
    it is accepted so the function fits the ``fn(G, node)`` predicate signature.
    """
    found = re.search(r"\.contrib\.", node)
    return found is not None

def is_compiler(G,node):
    """Tell whether ``node`` contains ``.compiler.`` (a dot on both sides).

    A name that merely ends in ``.compiler`` does not match. ``G`` is unused;
    it is accepted so the function fits the ``fn(G, node)`` predicate signature.
    """
    found = re.search(r"\.compiler\.", node)
    return found is not None

# Prune, in place, every subtree under "libraries" whose root name contains
# ".contrib.", then print Graph.size() (the number of edges) to see the effect.
conditional_remove_subtrees(top_10_tree, "libraries", is_contrib)
print(top_10_tree.size())

# Same pruning pass for names containing ".compiler.".
conditional_remove_subtrees(top_10_tree, "libraries", is_compiler)
print(top_10_tree.size())


65138
65138


In [13]:
profile_libraries(top_10_tree)

[('torch', 20934),
 ('pandas', 17060),
 ('numpy', 14420),
 ('keras', 6655),
 ('scipy', 3079),
 ('sklearn', 2990)]

In [14]:
top_10_tree.size()

65138

Remove keras from tensorflow:

In [15]:
def is_tf_keras(G,node):
    """Tell whether ``node`` contains ``tensorflow`` followed later by ``.keras.``.

    Any characters may sit between the two parts. A name that merely ends in
    ``.keras`` does not match. ``G`` is unused; it is accepted so the function
    fits the ``fn(G, node)`` predicate signature.
    """
    found = re.search(r"tensorflow.*\.keras\.", node)
    return found is not None

# Prune, in place, every subtree whose root name matches is_tf_keras, then
# print Graph.size() (the number of edges) to see the effect.
conditional_remove_subtrees(top_10_tree, "libraries", is_tf_keras)
print(top_10_tree.size())

65138


In [16]:
profile_libraries(top_10_tree)

[('torch', 20934),
 ('pandas', 17060),
 ('numpy', 14420),
 ('keras', 6655),
 ('scipy', 3079),
 ('sklearn', 2990)]

## What does this look like if we tokenize the nodes?

In [17]:
# Build a BartTokenizerFast from the vocab.json / merges.txt pair stored in
# the tokenizer directory (path is relative to the notebook's working directory).
path_to_tokenizer = "../../tokenizer"
vocab_path = os.path.join(path_to_tokenizer, "vocab.json")
merges_path = os.path.join(path_to_tokenizer, "merges.txt")
tokenizer = BartTokenizerFast(vocab_path, merges_path)

In [48]:
def tokenize_graph(G,tokenizer):
    """Return an undirected copy of ``G`` with token-id nodes attached.

    For every node of ``G``, its ``"name"`` attribute is encoded with
    ``tokenizer.encode(..., add_special_tokens=False)`` and an edge is added
    from the node to each resulting token id. ``G`` itself is not modified.

    Args:
        G: graph whose nodes each carry a ``"name"`` attribute.
        tokenizer: object exposing ``encode(text, add_special_tokens=...)``.

    Returns:
        ``to_undirected()`` of the augmented copy.

    Raises:
        KeyError: if a node has no ``"name"`` attribute.
    """
    augmented = G.copy()
    for graph_node, attrs in G.nodes(data=True):
        token_ids = tokenizer.encode(attrs["name"], add_special_tokens=False)
        for token_id in token_ids:
            augmented.add_edge(graph_node, token_id)
    undirected = augmented.to_undirected()
    return undirected

In [49]:
tokenized_graph = tokenize_graph(top_10_tree,tokenizer)

In [43]:
tokenizer.encode("ttest",add_special_tokens=False)

[10112]

In [50]:
set().add(1)

In [68]:
import scipy
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse.csgraph import dijkstra

In [52]:
from scipy.sparse import dok_matrix

In [59]:
tokenizer.vocab_size

52000

In [67]:
list(tokenized_graph.nodes())[4]

'scipy.add'

In [80]:
# Build a copy of the tree that additionally holds one node per token id,
# with an edge from every original node to each token id of its "name".
G_copy = top_10_tree.copy()
supported_input_ids=set()

# Add the token nodes (add_edge creates the token-id node on first use).
# Assumes every node carries a "name" attribute -- TODO confirm for the root.
for node_id, data in top_10_tree.nodes(data=True):
    node_token_ids = tokenizer.encode(data["name"],add_special_tokens=False)
    for nt_id in node_token_ids:
        supported_input_ids.add(nt_id)
        G_copy.add_edge(node_id,nt_id)

# From here on supported_input_ids is a list (arbitrary order) of the
# distinct token ids that were attached to the graph.
supported_input_ids = list(supported_input_ids)

# Relabel every node with a consecutive integer; the previous label is kept
# in the "old_id" node attribute.
G_int = nx.convert_node_labels_to_integers(G_copy,label_attribute="old_id")
# NOTE(review): this dict gets an entry for every node of G_int (original
# tree nodes as well as token ids), not only for the ids collected in
# supported_input_ids -- confirm that is intended given the name.
input_id_to_graph_matrix_id = {data["old_id"]:i for i,data in G_int.nodes(data=True)}

# Sparse adjacency matrix whose row/column order follows the integer labels.
G_matrix = nx.to_scipy_sparse_matrix(G_int, nodelist=list(range(G_int.order())))


In [None]:
# Hop-count distances from each token node to every node of the graph.
# Only the rows for the token ids in supported_input_ids are requested:
# input_id_to_graph_matrix_id has an entry for every graph node, so passing
# all of its values as `indices` asks dijkstra for a dense all-pairs (N x N)
# matrix, which is far too large for a graph of this size.
# Row k of the result corresponds to supported_input_ids[k]; columns are
# indexed by graph-matrix id.
input_id_distances = dijkstra(G_matrix,directed=False,unweighted=True,
                            indices=[input_id_to_graph_matrix_id[input_id]
                                     for input_id in supported_input_ids])

In [78]:
input_id_to_graph_matrix_id.values()

