In [89]:
from pycparser import c_ast, parse_file, c_parser
from anytree import Node, RenderTree, PreOrderIter
from anytree.exporter import DotExporter
import pycparser_fake_libc, anytree
import pandas as pd
import numpy as np
import networkx as nx


def get_children(root):
    """Return the direct children of an AST element as a flat list.

    Accepted inputs:
      * a ``c_ast.FileAST`` root node,
      * a ``set`` of children,
      * a ``(name, node)`` pair (tuple or list) whose second item is a
        ``c_ast.Node``.

    Any other input (for example a ``(name, str)`` pair) yields ``[]``.

    Nested lists in the children are flattened and falsy entries are dropped.
    """
    if isinstance(root, c_ast.FileAST):
        children = root.children()
    elif isinstance(root, set):
        # Must be checked before indexing: a set is not subscriptable, so
        # testing ``root[1]`` first made this branch unreachable.
        children = list(root)
    elif (isinstance(root, (tuple, list)) and len(root) > 1
            and isinstance(root[1], c_ast.Node)):
        children = root[1].children()
    else:
        children = []

    def expand(nested_list):
        """Yield the truthy items of ``nested_list``, descending into lists."""
        for item in nested_list:
            if isinstance(item, list):
                yield from expand(item)
            elif item:
                yield item

    return list(expand(children))

def get_trees(current_node, parent_node, order, sep=''):
    """Recursively mirror an AST element into an ``anytree`` tree.

    A ``Node`` named ``[order, token]`` is attached under ``parent_node`` and
    the function recurses into every child of ``current_node``.

    Parameters
    ----------
    current_node : AST element accepted by ``get_token`` / ``get_children``.
    parent_node  : the ``Node`` the new node is attached to.
    order        : string id of the new node; child ids are derived from it
                   by appending the 1-based child position.
    sep          : string inserted between the parent id and the child
                   position. The default ``''`` keeps the legacy ids.

    Note
    ----
    With ``sep=''`` the ids are ambiguous as soon as a node has ten or more
    children: child 11 of ``"1"`` and child 1 of ``"11"`` both become
    ``"111"``. Pass a non-digit separator (e.g. ``sep='.'``) to get unique
    ids. The default is unchanged because other cells look nodes up by their
    legacy id (``'Node_' + str(num)``).
    """
    token = get_token(current_node)
    children = get_children(current_node)
    node = Node([order, token], parent=parent_node, order=order)

    for position, child in enumerate(children, start=1):
        get_trees(child, node, order + sep + str(position), sep)

def get_token(node):
    """Return the label used for an AST element in the exported tree.

    ``node`` is either a ``c_ast.FileAST`` or a ``(name, payload)`` pair.

    Rules, in order:
      * ``FileAST``        -> its class name.
      * string payload     -> the string itself.
      * leaf payload       -> first entry of ``names``, else ``name``, else
                              ``value``, depending on which of these the
                              payload lists in ``attr_names``.
      * non-leaf payload   -> ``declname`` for a ``TypeDecl``; the operator
                              for nodes with an ``op`` attribute (a leading
                              ``'p'`` is stripped, e.g. ``'p++'`` -> ``'++'``).
      * otherwise, or when the chosen attribute is empty/``None``, the
        payload's class name.
    """
    if isinstance(node, c_ast.FileAST):
        return node.__class__.__name__

    payload = node[1]
    if isinstance(payload, str):
        return payload

    class_name = payload.__class__.__name__
    token = class_name if isinstance(payload, c_ast.Node) else ''
    # ``attr_names`` may be missing or empty; normalise to an empty tuple.
    attr_names = getattr(payload, 'attr_names', None) or ()

    if not get_children(node):
        # Leaf: prefer an identifying attribute over the class name.
        if 'names' in attr_names:
            if payload.names:
                token = payload.names[0]
        elif 'name' in attr_names:
            token = payload.name
        elif 'value' in attr_names:
            token = payload.value
        # Leaves exposing none of these attributes keep their class name
        # instead of raising AttributeError on a blind ``.value`` access.
    else:
        if token == 'TypeDecl':
            token = payload.declname
        if 'op' in attr_names:
            op = payload.op
            # Operators spelled with a leading 'p' (e.g. 'p++') lose the prefix.
            token = op[1:] if op.startswith('p') else op

    # ``declname`` / ``name`` can be None; fall back to the class name for
    # that case as well as for the empty string.
    if not token:
        token = class_name
    return token

def get_edges_and_nodes(root):
    """Collect every node and every parent->child edge of a tree.

    Parameters
    ----------
    root : object exposing an iterable ``children`` attribute whose items
           expose ``children`` in turn.

    Returns
    -------
    (edges, nodes)
        ``nodes`` lists the tree in pre-order (a parent before its children,
        children left to right). ``edges`` lists ``(parent, child)`` pairs in
        the order the children are visited, so ``edges[i][1] is nodes[i + 1]``.

    The traversal uses an explicit stack rather than recursion so that very
    deep trees do not hit Python's recursion limit.
    """
    edges = []
    nodes = []

    # Each entry is (parent, node); the root has no parent.
    pending = [(None, root)]
    while pending:
        parent, node = pending.pop()
        if parent is not None:
            edges.append((parent, node))
        nodes.append(node)
        # Push children right-to-left so the leftmost child is popped first.
        pending.extend((node, child) for child in reversed(tuple(node.children)))

    return edges, nodes

if __name__=="__main__":
    # Pipeline: parse the C/C++ source -> mirror the AST as an anytree tree ->
    # export edges / dot file -> build a networkx graph -> train node2vec
    # embeddings and save them.

    # Imports needed only by this section, gathered in one place.
    import os
    import warnings

    import matplotlib.pyplot as plt
    from karateclub.node_embedding.attributed import FeatherNode
    from karateclub import DeepWalk
    from node2vec import Node2Vec

    filename = '../rosetta/spam-filter/src/sw/sgd_sw.cpp'
    fake_libc_arg = "-I" + pycparser_fake_libc.directory

    ast = parse_file(filename, use_cpp=True,
            cpp_path='g++',
            cpp_args=['-E', fake_libc_arg, r'-IHLS_arbitrary_Precision_Types-master/include'])

    head = Node(["1",get_token(ast)])
    # Recursively construct AST tree. The top-level children are computed
    # once instead of on every loop iteration.
    top_level_children = get_children(ast)
    for child_order, child in enumerate(top_level_children):
        get_trees(child, head, "1"+str(child_order+1))

    # `lst` / `f` are kept as notebook-level names in case other cells use them.
    lst = list(PreOrderIter(head, filter_=lambda node: node.is_leaf))
    f = [str(i)[7:-2].split(i.separator) for i in lst]
    edges, nodes = get_edges_and_nodes(head)

    np.save('test_edges.npy', edges)

    # Make sure the output directory exists before exporting the dot file.
    dot_path = "./dotfiles/testcode.dot"
    os.makedirs(os.path.dirname(dot_path), exist_ok=True)
    DotExporter(head).to_dotfile(dot_path)
    scriptName = "testcode.cypher"

    # Node type code per tree node: 0 FuncDef, 1 For, 2 ArrayDecl, -1 other.
    type_codes = {'FuncDef': 0, 'For': 1, 'ArrayDecl': 2}
    nodeTypes = [type_codes.get(n.name[1], -1) for n in nodes]

    # Graph nodes are keyed by the tree id (name[0]). Ids built by digit
    # concatenation are not unique, and networkx silently merges nodes that
    # share a key, so surface the problem instead of hiding it.
    distinct_ids = {n.name[0] for n in nodes}
    if len(distinct_ids) != len(nodes):
        warnings.warn(
            "%d tree nodes share only %d distinct ids; colliding nodes are merged in the graph"
            % (len(nodes), len(distinct_ids)))

    G=nx.Graph()
    G.add_nodes_from(["Node_" + str(n.name[0]) for n in nodes])
    G.add_edges_from([("Node_" + str(parent.name[0]), "Node_" + str(child.name[0]))
                      for parent, child in edges])

    print(len(nodeTypes))
    print(len(nodes))
    print(G)

    # nx.draw(G)
    # plt.show()

    for edge in G.edges:
        print(edge)

    # Alternative embedding models that were tried:
    # model = FeatherNode(); model.fit(G , np.array(nodeTypes))
    # model = DeepWalk();    model.fit(G)

    # node2vec = Node2Vec(G, dimensions=20, walk_length=16, num_walks=100)
    node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=10) 
    model = node2vec.fit(window=10, min_count=1, batch_words=4,workers=10)
    model.wv.save_word2vec_format("n2vEmb.kv")

    # Optional Cypher export of the tree (disabled):
    # f = open(scriptName , "w")
    # for nod in nodes:
    #     f.write("CREATE (astnode" + str(nod.name[0]) + ": AstNode {type:'" + str(nod.name[1])+"'})\n")
    # eCount = 0
    # for edg in edges:
    #     f.write("CREATE (astnode" + str(edg[0].name[0]) + ")-[e" + str(eCount) + ":IS_CONNECTED]->(astnode" + str(edg[1].name[0]) + ")\n")
    #     eCount+=1
    # f.close()


893
893
Graph with 844 nodes and 870 edges
('Node_1', 'Node_11')
('Node_1', 'Node_12')
('Node_1', 'Node_13')
('Node_1', 'Node_14')
('Node_1', 'Node_15')
('Node_1', 'Node_16')
('Node_1', 'Node_17')
('Node_1', 'Node_18')
('Node_1', 'Node_19')
('Node_1', 'Node_110')
('Node_1', 'Node_111')
('Node_1', 'Node_112')
('Node_1', 'Node_113')
('Node_1', 'Node_114')
('Node_1', 'Node_115')
('Node_1', 'Node_116')
('Node_1', 'Node_117')
('Node_1', 'Node_118')
('Node_1', 'Node_119')
('Node_1', 'Node_120')
('Node_1', 'Node_121')
('Node_1', 'Node_122')
('Node_1', 'Node_123')
('Node_1', 'Node_124')
('Node_1', 'Node_125')
('Node_1', 'Node_126')
('Node_1', 'Node_127')
('Node_1', 'Node_128')
('Node_1', 'Node_129')
('Node_1', 'Node_130')
('Node_1', 'Node_131')
('Node_1', 'Node_132')
('Node_1', 'Node_133')
('Node_1', 'Node_134')
('Node_1', 'Node_135')
('Node_1', 'Node_136')
('Node_1', 'Node_137')
('Node_1', 'Node_138')
('Node_1', 'Node_139')
('Node_1', 'Node_140')
('Node_1', 'Node_141')
('Node_1', 'Node_142')


Computing transition probabilities: 100%|██████████| 844/844 [00:00<00:00, 4800.56it/s]
Generating walks (CPU: 3): 100%|██████████| 20/20 [00:03<00:00,  6.44it/s]]
Generating walks (CPU: 2): 100%|██████████| 20/20 [00:03<00:00,  5.01it/s]]
Generating walks (CPU: 8): 100%|██████████| 20/20 [00:03<00:00,  5.76it/s]
Generating walks (CPU: 4): 100%|██████████| 20/20 [00:03<00:00,  5.04it/s]
Generating walks (CPU: 1): 100%|██████████| 20/20 [00:04<00:00,  4.62it/s]]
Generating walks (CPU: 5): 100%|██████████| 20/20 [00:03<00:00,  5.04it/s]
Generating walks (CPU: 9): 100%|██████████| 20/20 [00:03<00:00,  5.14it/s]]
Generating walks (CPU: 6): 100%|██████████| 20/20 [00:04<00:00,  4.74it/s]
Generating walks (CPU: 7): 100%|██████████| 20/20 [00:04<00:00,  4.81it/s]
Generating walks (CPU: 10): 100%|██████████| 20/20 [00:04<00:00,  4.92it/s]


In [90]:
# Positions (shared by `nodes` and `nodeTypes`) of the entries whose type code is 1.
forIndices = [idx for idx, type_code in enumerate(nodeTypes) if type_code == 1]

In [91]:
# Display the tree nodes found at the positions collected in `forIndices`.
[nodes[i] for i in forIndices]

[Node("/['1', 'FileAST']/['1204', 'FuncDef']/['12042', 'Compound']/['120422', 'Label']/['1204221', 'For']", order='1204221'),
 Node("/['1', 'FileAST']/['1206', 'FuncDef']/['12062', 'Compound']/['120621', 'Label']/['1206211', 'For']", order='1206211'),
 Node("/['1', 'FileAST']/['1207', 'FuncDef']/['12072', 'Compound']/['120721', 'Label']/['1207211', 'For']", order='1207211'),
 Node("/['1', 'FileAST']/['1208', 'FuncDef']/['12082', 'Compound']/['120822', 'Label']/['1208221', 'For']", order='1208221'),
 Node("/['1', 'FileAST']/['1208', 'FuncDef']/['12082', 'Compound']/['120822', 'Label']/['1208221', 'For']/['12082214', 'Compound']/['120822141', 'Label']/['1208221411', 'For']", order='1208221411')]

In [98]:
# Start with empty sample (x) and label (y) lists; two distinct list objects.
x, y = [], []

In [103]:
# Label codes used for `res`:
#   0 - Nothing
#   1 - Pipeline
#   2 - DataFlow
#   3 - Unroll
#
# NOTE(review): this cell appends one (embedding, label) pair each time it is
# executed, so running it again adds another entry to `x` and `y`.
num, res = 1208221411, 0

x.append(model.wv[f'Node_{num}'])
y.append(res)

In [104]:
# Sanity check: number of collected samples on one line, their labels on the next.
print(len(x), y, sep='\n')

5
[1, 1, 1, 1, 0]


In [105]:
# Persist the collected samples and labels as NumPy arrays.
for target_path, values in (('spam_filter_x.npy', x), ('spam_filter_y.npy', y)):
    np.save(target_path , np.array(values))

In [34]:
# Display the embedding vector stored under the key 'Node_18221'.
# NOTE(review): this cell's execution count (34) is lower than that of the
# cell that trains `model` (89), so the shown vector may come from an earlier
# model — confirm by re-running after training.
model.wv['Node_18221']

array([ 1.0972737 ,  0.239924  , -0.2674956 , -0.43305728, -0.28289855,
        0.37311724,  0.3111177 , -0.93908   , -0.7645221 , -0.18918903,
       -0.2098697 , -0.50991887, -0.5978845 , -0.22295688, -0.35107625,
        0.6310047 , -0.18822399,  0.3255082 ,  0.10860312,  0.59550154,
        0.537721  ,  0.28600916,  0.48024696, -0.13482381,  0.05024524,
        0.3769798 , -0.11870631, -0.0249996 ,  0.04842938, -0.34234062,
        0.8515974 ,  0.04378412, -0.05903495, -0.6271442 , -0.42813766,
       -0.5117964 , -0.08396254, -0.22360481, -0.22526011,  0.15703787,
        0.23030362, -0.03629335,  0.19478448,  0.3331336 ,  0.5495108 ,
       -0.9124143 ,  0.07016402, -0.05810996, -0.24292845,  0.55198586,
        0.4342167 , -0.1639944 , -0.37147063,  0.3888604 ,  0.05817171,
       -0.20738555,  0.19926934, -0.31530324,  0.13890053,  0.17228802,
       -0.22062317,  0.03055854,  0.71202964,  0.6417132 ], dtype=float32)

In [106]:
# Display the trained model's vector container (`model.wv`).
model.wv

<gensim.models.keyedvectors.KeyedVectors at 0x7f8fccace970>

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

def baseline_model():
    """Build and compile the baseline classifier.

    Architecture: 64 inputs -> Dense(32, relu) -> Dense(16, relu) ->
    Dense(4, softmax). Compiled with categorical cross-entropy, the Adam
    optimizer and accuracy as the reported metric.
    """
    layers = [
        Dense(32, input_dim = 64, activation = 'relu'),  # rectified linear unit
        Dense(16, activation = 'relu'),
        Dense(4, activation = 'softmax'),                # one output per class
    ]
    classifier = Sequential(layers)
    classifier.compile(
        loss = 'categorical_crossentropy',
        optimizer = 'adam',
        metrics = ['accuracy'],
    )
    return classifier

In [15]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
# Scikit-learn compatible wrapper that builds its network with `baseline_model`.
estimator = KerasClassifier(
    build_fn = baseline_model,
    epochs = 100,
    batch_size = 10,
    verbose = 0,
)

In [17]:
# 5-fold splitter; shuffling is seeded so the folds are repeatable.
kfold = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = 25,
)

In [19]:
# Larger fully connected classifier: 64 inputs -> 128 -> 256 -> 128 -> 64 -> 4-way softmax.
nn = Sequential([
    Dense(128, input_dim = 64 , activation = 'relu'),
    Dense(256, activation = 'relu'),
    Dense(128, activation = 'relu'),
    Dense(64, activation = 'relu'),
    Dense(4, activation = 'softmax'),
])

nn.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

2023-11-30 18:47:08.233690: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2023-11-30 18:47:08.358771: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-11-30 18:47:08.359499: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1561] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1650 Ti computeCapability: 7.5
coreClock: 1.485GHz coreCount: 16 deviceMemorySize: 3.82GiB deviceMemoryBandwidth: 178.84GiB/s
2023-11-30 18:47:08.362265: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2023-11-30 18:47:08.401052: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcublas.so.10
2023-11-30 18:47:08.420170: I tensorflow/stream_executor/