In [1]:
from tree_sitter import Language, Parser
from query_pattern import JAVA_QUERY

# Configure a tree-sitter parser with the Java grammar from the pre-built shared library.
parser = Parser()
parser.set_language(
    Language(
        '/home/zixian/PycharmProjects/semantic_graph_code_code_clone/data/my-languages.so',
        'java',
    )
)

# Java source file to analyse.
file_path = '/home/zixian/PycharmProjects/semantic_graph_code_code_clone/benchmark/dataset/BCB_tailor/bigclonebench/21386162.java'

# Keep the source as raw bytes: later cells slice it with node byte offsets.
with open(file_path, 'rb') as file:
    code = file.read()

# Parsing only needs the in-memory bytes, so it can happen after the file is closed.
code_ast = parser.parse(code)

# Root of the parsed tree; later cells run queries against it.
root_node = code_ast.root_node

# Query helper from the local query_pattern module.
query = JAVA_QUERY()

# _method = query.class_method_query().captures(root_node)[0][0]

# for child in _method.children:
#     print('------------------')
#     root_token = code[child.start_byte:child.end_byte]
#     print(child.type)
#     print(root_token)


# print(_method.type)
# root_token = code[_method.start_byte:_method.end_byte]
# print(root_token)

In [4]:
import uuid
from collections import deque

import tree_sitter
from treelib import Tree

from ast_node import ASTNode

def generate_ast_key(file_name: str, func_name: str, parsed_node: tree_sitter.Node) -> str:
    """Derive a deterministic identifier for one tree-sitter node.

    attributes:
        file_name -- name of the file that contains the node
        func_name -- name of the function that contains the node
        parsed_node -- tree_sitter Node to identify; its type, start_byte and end_byte are read

    returns:
        key_value -- 32-character hexadecimal string (a name-based UUID without dashes)
    """
    # The node is identified by where it lives (file, function) plus its type and byte span.
    key_parts = (
        file_name,
        func_name,
        parsed_node.type,
        str(parsed_node.start_byte),
        str(parsed_node.end_byte),
    )
    # uuid3 is name-based, so equal inputs always map to the same key.
    return uuid.uuid3(uuid.NAMESPACE_DNS, '-'.join(key_parts)).hex

class Queue():
    """First-in, first-out queue backed by collections.deque.
    """
    def __init__(self) -> None:
        # deque removes from the front in O(1); list.pop(0) shifts every remaining item.
        self.__items = deque()

    def is_empty(self):
        """Return True when the queue holds no items."""
        return len(self.__items) == 0

    def push(self, data):
        """Append data to the back of the queue."""
        self.__items.append(data)

    def pop(self):
        """Remove and return the item at the front of the queue.

        Returns False when the queue is empty, so callers that may enqueue
        falsy values should check is_empty() first.
        """
        if self.is_empty():
            return False

        return self.__items.popleft()

def build_func_sast(file_name: str, func_name: str, func_tree : tree_sitter.Node, src_code : bytes, exclude_type: list) -> Tree:
    """Build simplified AST (sast) with function as the basic unit

    Walks func_tree breadth-first and copies each node into a treelib Tree
    of ASTNode objects. A child whose type is listed in exclude_type is
    skipped together with everything below it. Nodes without children carry
    their decoded source text as token; nodes with children carry an empty
    token. The node captured by the method return query is finally
    re-attached directly under the root as a node whose ASTNode type is
    'ret_type' and whose token is its full source text.

    attributes:
        file_name -- name of the file including current function
        func_name -- name of the function, used for key generation
        func_tree -- function ast generated by tree-sitter
        src_code -- serial source code (bytes) for token querying
        exclude_type -- node types ignored

    returns:
        s_ast -- simplified ast organized by ASTNode

    raises:
        ValueError -- the return node was not added to the tree during the walk
    """
    s_ast = Tree()

    # create root node for this function
    root_node = func_tree
    root_key = generate_ast_key(file_name, func_name, root_node)
    if len(root_node.children) > 0:
        root_token = ''
    else:
        # Decode so the root token is str, like every other token stored in the tree.
        root_token = src_code[root_node.start_byte:root_node.end_byte].decode('utf8')
    root_ast = ASTNode(root_key, root_node.type, root_token, root_node.start_byte, root_node.end_byte)
    s_ast.create_node(tag=root_node.type, identifier=root_key, data=root_ast)

    query = JAVA_QUERY()
    # NOTE(review): the indexing assumes the query captured at least one node — confirm for all inputs.
    ret_node = query.method_ret_query().captures(root_node)[0][0]

    # create ret node for each ast
    ret_key = generate_ast_key(file_name, func_name, ret_node)
    ret_token = src_code[ret_node.start_byte:ret_node.end_byte].decode('utf8')
    ret_astnode = ASTNode(ret_key, 'ret_type', ret_token, ret_node.start_byte, ret_node.end_byte)

    # Breadth-first copy of the tree-sitter nodes into s_ast.
    queue = Queue()
    queue.push(root_node)
    while not queue.is_empty():
        current_node = queue.pop()
        # The parent key depends only on current_node, so compute it once per parent.
        parent_identifier = generate_ast_key(file_name, func_name, current_node)

        for child in current_node.children:
            child_type = str(child.type)
            if child_type in exclude_type:
                # Not enqueued either, so the whole subtree below it is dropped.
                continue
            child_key = generate_ast_key(file_name, func_name, child)
            child_token = ''
            has_child = len(child.children) > 0
            if not has_child:
                child_token = src_code[child.start_byte:child.end_byte].decode('utf8')
            s_ast.create_node(tag=child_type, identifier=child_key, parent=parent_identifier, data=ASTNode(child_key, child_type, child_token, child.start_byte, child.end_byte))

            queue.push(child)

    # Raise a diagnosable error instead of calling exit(), which would end the whole process/kernel.
    if s_ast.get_node(ret_key) is None:
        raise ValueError(
            f"return node of '{func_name}' in '{file_name}' was not added to the simplified AST; "
            "its type or the type of one of its ancestors may be listed in exclude_type"
        )

    # remove return parameter node, and place it as return node.
    s_ast.remove_node(ret_key)
    s_ast.create_node(tag=ret_node.type, identifier=ret_key, parent=root_key, data=ret_astnode)

    return s_ast


# Node types left out of the simplified AST.
exclude_type = [
    ",", "{", ";", "}", ")", "(",
    '"', "'", "`", "", " ",
    "[]", "[", "]", ":", ".",
    "''", "'.'", "b", "\\", "'['", "']", "''",
    "comment", "@", "?",
]

# First node captured by the class-method query on the parsed file.
_method = query.class_method_query().captures(root_node)[0][0]

# The first capture of the declaration query is sliced out of the raw bytes as the method name.
_m_name_tmp = query.method_declaration_query().captures(_method)
_m_name = code[_m_name_tmp[0][0].start_byte:_m_name_tmp[0][0].end_byte].decode('utf8')

# NOTE(review): "11.java" differs from the file loaded earlier; it only feeds key generation — confirm it is intended.
sast = build_func_sast(
    file_name="11.java",
    func_name=_m_name,
    func_tree=_method,
    src_code=code,
    exclude_type=exclude_type,
)



def print_tree_relationships(tree, node_identifier, level=0):
    """Print the subtree rooted at node_identifier as an indented outline.

    attributes:
        tree -- tree exposing get_node(identifier) and children(identifier)
        node_identifier -- identifier of the node to start from
        level -- indentation depth used for the starting node

    Each node produces two lines: its type and identifier, then its token.
    Children are printed after their parent, one level deeper.
    """
    # Explicit stack of (identifier, depth); yields the same pre-order as a recursive walk.
    pending = [(node_identifier, level)]
    while pending:
        current_id, depth = pending.pop()
        node = tree.get_node(current_id)

        # Two spaces of indentation per depth level.
        pad = "  " * depth
        print(f"{pad}- Node Type: {node.data.node_type}, Identifier: {node.identifier}")
        print(f"{pad}  Node Token: {node.data.node_token}")

        # Push children reversed so the first child is popped (and printed) first.
        for child in reversed(list(tree.children(current_id))):
            pending.append((child.identifier, depth + 1))

# Render the simplified AST as an indented outline, beginning at its root.
print()
print("Tree Relationships (Parent-Child Structure):")
root_identifier = sast.root
print_tree_relationships(sast, root_identifier)



Tree Relationships (Parent-Child Structure):
- Node Type: method_declaration, Identifier: 27cbd332d2383f14bff63d4aec3f74e9
  Node Token: 
  - Node Type: modifiers, Identifier: fb778dd000fb3deb868a65ccf34d2f23
    Node Token: 
    - Node Type: public, Identifier: 436d5091d3ab3c8e9ea40a2c37338d8c
      Node Token: public
  - Node Type: identifier, Identifier: 06f00bb006733e2aa47d5fa26bdb8018
    Node Token: writeToFile
  - Node Type: formal_parameters, Identifier: 99e805baf24539cf9ec708e03a9fa005
    Node Token: 
    - Node Type: formal_parameter, Identifier: a33198318d4635f6a1fe23bc91496dba
      Node Token: 
      - Node Type: type_identifier, Identifier: d8c5f94a6b8a3227b7c7a917074f36ba
        Node Token: File
      - Node Type: identifier, Identifier: 840040bff93337ab99a3ea550a3368a5
        Node Token: file
    - Node Type: formal_parameter, Identifier: 6c86c018685e381f9f098bea9c292e09
      Node Token: 
      - Node Type: type_identifier, Identifier: 6d068e87e22f3edfbd247b14df223