# Code Hierarchy Node Parser

In [None]:
#%pip install llama-index-packs-code-hierarchy llama-index

In [2]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.text_splitter import CodeSplitter
from llama_index.llms.openai import OpenAI
from llama_index.packs.code_hierarchy import CodeHierarchyNodeParser
from llama_index.packs.code_hierarchy import CodeHierarchyAgentPack
from pathlib import Path

In [8]:
documents = SimpleDirectoryReader(
    input_files=[Path("C:\Dev\POC\.venv\Lib\site-packages\pypdf\pagerange.py")],
    file_metadata=lambda x: {"filepath": x},
).load_data()


split_nodes = CodeHierarchyNodeParser(
    language="python",
    # You can further parameterize the CodeSplitter to split the code
    # into "chunks" that match your context window size using
    # chunck_lines and max_chars parameters, here we just use the defaults
    code_splitter=CodeSplitter(language="python", max_chars=1000, chunk_lines=10),
).get_nodes_from_documents(documents)

In [9]:
documents

[Document(id_='dd793fd2-0f9f-4291-bd14-a66d3abef34d', embedding=None, metadata={'filepath': 'C:\\Dev\\POC\\.venv\\Lib\\site-packages\\pypdf\\pagerange.py'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='"""\nRepresentation and utils for ranges of PDF file pages.\n\nCopyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.\nAll rights reserved. This software is available under a BSD license;\nsee https://github.com/py-pdf/pypdf/blob/main/LICENSE\n"""\n\nimport re\nfrom typing import Any, List, Tuple, Union\n\nfrom .errors import ParseError\n\n_INT_RE = r"(0|-?[1-9]\\d*)"  # A decimal int, don\'t allow "-0".\nPAGE_RANGE_RE = f"^({_INT_RE}|({_INT_RE}?(:{_INT_RE}?(:{_INT_RE}?)?)))$"\n# groups:         12     34     5 6     7 8\n\n\nclass PageRange:

In [10]:
split_nodes

[TextNode(id_='152c4556-8642-48bd-bc08-32a95ce64d97', embedding=None, metadata={'language': 'python', 'inclusive_scopes': [], 'start_byte': 0, 'end_byte': 6880, 'filepath': 'C:\\Dev\\POC\\.venv\\Lib\\site-packages\\pypdf\\pagerange.py'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='152c4556-8642-48bd-bc08-32a95ce64d97', node_type=<ObjectType.TEXT: '1'>, metadata={'language': 'python', 'inclusive_scopes': [], 'start_byte': 0, 'end_byte': 6880, 'filepath': 'C:\\Dev\\POC\\.venv\\Lib\\site-packages\\pypdf\\pagerange.py'}, hash='0663bcdbc3b00129f4ade7223b2ebc7d664eb82c34baa8b69221bda7e727cebd'), <NodeRelationship.CHILD: '5'>: [RelatedNodeInfo(node_id='a4d522d2-982f-4cc6-ba4b-8870ac8ed52c', node_type=<ObjectType.TEXT: '1'>, metadata={'inclusive_scopes': [{'name': 'PageRange', 'type': 'class_definition', 'signature': 'class PageRange:'}], 'start_byte': 523, 'end_byte': 5484}, hash='4e7a3d7cf007226c1c884

In [11]:
print(CodeHierarchyNodeParser.get_code_hierarchy_from_nodes(split_nodes))

(defaultdict(<class 'dict'>, {'C:\\Dev\\POC\\': defaultdict(<class 'dict'>, {'PageRange': defaultdict(<class 'dict'>, {'__init__': defaultdict(<class 'dict'>, {}), 'valid': defaultdict(<class 'dict'>, {}), 'to_slice': defaultdict(<class 'dict'>, {}), '__str__': defaultdict(<class 'dict'>, {}), '__repr__': defaultdict(<class 'dict'>, {}), 'indices': defaultdict(<class 'dict'>, {}), '__eq__': defaultdict(<class 'dict'>, {}), '__add__': defaultdict(<class 'dict'>, {})}), 'parse_filename_page_ranges': defaultdict(<class 'dict'>, {})})}), '- C:\\Dev\\POC\\\n  - PageRange\n    - __init__\n    - valid\n    - to_slice\n    - __str__\n    - __repr__\n    - indices\n    - __eq__\n    - __add__\n  - parse_filename_page_ranges\n')


In [12]:
split_nodes_by_id = {n.node_id: n for n in split_nodes}
uuid_from_text = split_nodes[9].text.splitlines()[-1].split(" ")[-1]
print("Going to print the node with UUID:", uuid_from_text)
#print_python(split_nodes_by_id[uuid_from_text].text)

Going to print the node with UUID: self._slice


In [15]:
from IPython.display import Markdown, display


def print_python(python_text):
    """This function prints python text in ipynb nicely formatted."""
    display(Markdown("```python\n" + python_text + "```"))

In [14]:
from llama_index.packs.code_hierarchy import CodeHierarchyKeywordQueryEngine

query_engine = CodeHierarchyKeywordQueryEngine(
    nodes=split_nodes,
)

In [18]:
print_python(query_engine.query(split_nodes[0].node_id).response)

```python
"""
Representation and utils for ranges of PDF file pages.

Copyright (c) 2014, Steve Witham <switham_github@mac-guyver.com>.
All rights reserved. This software is available under a BSD license;
see https://github.com/py-pdf/pypdf/blob/main/LICENSE
"""

import re
from typing import Any, List, Tuple, Union

from .errors import ParseError

_INT_RE = r"(0|-?[1-9]\d*)"  # A decimal int, don't allow "-0".
PAGE_RANGE_RE = f"^({_INT_RE}|({_INT_RE}?(:{_INT_RE}?(:{_INT_RE}?)?)))$"
# groups:         12     34     5 6     7 8


class PageRange:
    # Code replaced for brevity. See node_id a4d522d2-982f-4cc6-ba4b-8870ac8ed52c


PAGE_RANGE_ALL = PageRange(":")  # The range of all pages.


def parse_filename_page_ranges(
    args: List[Union[str, PageRange, None]]
) -> List[Tuple[str, PageRange]]:
    # Code replaced for brevity. See node_id 446ebec5-d6fa-4cd6-996d-58da053ef58c


PageRangeSpec = Union[str, PageRange, Tuple[int, int], Tuple[int, int, int], List[int]]```

In [21]:
from llama_index.core.tools import QueryEngineTool

tool = QueryEngineTool.from_defaults(
    query_engine=query_engine,
    name="code_lookup",
    description="Useful for looking up information about the code hierarchy codebase.",
)

In [22]:
display(Markdown("Description: " + query_engine.get_tool_instructions()))

Description: Search the tool by any element in this list to get more information about that element.
If you see 'Code replaced for brevity' then a uuid, you may also search the tool with that uuid to see the full code.
You may need to use the tool multiple times to fully answer the user message.
The list is:
- C:\Dev\POC\
  - PageRange
    - __init__
    - valid
    - to_slice
    - __str__
    - __repr__
    - indices
    - __eq__
    - __add__
  - parse_filename_page_ranges

