In [1]:
# This is due to the fact that we use asyncio.loop_until_complete in
# the GithubRepositoryReader. Since the Jupyter kernel itself runs on
# an event loop, we need to add some help with nesting
!pip install nest_asyncio
import nest_asyncio

nest_asyncio.apply()




In [None]:
%env OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
from gpt_index import (
    GPTSimpleVectorIndex,
    GPTQdrantIndex,
    GPTTreeIndex,
    GPTFaissIndex,
    GPTWeaviateIndex,
    GPTListIndex,
    GPTSimpleKeywordTableIndex,
    GPTKeywordTableIndex,
    GPTPineconeIndex,
    GPTRAKEKeywordTableIndex,
    GPTSQLStructStoreIndex,
    GithubRepositoryReader,
)
from IPython.display import Markdown, display
import os

In [None]:
%env GITHUB_TOKEN=github_pat_xxxxxxxxxxxxxxxxxx_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
github_token = os.environ.get("GITHUB_TOKEN")
owner = "jerryjliu"
repo = "gpt_index"
branch = "main"
reader = GithubRepositoryReader(
    github_token=github_token,
    owner=owner,
    repo=repo,
    use_parser=True,
    verbose=True,
    # ignore_directories=["examples", "docs", ".vscode"],
    directories_to_include=["gpt_index"],
    ignore_file_extensions=[
        ".png",
        ".jpg",
        ".jpeg",
        ".gif",
        ".svg",
        ".ico",
        ".json",
        ".csv",
    ],
    concurrent_requests=5,
)

In [5]:
documents = reader.load_data(branch=branch)
print(f"Loaded {len(documents)} documents")

current path: 
processing tree 76dce9173ad45f67844d7b36e68d2bbba2d6cb0e
found blob .flake8
recursing into .github
ignoring tree .github due to directory
found blob .gitignore
found blob .readthedocs.yaml
recursing into .vscode
ignoring tree .vscode due to directory
found blob CITATION.cff
found blob CONTRIBUTING.md
found blob LICENSE
found blob MANIFEST.in
found blob Makefile
found blob README.md
found blob data_requirements.txt
recursing into docs
ignoring tree docs due to directory
recursing into examples
ignoring tree examples due to directory
recursing into experimental
ignoring tree experimental due to directory
recursing into gpt_index
	current path: gpt_index
	processing tree 798979bcad0f6f89adc63e91060b9c406b0e85d9
	found blob VERSION
	found blob __init__.py
	recursing into composability
	ignoring tree composability due to directory
	found blob constants.py
	recursing into data_structs
	ignoring tree data_structs due to directory
	found blob docstore.py
	recursing into embeddin

In [41]:
from gpt_index import Document
print(f"Indexing {len(documents)} documents")
new_documents: list[list[Document]] = []
batch_size = 3
for x, doc in enumerate(documents):
    if x % batch_size == 0:
        new_documents.append([])
    new_documents[-1].append(doc)
    
indexes = [ GPTTreeIndex(ndocs) for ndocs in new_documents[:4] ]

i = 1
indexes_len = len(indexes)
for index in indexes:
    files = ''
    for _, doc in index.docstore.docs.items():
        if doc and doc.extra_info and 'file_name' in doc.extra_info:
            files += str(doc.extra_info['file_name']) + ', '

    summary = index.query(
        f"What are the summaries of the codes in the following files {files}?", mode="summarize" 
    )
    index.set_text(str(summary))
    print(f"{i}/{indexes_len} -- Index document id: {index.get_doc_id()} -- Done!")
    print(f"{files}")
    print(f"Index summary text: {str(summary)}")
    i += 1
    # print(f"Index document text: {str(summary)}")
    # input("Press enter to continue")
    

INFO:root:> Building index from nodes: 0 chunks


Indexing 196 documents


INFO:root:> [build_index_from_documents] Total LLM token usage: 778 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 0 tokens
INFO:root:> Building index from nodes: 0 chunks
INFO:root:> [build_index_from_documents] Total LLM token usage: 847 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 0 tokens
INFO:root:> Building index from nodes: 0 chunks
INFO:root:> [build_index_from_documents] Total LLM token usage: 1194 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 0 tokens
INFO:root:> Building index from nodes: 0 chunks
INFO:root:> [build_index_from_documents] Total LLM token usage: 1025 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 0 tokens
INFO:root:> Starting query: What are the summaries of the codes in the following files .flake8, build_package.yml, ?
INFO:root:> Building index from nodes: 0 chunks
INFO:root:> [query] Total LLM token usage: 1108 tokens
INFO:root:> [query

1/4 -- Index document id: 55775b74-3c8d-42f3-af09-febebdf2af38 -- Done!
.flake8, build_package.yml, 
Index summary text: 
The .flake8 file contains settings for the flake8 linter, such as excluding certain directories and setting the max-line-length to 88. The build_package.yml file is a GitHub workflow that sets up Python 3.9, installs dependencies, builds the package, and then runs an import test.


INFO:root:> [query] Total LLM token usage: 1127 tokens
INFO:root:> [query] Total embedding token usage: 0 tokens
INFO:root:> Starting query: What are the summaries of the codes in the following files .gitignore, .readthedocs.yaml, ?
INFO:root:> Building index from nodes: 0 chunks


2/4 -- Index document id: 617a77a4-7f2b-44ed-b7cf-e00e42a86510 -- Done!
lint.yml, unit_test.yml, 
Index summary text: 
lint.yml: Sets up Python 3.9, installs dependencies, and runs the linter to check for errors.

unit_test.yml: Sets up Python 3.9 and 3.8, installs dependencies, and runs the unit tests to check for errors.


INFO:root:> [query] Total LLM token usage: 2095 tokens
INFO:root:> [query] Total embedding token usage: 0 tokens
INFO:root:> Starting query: What are the summaries of the codes in the following files CITATION.cff, CONTRIBUTING.md, ?
INFO:root:> Building index from nodes: 0 chunks


3/4 -- Index document id: 35b41143-b640-4c9a-a9de-12f96e319f90 -- Done!
.gitignore, .readthedocs.yaml, 
Index summary text: 
The .gitignore file contains entries for files and directories that should be ignored by version control systems, such as .DS_Store, __pycache__, *.py[cod], *.so, .Python, bin/, build/, develop-eggs/, dist/, downloads/, eggs/, .eggs/, etc/, include/, lib/, lib64/, parts/, sdist/, share/, var/, wheels/, pip-wheel-metadata/, share/python-wheels/, *.egg-info/, .installed.cfg, *.egg, MANIFEST, *.manifest, *.spec, pip-log.txt, pip-delete-this-directory.txt, htmlcov/, .tox/, .nox/, .coverage, .coverage.*, .cache, nosetests.xml, coverage.xml, *.cover, *.py,cover, .hypothesis/, .pytest_cache/, *.mo, *.pot, *.log, local_settings.py, db.sqlite3, db.sqlite3-journal, instance/, .webassets-cache, .scrapy, docs/_build/, target/, .ipynb_checkpoints, notebooks/, profile_default/, ip.

The .


INFO:root:> [query] Total LLM token usage: 1467 tokens
INFO:root:> [query] Total embedding token usage: 0 tokens


4/4 -- Index document id: 4f406aa5-4603-4437-854e-af664c559efe -- Done!
CITATION.cff, CONTRIBUTING.md, 
Index summary text: 
CITATION.cff: This file encourages users to cite GPT Index when using the software, and provides the authors, title, DOI, date released, and URL of the software.

CONTRIBUTING.md: This file provides instructions on how to contribute to GPT Index, including setting up the environment, formatting/linting changes, testing changes, creating an example notebook, and opening a pull request against the main GPT Index repo.


In [42]:
print(f"Built {len(indexes)} indexes")
keyword_table = GPTSimpleKeywordTableIndex(indexes, max_keywords_per_chunk=50)

from gpt_index.composability import ComposableGraph

graph = ComposableGraph.build_from_index(keyword_table)

query_configs = [
    {
        "index_struct_type": "simple_dict",
        "query_mode": "default",
        "query_kwargs": {
            "similarity_top_k": 1
        }
    },
    {
        "index_struct_type": "keyword_table",
        "query_mode": "simple",
        "query_kwargs": {}
    },
]

response = graph.query("How can I contribute to this open source GitHub repository?", query_configs=query_configs)
display(Markdown(f"**Query:** What is the responsibility of ComposableGraph class?"))
display(Markdown(f"**Response:** {response}"))

INFO:root:> [build_index_from_documents] Total LLM token usage: 0 tokens
INFO:root:> [build_index_from_documents] Total embedding token usage: 0 tokens
INFO:root:> Starting query: How can I contribute to this open source GitHub repository?
INFO:root:query keywords: ['repository', 'github', 'contribute', 'open', 'source']
INFO:root:Extracted keywords: ['github', 'contribute']
INFO:root:> Starting query: How can I contribute to this open source GitHub repository?


Built 4 indexes


INFO:root:> [query] Total LLM token usage: 284 tokens
INFO:root:> [query] Total embedding token usage: 0 tokens
INFO:root:> Starting query: How can I contribute to this open source GitHub repository?
INFO:root:> [query] Total LLM token usage: 443 tokens
INFO:root:> [query] Total embedding token usage: 0 tokens
INFO:root:> [query] Total LLM token usage: 1191 tokens
INFO:root:> [query] Total embedding token usage: 0 tokens


**Query:** What is the responsibility of ComposableGraph class?

**Response:** 

ANSWER: The build_package.yml file is a GitHub workflow that builds a package on its own without additional pip install. It sets up Python 3.9, installs dependencies, builds the package, and then runs an import test. Additionally, environment setup requires forking the repo and creating a Python virtual environment. Formatting/Linting and Testing can be done with the commands `make format; make lint` and `pytest tests` respectively. For new features, an example Jupyter notebook can be added to the `examples` folder. Finally, instructions to open a pull request against the main GPT Index repo can be found.

In [None]:
from typing import List
from gpt_index.indices.base import BaseGPTIndex
from gpt_index.langchain_helpers.chain_wrapper import LLMPredictor
from gpt_index.indices.prompt_helper import PromptHelper
from langchain import OpenAI



llm_predictor = LLMPredictor(
    llm=OpenAI(temperature=0.25, model_name="text-davinci-003", request_timeout=60)
)

# define prompt helper
# set maximum input size
max_input_size = int(4096)
# set number of output tokens
num_output = int(256)
# set maximum chunk overlap
max_chunk_overlap = int(20)
prompt_helper = PromptHelper(max_input_size, num_output, max_chunk_overlap)

for doc in documents:
    print(f"{doc.get_doc_id()}: {doc.extra_info}")

print(f"Loaded {len(documents)} documents")


def construct_index(index_class, *args, **kwargs):
    try:
        print(f"Constructing {index_class.__name__}...")
        index = index_class(documents, *args, **kwargs)
        print(f"Saving {index_class.__name__} to disk...")
        index.save_to_disk(f"{index_class.__name__}.json")
        print(f"Done constructing {index_class.__name__}")
        return index
    except Exception as e:
        print(e)
        return None

indexClasses: List[BaseGPTIndex] = [
    GPTSimpleVectorIndex,
    # GPTQdrantIndex,
    # GPTTreeIndex,
    # GPTFaissIndex,
    # GPTWeaviateIndex,
    # GPTListIndex,
    GPTSimpleKeywordTableIndex,
    GPTKeywordTableIndex,
    # GPTPineconeIndex,
    GPTRAKEKeywordTableIndex,
    # GPTSQLStructStoreIndex,
]
indexes = {
    indexClass.__name__: construct_index(
        indexClass,
        # llm_predictor=llm_predictor,
        # prompt_helper=prompt_helper,
    )
    for indexClass in indexClasses
}

indexes = {k: v for k, v in indexes.items() if v is not None}

In [None]:
from gpt_index.indices.base import BaseGPTIndex

indexName: str
index: BaseGPTIndex
for indexName, index in indexes.items():
    print(f"Index: {index}")
    response = index.query(
        "What is the difference between `GPTTreeIndex` and `GPTListIndex` classes?",
    )
    display(Markdown(f"<h2>{indexName}</h2> response:<br><p>{response}</p>"))
    input("Press enter to continue...")