In [1]:
! pip install llama-index deeplake openai python-dotenv

Collecting llama-index
  Using cached llama_index-0.10.16-py3-none-any.whl.metadata (8.8 kB)
Collecting deeplake
  Using cached deeplake-3.8.22-py3-none-any.whl
Collecting openai
  Using cached openai-1.13.3-py3-none-any.whl.metadata (18 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting llama-index-agent-openai<0.2.0,>=0.1.4 (from llama-index)
  Using cached llama_index_agent_openai-0.1.5-py3-none-any.whl.metadata (695 bytes)
Collecting llama-index-cli<0.2.0,>=0.1.2 (from llama-index)
  Using cached llama_index_cli-0.1.7-py3-none-any.whl.metadata (1.6 kB)
Collecting llama-index-core<0.11.0,>=0.10.16 (from llama-index)
  Using cached llama_index_core-0.10.16.post1-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-embeddings-openai<0.2.0,>=0.1.5 (from llama-index)
  Using cached llama_index_embeddings_openai-0.1.6-py3-none-any.whl.metadata (654 bytes)
Collecting llama-index-indices-managed-llama-cloud<0.2.0,>=0.1.2 (from 

In [4]:
! pip install llama-index-readers-github
! pip install llama-index-vector-stores-deeplake

Collecting llama-index-readers-github
  Downloading llama_index_readers_github-0.1.7-py3-none-any.whl.metadata (837 bytes)
Downloading llama_index_readers_github-0.1.7-py3-none-any.whl (20 kB)
Installing collected packages: llama-index-readers-github
Successfully installed llama-index-readers-github-0.1.7


In [1]:
from dotenv import load_dotenv
import os

# Load environment variables
load_dotenv()

# Fetch and set API keys
openai_api_key = os.getenv("OPENAI_API_KEY")
active_loop_token = os.getenv("ACTIVELOOP_TOKEN")
dataset_path = os.getenv("DATASET_PATH")

## Define helper function

In [2]:
import textwrap
from dotenv import load_dotenv
from llama_index.core import download_loader
# Llama hub is a platform that aggregates custom plugins for all data types
from llama_index.readers.github import GithubRepositoryReader, GithubClient
from llama_index.vector_stores.deeplake import DeepLakeVectorStore
from llama_index.core import VectorStoreIndex, StorageContext
import re

def parse_github_url(url):
    pattern = r"https://github\.com/([^/]+)/([^/]+)"
    match = re.match(pattern, url)
    return match.groups() if match else (None, None)


def validate_owner_repo(owner, repo):
    return bool(owner) and bool(repo)


def initialize_github_client():
    github_token = os.getenv("GITHUB_TOKEN")
    return GithubClient(github_token)

## Check for authorization

In [3]:
# Check for OpenAI API key
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
    raise EnvironmentError("OpenAI API key not found in environment variables")

# Check for GitHub Token
github_token = os.getenv("GITHUB_TOKEN")
if not github_token:
    raise EnvironmentError("GitHub token not found in environment variables")

# Check for Activeloop Token
active_loop_token = os.getenv("ACTIVELOOP_TOKEN")
if not active_loop_token:
    raise EnvironmentError("Activeloop token not found in environment variables")

## Upload Github data to Vectorstore

In [4]:
github_client = initialize_github_client()
download_loader("GithubRepositoryReader")

github_url = input("Please enter the GitHub repository URL: ")
owner, repo = parse_github_url(github_url)

while True:
    owner, repo = parse_github_url(github_url)
    if validate_owner_repo(owner, repo):
        loader = GithubRepositoryReader(
            github_client,
            owner=owner,
            repo=repo,
            filter_file_extensions=(
                [".py", ".js", ".ts", ".md"],
                GithubRepositoryReader.FilterType.INCLUDE,
            ),
            verbose=False,
            concurrent_requests=5,
        )
        print(f"Loading {repo} repository by {owner}")
        docs = loader.load_data(branch="main")
        print("Documents uploaded:")
        for doc in docs:
            print(doc.metadata)
        break  # Exit the loop once the valid URL is processed
    else:
        print("Invalid GitHub URL. Please try again.")
        github_url = input("Please enter the GitHub repository URL: ")

print("Uploading to vector store...")

  download_loader("GithubRepositoryReader")




## Create query engine

In [26]:
vector_store = None

In [27]:
# ====== Create vector store and upload data ======
if vector_store is None:
    vector_store = DeepLakeVectorStore(
        dataset_path=dataset_path,
        overwrite=True,
        runtime={"tensor_db": True},
    )

storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(docs, storage_context=storage_context)
query_engine = index.as_query_engine()

Your Deep Lake dataset has been successfully created!


 

Uploading data to deeplake dataset.


100%|██████████| 34/34 [00:03<00:00,  9.25it/s]
-

Dataset(path='hub://akshatsingh1718/LlamaIndex_intro', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype      shape      dtype  compression
  -------    -------    -------    -------  ------- 
   text       text      (34, 1)      str     None   
 metadata     json      (34, 1)      str     None   
 embedding  embedding  (34, 1536)  float32   None   
    id        text      (34, 1)      str     None   


 

## Run a sanity check question

In [31]:
# Include a simple question to test.
def ask(que):
    print(f"Test question: {que}")
    print("=" * 50)
    answer = query_engine.query(que)
    print(f"Answer: {textwrap.fill(str(answer), 50)} \n")


ask("What is the repository about?")

Test question: What is the repository about?
Answer: The repository is about a tool designed for
transforming Sale, Purchase, and stock Excel
files. 



In [32]:
while True:
    user_question = input("Please enter your question (or type 'exit' to quit): ")
    if user_question.lower() == "exit":
        print("Exiting, thanks for chatting!")
        break

    ask(user_question)

Test question: what the framework used in the application ?
Answer: Django 

Test question: What are all the utilities defined ?
Answer: check_for_stock, check_for_purchase,
check_for_sale, check_for_gst, check_for_jjonly,
check_for_gst2b, check_for_echs 

Test question: show me the config used for sale
Answer: The configuration for sale can be found in the
Sale model within the Django admin interface. 

Test question: what is the cofig passed in TransformExcelGST ?
Answer: The config passed in TransformExcelGST includes
the following parameters: - output_columns -
gst_data - default_output_row - columns_to_sum -
target_columns_index - file_save_dir -
filename_prefix - perfix_for_totals -
party_gst_index 

Test question: 


ValueError: Either an `embedding`, `embedding_function`, `filter`, or `query` must be specified.

In [33]:
ask("What is the configs to be passed for GST Transformation")

Test question: What is the configs to be passed for GST Transformation
Answer: The configs to be passed for GST Transformation
are: - output_columns - gst_data -
default_output_row - columns_to_sum -
target_columns_index - perfix_for_totals -
party_gst_index - column_to_idx 

