Notebook for development purposes

In [39]:
#Manage Imports

#llamaindex stuff
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.agent import ReActAgent
from llama_index.core.schema import IndexNode
from llama_index.core import (
    Settings,
    Document,
    SimpleDirectoryReader,
    VectorStoreIndex,
    SummaryIndex,
)
from llama_index.llms.groq import Groq
#Document Readers
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub

#Initialize LLM and Embedding Model

#Using Groq Temporarily due to low computer ram.
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if any) into the process environment
# so that GROQ_API_KEY can be read below.
load_dotenv()

# Fail fast with a clear message: os.getenv returns None when the key is
# missing, and passing None on would only surface later as an opaque
# authentication error on the first LLM request.
if not os.getenv("GROQ_API_KEY"):
    raise RuntimeError(
        "GROQ_API_KEY is not set. Add it to your environment or to a .env file "
        "before running this notebook."
    )

llm = Groq(model="llama3-groq-70b-8192-tool-use-preview", api_key=(os.getenv("GROQ_API_KEY")))
#llm = Ollama(model="llama3.2:latest", temperature=0)
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

#Settings the models to be used
# Registering the models on the global Settings object makes them the defaults
# for every index and query engine built later in the notebook.
Settings.llm = llm
Settings.embed_model = embed_model
#Settings for the chunk size and overlap for efficient embedding at VectorStoreIndex
Settings.chunk_size = 512
Settings.chunk_overlap = 30


# Notebook-wide switch: when True, later cells print progress messages and
# build the agent / query engine with verbose=True.
debug = True

Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
2 prompts are loaded, with the keys: ['query', 'text']


In [46]:
#Document Processing
# Prompts for a file inside the local `documents/` directory and loads it into
# `full_text`, a list of Document objects consumed by the indexing cell.
# EPUB files are parsed with ebooklib + BeautifulSoup; every other file type
# is delegated to SimpleDirectoryReader.
file_name = input(" Enter the (enter) file name inside the document directory: ")
document = 'documents/' + file_name

#Ebooks(Text)

if file_name.lower().endswith('.epub'):
    if debug:
        print("Loading Ebook through Epub Reader.")
    book = epub.read_epub(document)

    # A missing Dublin Core title is used as the "load failed" signal.
    # The failure message is reported regardless of `debug`; the success
    # message is debug-only. (Previously the failure message was also printed
    # on every successful load whenever debug was False.)
    title_metadata = book.get_metadata('DC', 'title')
    if not title_metadata:
        print("Loading Unssuccessful\n\n")
    elif debug:
        print(f"Loaded Ebook of Title : {title_metadata[0][0]}\n\n")

    #Initialize list to store all the text
    all_text = []
    for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
        #Parse it with soup and keep only the visible text of each XHTML item
        soup = BeautifulSoup(item.get_content(), 'html.parser')
        text = soup.get_text()

        all_text.append(text)

    #Creating Document Object so it can be embedded
    full_text = [Document(text=t) for t in all_text]
    if not full_text:
        print("Convertion Unssuccessful\n\n")
    elif debug:
        print("Converted Document Object\n\n")

#More efficient document parsing modules can be added in the future

#General Document Loading
else:
    if debug:
        print('Loading Document Object through SimpleDirectoryReader.')
    full_text = SimpleDirectoryReader(input_files=[document]).load_data()
    # Guard against an empty result before touching full_text[0]; as above,
    # failure is always reported while the success message is debug-only.
    if not full_text or not full_text[0].metadata:
        print("Loading Unssuccessful\n\n")
    elif debug:
        print(f"Loaded Document of Title : {full_text[0].metadata['file_name']}\n\n")

Loading Document Object through SimpleDirectoryReader.
Loaded Document of Title : Chapter5.4_Covariance.pdf




In [None]:
#Embedding the Document
# Builds two indexes over the loaded document, wraps them as tools for a
# ReAct agent, then exposes that agent through a top-level index so the same
# layout can later route between several per-document agents.
document_vector_index = VectorStoreIndex.from_documents(full_text)
#for summary purpose
summary_index = SummaryIndex.from_documents(full_text)

#Define Query Engines to be used(k = 3 for keeping some additional context)
# similarity_top_k is passed explicitly so the code matches the k = 3 intent
# stated above; the original call left it at the library default.
vector_query_engine = document_vector_index.as_query_engine(similarity_top_k=3)
summary_query_engine = summary_index.as_query_engine()

# define tools
# The descriptions are what the agent reads when choosing between the tools.
query_engine_tools = [
    QueryEngineTool(
        query_engine=vector_query_engine,
        metadata=ToolMetadata(
            name="vector_tool",
            description=(
                f"Useful for retrieving specific context from {file_name}"
            ),
        ),
    ),
    QueryEngineTool(
        query_engine=summary_query_engine,
        metadata=ToolMetadata(
            name="summary_tool",
            description=(
                f"Useful for summarization questions related to {file_name}"
            ),
        ),
    ),
]

#Build document specific agent
# `debug` drives verbosity directly instead of duplicating the constructor
# call in two branches that differed only in the verbose flag.
agent = ReActAgent.from_tools(
    query_engine_tools,
    llm = llm,
    verbose=bool(debug),
)

#Index nodes for future multi-doc module
objects = []

file_summary = (
    "Use this index if you need to lookup specific facts about"
    f" {file_name}."
)

# The IndexNode carries the agent as its `obj`, so retrieving this node hands
# the query over to the document agent.
node = IndexNode(
    text=file_summary, index_id=file_name, obj=agent
)
objects.append(node)


#Query Engine
# As in the original cell, `vector_index` ends up bound to the top-level index
# over the agent node(s), not to the per-document index (which now has its own
# name, `document_vector_index`, instead of being silently overwritten).
vector_index = VectorStoreIndex(
    objects=objects,
)

query_engine = vector_index.as_query_engine(similarity_top_k=1, verbose=bool(debug))

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  4.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.00it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  7.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00,  8.94it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 12.85it/s]


In [59]:
# Helper function for print
def print_response(response):
    """Render a query response as 20px HTML text in the cell output.

    Args:
        response: Object exposing a ``response`` attribute with the answer
            text (in this notebook, the value returned by
            ``query_engine.query``).
    """
    # Imported here because no cell in the notebook imports HTML; on a fresh
    # kernel the original body raised NameError.
    from html import escape
    from IPython.display import HTML, display

    # The answer is model-generated text, so escape it before embedding it in
    # markup: stray '<' or '&' characters would otherwise be interpreted as
    # HTML and corrupt the rendering.
    answer_html = escape(str(response.response), quote=False)
    display(HTML(f'<p style="font-size:20px">{answer_html}</p>'))

In [71]:
# Send a summarization question through the top-level query engine and render
# the answer with the notebook's HTML helper.
question = "summarize the first three aspects covered in the document"
response = query_engine.query(question)
print_response(response)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]


[1;3;38;2;11;159;203mRetrieval entering Chapter5.4_Covariance.pdf: ReActAgent
[0m[1;3;38;2;237;90;200mRetrieving from object ReActAgent with query summarize the first three aspects covered in the document
[0m> Running step 4dfa6459-5abb-49df-b917-ea46ee95754c. Step input: summarize the first three aspects covered in the document
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: summary_tool
Action Input: {'input': 'summarize the first three aspects covered in the document'}
[0mHTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
[1;3;34mObservation: The first three aspects covered in the document are:

1. Introduction 