RAG chatbot

Install dependencies

In [42]:
# Install the notebook's dependencies into the kernel's environment. No versions are pinned here, so a fresh install may resolve to different releases than the ones this notebook was written against.
%pip install llama_hub llama-index llama-index-core llama-index-retrievers-bm25 llama-index-llms-ollama pydantic transformers llama-index-llms-openai llama-index-embeddings-huggingface

[0mFound existing installation: llama-hub 0.0.79.post1
Uninstalling llama-hub-0.0.79.post1:
  Would remove:
    /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/LICENSE
    /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/llama_hub-0.0.79.post1.dist-info/*
    /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/llama_hub/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


Set up the paths to the user guide file and to the folder that stores the created chunks

In [21]:
# Plain-text source document that is read and split into chunks.
DATA_FILE_PATH = './output_text_file.txt'
# Folder the fixed-size chunks are written to (one file per chunk) and later re-read from.
FIXED_CHUNK_FOLDER = './chunkraw/fixed_size'

Set up the LLM and the embedding model

In [None]:
from llama_index.llms.ollama import Ollama
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

import os

# Never store credentials in the notebook: the OpenAI key is taken from the
# environment, and the cell fails early with a clear message when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")

# Alternative LLM backend:
#llm = Ollama(model="llama3.1:8B", temperature=0.1,request_timeout=300)
llm = OpenAI(model="gpt-4o-mini", temperature=0.1, api_key=os.environ["OPENAI_API_KEY"])

# Alternative embedding model:
#embeddings = HuggingFaceEmbedding(model_name="all-MiniLM-L6-v2")
# NOTE: trust_remote_code=True lets the model repository run its own Python code
# while loading; only keep it for model repositories you trust.
embeddings = HuggingFaceEmbedding(model_name="jxm/cde-small-v2", trust_remote_code=True, max_length=768)

# Register both models as the global defaults used by the rest of the notebook.
Settings.llm = llm
Settings.embed_model = embeddings

Disabled 23 dropout modules from model type <class 'transformers_modules.jxm.cde-small-v2.287bf0ea6ebfecf2339762d0ef28fb846959a8f2.model.BiEncoder'>
Disabled 46 dropout modules from model type <class 'transformers_modules.jxm.cde-small-v2.287bf0ea6ebfecf2339762d0ef28fb846959a8f2.model.ContextualDocumentEmbeddingTransformer'>


Load data and chunk it into fixed sizes

In [23]:
import os
from typing import List
def fixed_size_chunking(doc_text: str, chunk_size: int, overlap: int) -> List[str]:
    """Split text into overlapping chunks of at most ``chunk_size`` words.

    The text is split on whitespace and a window of ``chunk_size`` words is
    moved forward by ``chunk_size - overlap`` words at a time, so consecutive
    chunks share ``overlap`` words.

    Args:
        doc_text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in document order, each one a single-space-joined string.
        Text without any words yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    if not 0 <= overlap < chunk_size:
        # A step of zero would make range() fail, and a negative step would
        # silently produce no chunks at all.
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size.")

    words = doc_text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + chunk_size]))
        # Stop once a window reaches the last word; any further window would
        # only repeat words already contained in this chunk.
        if start + chunk_size >= len(words):
            break
    return chunks

def save_chunks(chunks: List[str], folder_name: str):
    """Write each chunk to ``folder_name`` as ``chunk_<n>.txt``, numbered from 1.

    The folder is created when it does not exist yet. Files are written as
    UTF-8 and an existing file with the same name is overwritten.
    """
    os.makedirs(folder_name, exist_ok=True)
    for number, text in enumerate(chunks, start=1):
        target_path = os.path.join(folder_name, f"chunk_{number}.txt")
        with open(target_path, mode='w', encoding='utf-8') as handle:
            handle.write(text)


Process the text file and create nodes. This step also defines how the documents are actually split; the base method is currently SentenceSplitter with overlapping chunks.

In [24]:
import time
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter, SemanticSplitterNodeParser
from llama_index.core.memory import ChatMemoryBuffer

def _load_chunk_documents(folder):
    """Read every regular file in ``folder`` into a ``Document``, ordered by path."""
    # Sorting gives a reproducible order (lexicographic, so chunk_10 sorts
    # before chunk_2); the with-block closes the directory iterator.
    with os.scandir(folder) as entries:
        chunk_paths = sorted(entry.path for entry in entries if entry.is_file())

    documents = []
    for chunk_path in chunk_paths:
        # save_chunks writes UTF-8, so read the files back with the same
        # encoding instead of the platform default.
        with open(chunk_path, 'r', encoding='utf-8') as chunk_file:
            documents.append(Document(text=chunk_file.read()))
    return documents


def process_text_file(data_file=DATA_FILE_PATH, chunk_size=250, overlap=50):
    """Turn the source text into nodes for indexing.

    If FIXED_CHUNK_FOLDER already contains entries, those chunk files are
    reused as they are; ``chunk_size`` and ``overlap`` then only affect the
    sentence splitter, not the files on disk. Otherwise ``data_file`` is read,
    split with ``fixed_size_chunking`` and written with ``save_chunks``.
    In both cases the chunk files are loaded as documents and split into nodes
    with ``SentenceSplitter``.

    Args:
        data_file: Path of the UTF-8 text file to chunk. The default is the
            value DATA_FILE_PATH had when this function was defined.
        chunk_size: Chunk size passed to ``fixed_size_chunking`` and to
            ``SentenceSplitter``.
        overlap: Overlap passed to ``fixed_size_chunking`` and to
            ``SentenceSplitter`` (as ``chunk_overlap``).

    Returns:
        The nodes returned by ``SentenceSplitter.get_nodes_from_documents``.

    Raises:
        ValueError: If ``data_file`` is empty.
        Exception: Any other error is printed and re-raised unchanged.
    """
    print("Starting chunking process...")
    try:
        if os.path.exists(FIXED_CHUNK_FOLDER) and os.listdir(FIXED_CHUNK_FOLDER):
            print("Loading existing chunks...")
        else:
            with open(data_file, 'r', encoding='utf-8') as file:
                doc_text = file.read()

            if not doc_text:
                raise ValueError("No content found in the specified text file.")

            print("Chunking the text...")
            # perf_counter measures elapsed time; process_time would leave out
            # the time spent waiting on disk I/O.
            chunk_start_time = time.perf_counter()
            chunks = fixed_size_chunking(doc_text, chunk_size, overlap)
            chunk_end_time = time.perf_counter()

            save_start_time = time.perf_counter()
            save_chunks(chunks, FIXED_CHUNK_FOLDER)
            save_end_time = time.perf_counter()

            print(f"Chunking completed in {chunk_end_time - chunk_start_time:.2f} seconds.")
            print(f"Chunks saved in {save_end_time - save_start_time:.2f} seconds.")

        documents = _load_chunk_documents(FIXED_CHUNK_FOLDER)

        #Sentence splitter
        splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
        nodes = splitter.get_nodes_from_documents(documents)

        #Semantic splitter (alternative)
        # splitter = SemanticSplitterNodeParser(buffer_size=1, breakpoint_percentile_threshold=95, embed_model= Settings.embed_model)
        # nodes = splitter.get_nodes_from_documents(documents)
        print("Processing completed successfully.")
        return nodes

    except Exception as e:
        print(f"Error processing text file: {str(e)}")
        raise

After creating the nodes (the split documents), customise the indexes, retrievers, query engines and chat engines

In [None]:
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.core import get_response_synthesizer
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.core.tools import QueryEngineTool, ToolMetadata
from llama_index.core.agent import ReActAgent, FunctionCallingAgent
from llama_index.core.response_synthesizers import ResponseMode
from llama_index.core import PromptTemplate
from llama_index.core.response_synthesizers import TreeSummarize
from llama_index.core import SummaryIndex
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner
from llama_index.core import VectorStoreIndex
from llama_index.core.chat_engine import ContextChatEngine
from llama_index.core.llms import ChatMessage, MessageRole
from llama_index.core.chat_engine import CondensePlusContextChatEngine
def creating_agent(nodes):
    """Build the retriever, query engine, chat engine and agent over ``nodes``.

    Args:
        nodes: Nodes to index; they feed both the BM25 retriever and the
            summary index.

    Returns:
        A 4-tuple ``(bm25_retriever, query_engine, chat_engine, func_agent)``:
        the BM25 retriever, a tree-summarize query engine on top of it, a
        ``CondensePlusContextChatEngine`` using the same retriever, and an
        ``AgentRunner`` wrapping a function-calling worker that has the query
        engine and a summary query engine as tools.
    """
    # NOTE(review): the sample output in this notebook shows BM25 scores above 2,
    # so they are not confined to [0, 1]; confirm that 0.7 is the intended cutoff
    # for this retriever.
    similarity_cutoff = 0.7

    bm25_retriever = BM25Retriever.from_defaults(
        nodes=nodes,
        similarity_top_k=10,
        language="english",
    )

    query_engine = RetrieverQueryEngine(
        retriever=bm25_retriever,
        response_synthesizer=get_response_synthesizer(response_mode="tree_summarize"),
        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)],
    )

    # Tool answering specific questions through the BM25 query engine.
    rag_tool = QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="vector",
            description="useful for when you want to asnwer queries about the Equix documents",
        ),
    )

    # Tool producing summaries over all nodes.
    summary_index = SummaryIndex(nodes, show_progress=True)
    summary_query_engine = summary_index.as_query_engine(response_mode="tree_summarize", use_async=True)
    summary_tool = QueryEngineTool.from_defaults(
        name="summary",
        query_engine=summary_query_engine,
        description=(
            "Useful if you want to get a summary of Equix documents"
        ),
    )

    tools = [rag_tool, summary_tool]

    memory = ChatMemoryBuffer.from_defaults(token_limit=1500)

    CONTEXT_PROMPT_TEMPLATE = """
  The following is a friendly conversation between a user and an AI assistant.
  The assistant is talkative and provides lots of specific details from its context.
  If the assistant does not know the answer to a question, it truthfully says it
  does not know without being hallucinated.

  Here are the relevant documents for the context:

  {context_str}

  Instruction: Based on the above documents, provide a detailed answer for the user question below. Never mention about the source of context or documents. Try to be as much humane as possible.
  Answer "don't know, please provide more information" if not present in the document.
  """
    CONTEXT_REFINE_PROMPT_TEMPLATE = """
  The following is a friendly conversation between a user and an AI assistant.
  The assistant is talkative and provides lots of specific details from its context.
  If the assistant does not know the answer to a question, it truthfully says it
  does not know.

  Here are the relevant documents for the context:

  {context_msg}

  Existing Answer:
  {existing_answer}

  Instruction: Refine the existing answer using the provided context to assist the user. Never mention about the source of context or documents. Try to be as much humane as possible.
  If the context isn't helpful, just repeat the existing answer and nothing more. Never mention about the source of context or documents. Try to be as much humane as possible.
  """
    CONDENSE_PROMPT_TEMPLATE = """
  Given the following conversation between a user and an AI assistant and a follow up question from user,
  rephrase the follow up question to be a standalone question.

  Chat History:
  {chat_history}
  Follow Up Input: {question}
  Standalone question:"""
    sys_prompt = """
    You are a truthful AI that sticks to the instruction.

    Instruction: use the provided context to answer the question from the users. If the answer is not provided in the context, please don't imaginatively generate random response. Never mention about the source of the contex or information. You MUST try to be as much humane as possible
    """

    # Chat engine: condenses the follow-up into a standalone question, retrieves
    # context with BM25 and answers using the prompts above.
    chat_engine = CondensePlusContextChatEngine(
        retriever=bm25_retriever,
        llm=Settings.llm,
        memory=memory,
        system_prompt=sys_prompt,
        context_prompt=CONTEXT_PROMPT_TEMPLATE,
        context_refine_prompt=CONTEXT_REFINE_PROMPT_TEMPLATE,
        condense_prompt=CONDENSE_PROMPT_TEMPLATE,
        node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)],
        verbose=True,
    )

    # Function-calling agent that can choose between the two tools.
    agent_worker = FunctionCallingAgentWorker.from_tools(tools, llm=Settings.llm, verbose=True)
    func_agent = AgentRunner(agent_worker)

    return bm25_retriever, query_engine, chat_engine, func_agent

In [26]:
def load_index(chunk_size=100, overlap=10):
    """Create the nodes from the source text and build the retrieval stack on them.

    ``chunk_size`` and ``overlap`` are forwarded to ``process_text_file``; the
    result of ``creating_agent`` for the resulting nodes is returned unchanged.
    """
    return creating_agent(
        nodes=process_text_file(chunk_size=chunk_size, overlap=overlap)
    )

Load data source, create chunks and agents

In [40]:
# Paths are repeated here so this cell can be adjusted on its own. Note that the
# `data_file` default of process_text_file was bound when that function was defined,
# so changing DATA_FILE_PATH here does not change which file load_index reads;
# FIXED_CHUNK_FOLDER, in contrast, is looked up each time the function runs.
DATA_FILE_PATH = './output_text_file.txt'
FIXED_CHUNK_FOLDER = './chunkraw/fixed_size'
# Tune the chunk size: if it is too large, the retrieved passages become too general and take longer to process; if it is too small, relevant information is split apart and may be missed.
# When FIXED_CHUNK_FOLDER already holds chunk files they are reused as-is, so these values then only affect the SentenceSplitter step.
chunk_size = 100
overlap = 10

# Unpacks the 4-tuple returned by creating_agent: retriever, query engine, chat engine, function-calling agent.
retriever,query_engine,chat_agent,func_agent = load_index(chunk_size=chunk_size, overlap=overlap)


Starting chunking process...
Loading existing chunks...
Processing completed successfully.


Test the retriever, the query engine, the chat engine and the function-calling agent

In [None]:
# Send one message to the chat engine and stream the reply.
string = "Hello?"
response = chat_agent.stream_chat(string)

# Show which nodes were retrieved as context for the reply.
print("Retrieved nodes from the index:")
for source_node in response.source_nodes:
    print(f"retriever: {source_node}")

print("_____________________________________")
print("Response from the chat engine")

# Print the reply token by token as it is generated.
for token in response.response_gen:
    print(token, end="")


Condensed question: how do I place an order?
Retrieved nodes from the index:
retriever: Node ID: 3b884962-a89f-413b-bf4f-67efc63c0e2f
Text: applied for Equity only. = Placed Price * Placed Quantity + Fees
Open Orders (Sells) The total value of sell orders that are still
working (open). Currently applied for Equity only. = Placed Price *
Placed Quantity - Fees Trading Balance The available balance that you
can place order Trading balance = Cash at bank+ Transaction not booked
Value o...
Score:  2.515

retriever: Node ID: d27610b7-2ef1-4559-b4a5-16c089a68903
Text: provide the status of the profit of the company Profit Margin
The portion of a company’s sales revenue that it gets to keep as a
profit, after subtracting all its costs Operating Margin Measures how
much profit a company makes on a dollar of sales after paying for
variable costs of production Management Effectiveness Return on Assets
The financi...
Score:  2.310

retriever: Node ID: 7d529d11-215c-4499-90c2-ec678b1e6099
Text: Tr