### Install & Import Dependencies

In [1]:
# requirements

!pip install llama-index
!pip install llama-index-embeddings-huggingface
!pip install llama-index-llms-huggingface-api
# !pip install llama-index-extractors-entity

Collecting llama-index
  Downloading llama_index-0.12.41-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.9-py3-none-any.whl.metadata (438 bytes)
Collecting llama-index-cli<0.5,>=0.4.2 (from llama-index)
  Downloading llama_index_cli-0.4.3-py3-none-any.whl.metadata (1.4 kB)
Collecting llama-index-core<0.13,>=0.12.41 (from llama-index)
  Downloading llama_index_core-0.12.41-py3-none-any.whl.metadata (2.4 kB)
Collecting llama-index-embeddings-openai<0.4,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.7.4-py3-none-any.whl.metadata (3.3 kB)
Collecting llama-index-llms-openai<0.5,>=0.4.0 (from llama-index)
  Downloading llama_index_llms_openai-0.4.4-py3-none-any.whl.metadata (3.0 kB)
Collecting llama-

In [3]:
from google.colab import drive
drive.mount('/gdrive')

import os
os.chdir('/gdrive/MyDrive/rag/gen_ai')

import nest_asyncio
nest_asyncio.apply()

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [22]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex, SummaryIndex
from llama_index.core.node_parser import SentenceSplitter #, SemanticSplitterNodeParser
from llama_index.core.tools import QueryEngineTool #, FunctionTool
from llama_index.core.postprocessor import SimilarityPostprocessor
# from llama_index.core.vector_stores import MetadataFilters, FilterCondition
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.prompts import RichPromptTemplate
from llama_index.core.objects import ObjectIndex
from llama_index.core.query_engine import ToolRetrieverRouterQueryEngine
from typing import List, Optional
from pprint import pprint

from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core.vector_stores import SimpleVectorStore
# from llama_index.core.extractors import SummaryExtractor, QuestionsAnsweredExtractor, \
#                                         TitleExtractor, KeywordExtractor
# from llama_index.extractors.entity import EntityExtractor

from huggingface_hub import login
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Define Models and Load Data

In [4]:
papers = [
    "data/longlora.pdf",
    "data/selfrag.pdf",
    "data/loftq.pdf",
    "data/zipformer.pdf"
]

In [5]:
## load models
# the token is taken from colab secrets

# load embeddings from huggingface
embed_model = HuggingFaceEmbedding()

# load llm model from huggingface
llm = HuggingFaceInferenceAPI(model_name="mistralai/Mixtral-8x7B-Instruct-v0.1")
# try bloomberg gpt for financial docs

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/90.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [24]:
# directory to store vector index
persist_dir = './storage'

# # rebuild storage context
storage_context = StorageContext.from_defaults(persist_dir=persist_dir)
docstore = SimpleDocumentStore()
vector_store = SimpleVectorStore()

Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage/docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./storage/index_store.json.


# Create Ingestion Pipeline for RAG

In [7]:
# read pdf files from disk

# automatically sets the metadata of each document according to filename_fn
# filename_fn = lambda filename: {"file_name": filename}
# add this parameter to the simple directory reader: file_metadata=filename_fn

document_reader = SimpleDirectoryReader(input_files=papers,
                                        filename_as_id=True)
documents = document_reader.load_data()

In [8]:
# deine custom transformation object to clean text
from llama_index.core.schema import TransformComponent
import re
class TextCleaner(TransformComponent):
    def __call__(self, nodes, **kwargs):
        for node in nodes:
            # removing the header text from the documents
            node.text = re.sub(r"Published as a conference paper at ICLR 2024",
                               "", node.text)
        return nodes

In [9]:
# define sequence of transformations for the documents
transformations=[
        SentenceSplitter(chunk_size=512, chunk_overlap=20),
        TextCleaner()
        # TitleExtractor(llm=llm, nodes=5),
        # QuestionsAnsweredExtractor(llm=llm, questions=3),
        # SummaryExtractor(llm=llm, summaries=["prev", "self"]),
        # KeywordExtractor(llm=llm, keywords=10),
        # EntityExtractor(llm=llm, prediction_threshold=0.5)
    ]

In [10]:
# create ingestion pipeline
pipeline = IngestionPipeline(
    transformations= transformations + [embed_model],
    vector_store=SimpleVectorStore(),
    docstore=SimpleDocumentStore()
)

nodes = pipeline.run(documents=documents,
                     show_progress=True)
print(len(nodes))
print(len(pipeline.docstore.docs))

Parsing nodes:   0%|          | 0/81 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/212 [00:00<?, ?it/s]

212
81


In [11]:
# pipeline.persist('./pipeline')
# pipeline.load('./pipeline')

### Adding Custom Prompts

In [28]:
chat_text_qa_prompt_str = """
{% chat role="system" %}
Only answer the question if the context is helpful, do not use any prior knowledge.
Do not refer to the given context in any way such as to say "based on the given context".
{% endchat %}

{% chat role="user" %}
The following is some retrieved context:

<context>
{{ context_str }}
</context>

Using the context, answer the provided question:
{{ query_str }}
{% endchat %}
"""

# Refine Prompt
chat_refine_prompt_str = """
{% chat role="system" %}
Only answer the question if the context is helpful, do not use any prior knowledge.
Do not refer to the given context in any way such as to say "based on the given context".
{% endchat %}

{% chat role="user" %}
The following is some new retrieved context:

<context>
{{ context_msg }}
</context>

And here is an existing answer to the query:
<existing_answer>
{{ existing_answer }}
</existing_answer>

Using both the new retrieved context and the existing answer, either update or repeat the existing answer to this query:
{{ query_str }}
{% endchat %}
"""

# Summary Engine

In [70]:
# create a summary index
summary_index = SummaryIndex(nodes,
                             storage_context=storage_context)

# get query engine from summary index
summary_query_engine = summary_index.as_query_engine(llm=llm,
                                                     response_mode="tree_summarize",
                                                     use_async=True)

summary_query_engine.update_prompts(
    {
        "response_synthesizer:text_qa_template": RichPromptTemplate(chat_text_qa_prompt_str),
        "response_synthesizer:refine_template": RichPromptTemplate(chat_refine_prompt_str)
    }
)

summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description="Useful for questions asking for summaries.",
)

# Vector Store Index

In [12]:
## create vector store index

# method 1: from documents with the transformation steps
# vector_index = VectorStoreIndex.from_documents(documents,
#                                                embed_model=embed_model,
#                                                transformations=transformations)


# method 2: from nodes and vector store of an ingestion pipeline
# this is needed mainly because the SimpleVectorStore does not store text
# when using a vector store that does store text, use VectorStoreIndex.from_vector_store
vector_index = VectorStoreIndex(nodes=nodes,
                                vector_store=pipeline.vector_store,
                                embed_model=embed_model,
                                # storage_context=StorageContext.from_defaults(persist_dir=persist_dir)
                                )

# method 3: load from storage
# vector_index = load_index_from_storage(storage_context, embed_model=embed_model)

# store vector index locally
vector_index.storage_context.persist(persist_dir)
# docstore.persist(os.path.join(persist_dir, 'docstore.json'))

In [27]:
# create a node post processor to only select the most similar nodes
postprocessor = SimilarityPostprocessor(similarity_cutoff=0.7)

# create query engine
vector_query_engine = vector_index.as_query_engine(llm=llm,
                                                   similarity_top_k=2,
                                                   node_postprocessors=[postprocessor],
                                                   response_mode="tree_summarize",
                                                   use_async=True
                                                   )

In [29]:
vector_query_engine.update_prompts(
    {
        "response_synthesizer:text_qa_template": RichPromptTemplate(chat_text_qa_prompt_str),
        "response_synthesizer:refine_template": RichPromptTemplate(chat_refine_prompt_str)
    }
)

In [69]:
vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific snippets"
    ),
)

# Query Engine

In [71]:
obj_index = ObjectIndex.from_objects(
    [summary_tool, vector_tool],
    index_cls=VectorStoreIndex,
    embed_model=embed_model
)

query_engine = ToolRetrieverRouterQueryEngine(obj_index.as_retriever(),
                                              llm=llm)

In [72]:
# query the query engine and check response
response = query_engine.query("what is the training procedure used in the paper zipformer?")

print("query response")
pprint(response.response)
print()
print("getting meta data of response")
pprint(response.metadata, indent=2)

query response
(' The training procedure for the Zipformer model, as described in the paper, '
 'involves training the Zipformer CTC models for 100 epochs and the Zipformer '
 'CTC/AED models for 50 epochs. The Transformer decoder in the Zipformer '
 'CTC/AED model consists of 6 layers, each with an attention dimension of 512, '
 '8 attention heads, and a feed-forward hidden dimension of 2048. The proposed '
 'ScaledAdam optimizer is utilized for faster convergence and better '
 'performance. The experiments were conducted on LibriSpeech, Aishell-1, and '
 'Wenet-Speech datasets to demonstrate the effectiveness of the Zipformer.')

getting meta data of response
{ 'retrieved_tools': [ <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7eb3a32503d0>,
                       <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7eb3a32503d0>]}


In [73]:
# query the query engine and check response
response = query_engine.query("what is the training procedure used in the paper longlora?")

print("query response")
pprint(response.response)
print()
print("getting meta data of response")
pprint(response.metadata, indent=2)

query response
(' The training procedure for the paper longlora consists of training the '
 'models for 5 epochs with the same learning rate, weight decay, and batch '
 'sizes as used in the context extension step. The authors employ a '
 'long-context instruction following dataset named LongAlpaca-12k, which '
 'contains a mix of 9k long-context QAs and 3k short QAs derived from the '
 'original Alpaca data. To accommodate long contexts, the embedding and '
 'normalization layers are left open for training, a technique referred to as '
 'LoRA+. The pre-trained 7B, 13B, and 70B Llama2 models undergo extension with '
 'maximum extended context window sizes of 100k for 7B models, 65536 for 13B '
 'models, and 32768 for 70B models.')

getting meta data of response
{ 'retrieved_tools': [ <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7eb3a32503d0>,
                       <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7eb3a32503d0>]}


In [74]:
# query the query engine and check response
response = query_engine.query("Summarize the paper selfrag")

print("query response")
pprint(response.response)
print()
print("getting meta data of response")
pprint(response.metadata, indent=2)

query response
(' The paper "selfrag" introduces a unique method for summary generation, '
 'which consists of three stages. Initially, it creates several candidate '
 'summaries. After that, it formulates critique tokens to assess the factual '
 'accuracy and general quality of each candidate summary. In the final step, '
 'it picks the most suitable candidate based on this evaluation. This '
 'multi-step process differentiates it from traditional RAG methods. The code '
 'and trained models for this paper can be accessed at a given URL for further '
 'investigation.')

getting meta data of response
{ 'retrieved_tools': [ <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7eb3a32503d0>,
                       <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7eb3a32503d0>]}


In [42]:
# query the query engine and check response
response = query_engine.query("who is jose mourinho?")

print("query response")
pprint(response.response)
print()
print("getting meta data of response")
pprint(response.metadata, indent=2)

query response
(' Jose Mourinho is a highly accomplished and well-known professional football '
 'coach who has made a significant impact in top-tier football leagues around '
 'the world. He is celebrated for his tactical prowess, motivational '
 'abilities, and distinctive personality. Throughout his career, he has led '
 'several prestigious clubs to multiple titles, among which is the UEFA '
 'Champions League, with more than one team. Some of the clubs he has managed '
 'include Porto, Chelsea, Inter Milan, Real Madrid, and Manchester United.')

getting meta data of response
{ 'retrieved_tools': [ <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7eb3a5bf1c50>,
                       <llama_index.core.tools.query_engine.QueryEngineTool object at 0x7eb3a5bf1c50>]}


In [80]:
tool = response.metadata['retrieved_tools'][0]
tool.metadata

ToolMetadata(description='Useful for retrieving specific snippets', name='query_engine_tool', fn_schema=<class 'llama_index.core.tools.types.DefaultToolFnSchema'>, return_direct=False)

In [78]:
len(response.source_nodes)

4

In [None]:
# !git clone https://github.com/atulpuri/gen_ai.git
# !git commit -m "added notebook" llamaindex_rag.ipynb
# !git status
# !git push
# !git config --list