In [51]:
import os
from dotenv import load_dotenv
load_dotenv()
# Enable LangSmith tracing only when an API key is actually available.
# Re-assigning os.getenv("LANGCHAIN_API_KEY") back into os.environ is a no-op
# when the key exists and raises TypeError (environment values must be str)
# when it does not, so the key is only checked here, never re-exported.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
else:
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled.")

In [59]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Read the source PDF from disk.
loader = PyPDFLoader(file_path="pdfs/RAG.pdf")
docs = loader.load()

# Splitter configuration: chunk_size=512 with chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=512)

# Cut the loaded documents into chunks; `texts` feeds the vector store later.
texts = text_splitter.split_documents(documents=docs)


In [61]:
# Import from langchain_community (already used for the PDF loader) rather than
# the deprecated `langchain.embeddings` / `langchain.vectorstores` paths.
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Embedding model used to vectorise every chunk.
embed = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en")

# Fail early with a clear message instead of an obscure error from the vector
# store when the PDF produced no text chunks (e.g. a scanned, image-only file).
if not texts:
    raise ValueError("No text chunks to index; check that the PDF contains extractable text.")

# Build the FAISS index over the chunks and expose it as a retriever.
db = FAISS.from_documents(texts, embed)
retriever = db.as_retriever()

In [62]:
from langchain_core.prompts import ChatPromptTemplate

# Prompt for the question-answering chain. It exposes two template variables:
# {context} (the retrieved document text) and {input} (the user's question).
# The stray double quote that used to end the instruction line was removed so
# it is no longer sent to the model as part of the prompt.
prompt = ChatPromptTemplate.from_template(
"""
Based on the {context} provided, answer the query asked by the user in the best possible way.
Question: {input}
Answer:
"""
)

In [63]:
from langchain_ollama import OllamaLLM

# LLM handle for the `llama3.1` model, accessed through the Ollama wrapper.
model = OllamaLLM(model="llama3.1")

In [64]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

# Chain that combines the supplied documents with `prompt` and calls `model`.
combine_docs_chain = create_stuff_documents_chain(llm=model, prompt=prompt)

# End-to-end chain: fetch documents with `retriever`, then hand them to the
# combine-documents chain above.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=combine_docs_chain,
)

In [65]:
# Run the retrieval chain on one question and print only the answer text.
result = retrieval_chain.invoke(
    {"input": "What are the basics of a  RAG system?"}
)
print(result["answer"])

Based on the text, here are the basics of a RAG (Reformer-based Architecture for Generative models) system:

1. Access to both parametric and non-parametric components.
2. Ability to generate answers by aggregating content from multiple pieces of retrieved content.
3. Learning latent retrieval techniques.
4. Retrieving evidence documents instead of related training pairs.

Note that RAG is a hybrid generation model, which means it combines different techniques (parametric and non-parametric) to achieve its goals.


In [66]:
# Run the retrieval chain on one question and print only the answer text.
result = retrieval_chain.invoke(
    {"input": "What are the pitfalls of a  RAG system?"}
)
print(result["answer"])

Based on the text, some potential downsides of a RAG (Reinforced Augmented Generator) system include:

1. **Factual inaccuracies and bias**: Since RAG relies on external knowledge sources like Wikipedia, it may inherit their biases and inaccuracies.
2. **Potential for abuse or misleading content generation**: Like GPT-2, RAG can be used to generate faked or misleading content, which is a concern given its language model capabilities.

These are the main pitfalls mentioned in the text as potential downsides of using a RAG system.


In [70]:
# Run the retrieval chain on one question and print only the answer text.
result = retrieval_chain.invoke(
    {"input": "What is the proposed Future Work of the paper?"}
)
print(result["answer"])

The proposed future work of the paper is not explicitly mentioned. However, based on the context and content of the paper, we can infer that some potential directions for future research could be:

* Investigating how RAG models can be adapted or extended to handle more complex generation tasks, such as long-form text generation or multi-modal generation.
* Exploring ways to improve the performance and efficiency of RAG models on downstream tasks, such as using different architectures or training methods.
* Developing new applications or use cases for RAG models, such as in areas like question answering, dialogue systems, or content creation.

However, it's worth noting that the paper does mention some acknowledgments and funding sources at the end, which might indicate that the authors are currently working on related research projects.


In [71]:
# Embedding model for the LlamaIndex part of the notebook: the
# BAAI/bge-small-en checkpoint loaded through the Hugging Face wrapper
# (presumably backed by sentence-transformers — TODO confirm).
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

In [72]:
from langchain_community.document_loaders import PyPDFLoader

# Load the same PDF again, this time binding the result to `documents`.
loader = PyPDFLoader(file_path="pdfs/RAG.pdf")
documents = loader.load()


In [91]:


from llama_index.llms.openai import OpenAI

# Client pointed at an OpenAI-style endpoint on localhost; the API key is a
# placeholder value.
# NOTE(review): the recorded run of the indexing cell further down failed with
# "ValueError: Unknown model 'meta-llama/Llama-2-7b-hf'", so this wrapper
# appears to accept only official OpenAI model names. A wrapper intended for
# OpenAI-compatible servers (e.g. OpenAILike) is probably required — TODO confirm.
llm = OpenAI(
    api_base="http://localhost:8000/v1",
    api_key="not-needed",
    model="meta-llama/Llama-2-7b-hf",
    temperature=0.0,
)


In [92]:
# Echo the client so the notebook displays its repr and resolved settings.
llm

OpenAI(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x7f1e841403b0>, system_prompt=None, messages_to_prompt=<function messages_to_prompt at 0x7f1f0c37b920>, completion_to_prompt=<function default_completion_to_prompt at 0x7f1f0c11ef20>, output_parser=None, pydantic_program_mode=<PydanticProgramMode.DEFAULT: 'default'>, query_wrapper_prompt=None, model='meta-llama/Llama-2-7b-hf', temperature=0.0, max_tokens=None, logprobs=None, top_logprobs=0, additional_kwargs={}, max_retries=3, timeout=60.0, default_headers=None, reuse_client=True, api_key='not-needed', api_base='http://localhost:8000/v1', api_version='', strict=False, reasoning_effort=None, modalities=None, audio_config=None)

In [None]:
from llama_index.llms.llama_cpp import LlamaCPP

# Download URL for the Q4_0 GGUF build of Llama-2-13B-chat.
model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

# Disabled variant 1: a single layer offloaded to the GPU (n_gpu_layers=1).
# NOTE(review): context_window=5000 exceeds the llama.context_length of 4096
# reported in this cell's loader log — confirm the intended value.
# llm = LlamaCPP(
#     model_url=model_url,
#     model_path=None,
#     temperature=0.1,
#     max_new_tokens=256,
#     context_window=5000,
#     generate_kwargs={},
#     model_kwargs={"n_gpu_layers": 1},
#     verbose=True,
# )

# Disabled variant 2: offload up to 80 layers (as many as VRAM allows).
# The same context_window note applies here.
# llm = LlamaCPP(
#     model_url=model_url,
#     temperature=0.1,
#     max_new_tokens=256,
#     context_window=5000,
#     generate_kwargs={},
#     model_kwargs={"n_gpu_layers": 80},  # as high as VRAM allows
#     verbose=True,
# )


llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from /tmp/llama_index/models/llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_co

In [93]:
from dotenv import load_dotenv
import os
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

# Pull environment variables from a .env file.
load_dotenv()

# Register the embedding model and the previously created `llm` as the
# LlamaIndex-wide defaults.
# NOTE(review): the recorded run of this cell raised
# "ValueError: Unknown model 'meta-llama/Llama-2-7b-hf'", which points at the
# `llm` object configured earlier rather than at this cell — TODO confirm.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
Settings.embed_model = embed_model
Settings.llm = llm

# Read everything under pdfs/, build a vector index, and expose a query engine.
documents = SimpleDirectoryReader(input_dir="pdfs").load_data()
index = VectorStoreIndex.from_documents(documents=documents)
query_engine = index.as_query_engine(embed_model=embed_model)


ValueError: Unknown model 'meta-llama/Llama-2-7b-hf'. Please provide a valid OpenAI model name in: o1, o1-2024-12-17, o1-pro, o1-pro-2025-03-19, o1-preview, o1-preview-2024-09-12, o1-mini, o1-mini-2024-09-12, o3-mini, o3-mini-2025-01-31, o3, o3-2025-04-16, o3-pro, o3-pro-2025-06-10, o4-mini, o4-mini-2025-04-16, gpt-4, gpt-4-32k, gpt-4-1106-preview, gpt-4-0125-preview, gpt-4-turbo-preview, gpt-4-vision-preview, gpt-4-1106-vision-preview, gpt-4-turbo-2024-04-09, gpt-4-turbo, gpt-4o, gpt-4o-audio-preview, gpt-4o-audio-preview-2024-12-17, gpt-4o-audio-preview-2024-10-01, gpt-4o-mini-audio-preview, gpt-4o-mini-audio-preview-2024-12-17, gpt-4o-2024-05-13, gpt-4o-2024-08-06, gpt-4o-2024-11-20, gpt-4.5-preview, gpt-4.5-preview-2025-02-27, chatgpt-4o-latest, gpt-4o-mini, gpt-4o-mini-2024-07-18, gpt-4-0613, gpt-4-32k-0613, gpt-4-0314, gpt-4-32k-0314, gpt-4.1, gpt-4.1-mini, gpt-4.1-nano, gpt-4.1-2025-04-14, gpt-4.1-mini-2025-04-14, gpt-4.1-nano-2025-04-14, gpt-3.5-turbo, gpt-3.5-turbo-16k, gpt-3.5-turbo-0125, gpt-3.5-turbo-1106, gpt-3.5-turbo-0613, gpt-3.5-turbo-16k-0613, gpt-3.5-turbo-0301, text-davinci-003, text-davinci-002, gpt-3.5-turbo-instruct, text-ada-001, text-babbage-001, text-curie-001, ada, babbage, curie, davinci, gpt-35-turbo-16k, gpt-35-turbo, gpt-35-turbo-0125, gpt-35-turbo-1106, gpt-35-turbo-0613, gpt-35-turbo-16k-0613

In [79]:
# Query the engine and print the response object.
response = query_engine.query(
    "What are the difficulties when applying RAG?"
)
print(response)

llama_perf_context_print:        load time =  236262.67 ms
llama_perf_context_print: prompt eval time =  236261.65 ms /  1633 tokens (  144.68 ms per token,     6.91 tokens per second)
llama_perf_context_print:        eval time =    1487.31 ms /    13 runs   (  114.41 ms per token,     8.74 tokens per second)
llama_perf_context_print:       total time =  237770.42 ms /  1646 tokens




Please provide the answer based on the given context information.


In [80]:
# Bare expression (no print): the notebook displays the full Response repr,
# i.e. the answer text together with its source nodes.
query_engine.query("What are briefly the basics of RAG?")

Llama.generate: 1622 prefix-match hit, remaining 12 prompt tokens to eval
llama_perf_context_print:        load time =  236262.67 ms
llama_perf_context_print: prompt eval time =   56455.91 ms /    12 tokens ( 4704.66 ms per token,     0.21 tokens per second)
llama_perf_context_print:        eval time =    2093.41 ms /    19 runs   (  110.18 ms per token,     9.08 tokens per second)
llama_perf_context_print:       total time =   58562.26 ms /    31 tokens


Response(response='\n\nPlease provide a concise and accurate answer to the query based on the given context.', source_nodes=[NodeWithScore(node=TextNode(id_='185ea783-c595-429b-a0d2-a0c99817df32', embedding=None, metadata={'page_label': '5', 'file_name': 'RAG.pdf', 'file_path': '/root/kg-qa/pdfs/RAG.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2025-07-07', 'last_modified_date': '2025-07-07'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7d0041da-6db5-4f97-b77d-0f0f61dfb9f9', node_type='4', metadata={'page_label': '5', 'file_name': 'RAG.pdf', 'file_path': '/root/kg-qa/pdfs/RAG.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2025-07-07', 'last_mo

In [81]:
# Bare expression (no print): the notebook displays the full Response repr,
# i.e. the answer text together with its source nodes.
query_engine.query("What are briefly the implementation details of RAG?")

Llama.generate: 15 prefix-match hit, remaining 1084 prompt tokens to eval
llama_perf_context_print:        load time =  236262.67 ms
llama_perf_context_print: prompt eval time =  337308.01 ms /  1084 tokens (  311.17 ms per token,     3.21 tokens per second)
llama_perf_context_print:        eval time =   28172.92 ms /    12 runs   ( 2347.74 ms per token,     0.43 tokens per second)
llama_perf_context_print:       total time =  365487.42 ms /  1096 tokens


Response(response='\n\nPlease provide the answer based on the given context.', source_nodes=[NodeWithScore(node=TextNode(id_='159059fe-56bc-4785-9349-73f2140c3861', embedding=None, metadata={'page_label': '17', 'file_name': 'RAG.pdf', 'file_path': '/root/kg-qa/pdfs/RAG.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2025-07-07', 'last_modified_date': '2025-07-07'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='8114c2d7-e17c-449c-9e89-9d5b3c22f4a3', node_type='4', metadata={'page_label': '17', 'file_name': 'RAG.pdf', 'file_path': '/root/kg-qa/pdfs/RAG.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2025-07-07', 'last_modified_date': '2025-07-07'}, h

In [82]:
# Same question as the earlier printed query, but left as a bare expression so
# the notebook displays the full Response repr including the source nodes.
query_engine.query("What are the difficulties when applying RAG?")

Llama.generate: 15 prefix-match hit, remaining 1618 prompt tokens to eval
llama_perf_context_print:        load time =  236262.67 ms
llama_perf_context_print: prompt eval time =  462468.72 ms /  1618 tokens (  285.83 ms per token,     3.50 tokens per second)
llama_perf_context_print:        eval time =   19617.55 ms /    13 runs   ( 1509.04 ms per token,     0.66 tokens per second)
llama_perf_context_print:       total time =  482134.46 ms /  1631 tokens


Response(response='\n\nPlease provide the answer based on the given context information.', source_nodes=[NodeWithScore(node=TextNode(id_='185ea783-c595-429b-a0d2-a0c99817df32', embedding=None, metadata={'page_label': '5', 'file_name': 'RAG.pdf', 'file_path': '/root/kg-qa/pdfs/RAG.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2025-07-07', 'last_modified_date': '2025-07-07'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7d0041da-6db5-4f97-b77d-0f0f61dfb9f9', node_type='4', metadata={'page_label': '5', 'file_name': 'RAG.pdf', 'file_path': '/root/kg-qa/pdfs/RAG.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2025-07-07', 'last_modified_date': '2025-

In [46]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
import os

# Key read from the environment; nothing else in this cell uses it.
api_key = os.getenv("OPENAI_API_KEY")

# Load the single PDF and split it into nodes with SentenceSplitter
# (chunk_size=512).
documents = SimpleDirectoryReader(input_files=["pdfs/RAG.pdf"]).load_data()
splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents=documents)

# Only the embedding model is configured here; Settings.llm is not assigned in
# this cell (a Gemini LLM assignment used to sit here, commented out).
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

In [47]:
from llama_index.core import SummaryIndex, VectorStoreIndex

# Build two indexes over the same nodes: one for summaries, one vector-based.
summary_index = SummaryIndex(nodes=nodes)
vector_index = VectorStoreIndex(nodes=nodes)

In [48]:
# Summary engine: tree_summarize response mode with async enabled.
summary_query_engine = summary_index.as_query_engine(use_async=True, response_mode="tree_summarize")

# Vector engine with default settings.
vector_query_engine = vector_index.as_query_engine()

In [49]:
from llama_index.core.tools import QueryEngineTool

# Wrap each query engine as a tool; the description text is what tells the two
# tools apart.
summary_tool = QueryEngineTool.from_defaults(
    description="Useful for summarization questions related to dataset",
    query_engine=summary_query_engine,
)
vector_tool = QueryEngineTool.from_defaults(
    description="Useful for retrieving specific context from the dataset.",
    query_engine=vector_query_engine,
)

In [50]:
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector


# Router that lets an LLM-based single selector choose between the summary and
# vector tools for each query.
# NOTE(review): the recorded run failed with "ValueError: Failed to convert
# output to JSON", so the selector apparently expects JSON from the LLM —
# confirm that the configured LLM can produce it.
query_engine = RouterQueryEngine(
    query_engine_tools=[summary_tool, vector_tool],
    selector=LLMSingleSelector.from_defaults(),
    verbose=True,
)

# Summary-style question; print the answer with one sentence per line.
response1 = query_engine.query("What is the summary of the document? \n")
print(".\n".join(response1.response.split(". ")))

# Fact-style question; same one-sentence-per-line formatting.
response2 = query_engine.query("What is RAG? \n")
print(".\n".join(response2.response.split(". ")))

Llama.generate: 155 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   53400.24 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    1357.49 ms /    13 runs   (  104.42 ms per token,     9.58 tokens per second)
llama_perf_context_print:       total time =    1362.22 ms /    14 tokens


ValueError: Failed to convert output to JSON: '\nPlease note that the input document is a JSON object.'