In [1]:
import os
from dotenv import load_dotenv
load_dotenv()
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")

In [2]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 1: Load the PDF
loader = PyPDFLoader("pdfs/RAG.pdf")
docs = loader.load()

# Step 2: Create the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

# Step 3: Split the documents
texts = text_splitter.split_documents(docs)


In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Use a supported HF model
embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Now this will work
db = FAISS.from_documents(texts, embed)
retriever = db.as_retriever()

  embed = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from langchain_core.prompts import ChatPromptTemplate

prompt=ChatPromptTemplate.from_template(
"""
Based on the {context} provided answer the query asked by the user in a best possible way.
Example1- Question:"What skill is necessary to become Data Scientist?"
Answer:"SQL, Python, Machine Learning and concepts which help in future values predictions."
Question:{input}
Answer:
"""
)

In [5]:
from langchain_ollama import OllamaLLM

model=OllamaLLM(model='llama3.1')

In [6]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

combine_docs_chain = create_stuff_documents_chain(model, prompt)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

In [None]:
result=retrieval_chain.invoke({'input':"What is RAG?"})
print(result["answer"])

In [None]:
# sentence transformers
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

In [8]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("data/llama2.pdf")
documents = loader.load()


In [6]:
from llama_index.llms.llama_cpp import LlamaCPP

model_url = "https://huggingface.co/TheBloke/Llama-2-13B-chat-GGUF/resolve/main/llama-2-13b-chat.Q4_0.gguf"

llm = LlamaCPP(
    model_url=model_url,
    model_path=None,
    temperature=0.1,
    max_new_tokens=256,
    context_window=3900,
    generate_kwargs={},
    model_kwargs={"n_gpu_layers": 1},
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from C:\Users\Administrator\AppData\Local\llama_index\models\llama-2-13b-chat.Q4_0.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:        

In [9]:
from dotenv import load_dotenv
import os
from llama_index.core import Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

load_dotenv()

api_key = os.getenv("OPENAI_API_KEY")
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
Settings.llm = llm




documents = SimpleDirectoryReader("pdfs").load_data()
index = VectorStoreIndex.from_documents(documents)
query_engine = index.as_query_engine(llm=llm, embed_model=embed_model)


In [10]:
response = query_engine.query("What are the difficulties when applying RAG?")
print(response)

llama_perf_context_print:        load time =  281490.73 ms
llama_perf_context_print: prompt eval time =  281488.46 ms /  2004 tokens (  140.46 ms per token,     7.12 tokens per second)
llama_perf_context_print:        eval time =  191709.12 ms /   107 runs   ( 1791.67 ms per token,     0.56 tokens per second)
llama_perf_context_print:       total time =  473793.10 ms /  2111 tokens


 There are several potential downsides to applying RAG, including the possibility of
generating abuse, faked, or misleading content in the news or on social media, impersonating
others, or automating the production of spam/phishing content. Additionally, advanced language
models may lead to the automation of various jobs in the coming decades. To mitigate these risks,
AI systems could be employed to fight against misleading content and automated spam/phishing.


In [11]:
query_engine.query("What are briefly the basics of RAG?")

Llama.generate: 1289 prefix-match hit, remaining 387 prompt tokens to eval
llama_perf_context_print:        load time =  281490.73 ms
llama_perf_context_print: prompt eval time =   50083.85 ms /   387 tokens (  129.42 ms per token,     7.73 tokens per second)
llama_perf_context_print:        eval time =   49650.33 ms /   113 runs   (  439.38 ms per token,     2.28 tokens per second)
llama_perf_context_print:       total time =  100127.14 ms /   500 tokens


Response(response=' RAG is a new language model that offers several positive societal benefits over previous work: the fact that it is more strongly grounded in real factual knowledge (in this case Wikipedia) makes it "hallucinate" less with generations that are more factual, and offers more control and interpretability. RAG could be employed in a wide variety of scenarios with direct benefit to society, for example by endowing it with a medical index and asking it open-domain questions on that topic, or by helping people be more effective at their jobs.', source_nodes=[NodeWithScore(node=TextNode(id_='1c612cec-f900-45db-a057-adc1685c1020', embedding=None, metadata={'page_label': '10', 'file_name': 'RAG.pdf', 'file_path': 'c:\\Users\\Administrator\\kg-qa\\pdfs\\RAG.pdf', 'file_type': 'application/pdf', 'file_size': 885323, 'creation_date': '2025-07-04', 'last_modified_date': '2025-07-04'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modif

In [14]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core import Settings
from llama_index.llms.gemini import Gemini
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
import os

documents = SimpleDirectoryReader(input_files=["pdfs/RAG.pdf"]).load_data()
api_key = os.getenv("GOOGLE_API_KEY")

splitter = SentenceSplitter(chunk_size=512)
nodes = splitter.get_nodes_from_documents(documents)

Settings.llm = Gemini(api_key=api_key, model="models/gemini-1.5-flash")
#Settings.llm = llm
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")

  Settings.llm = Gemini(api_key=api_key, model="models/gemini-1.5-flash")


In [15]:
from llama_index.core import SummaryIndex, VectorStoreIndex

summary_index = SummaryIndex(nodes)
vector_index = VectorStoreIndex(nodes)

In [16]:
summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True,
)
vector_query_engine = vector_index.as_query_engine()

In [17]:
from llama_index.core.tools import QueryEngineTool

summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=(
        "Useful for summarization questions related to dataset"
    ),
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific context from the dataset."
    ),
)

In [18]:
from llama_index.core.query_engine.router_query_engine import RouterQueryEngine
from llama_index.core.selectors import LLMSingleSelector


query_engine = RouterQueryEngine(
    selector=LLMSingleSelector.from_defaults(),
    query_engine_tools=[
        summary_tool,
        vector_tool,
    ],
    verbose=True
)

# First query
response1 = query_engine.query("What is the summary of the document? \n")
print(response1.response.replace(". ", ".\n"))

# Second query
response2 = query_engine.query("What is RAG? \n")
print(response2.response.replace(". ", ".\n"))

ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerDayPerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 50
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 33
}
]