### 1. Get the models

In [30]:
import os
from dotenv import load_dotenv
# Load environment configuration before any model client below is constructed.
# NOTE(review): presumably this supplies the provider API keys from a local .env file — confirm.
load_dotenv()

True

In [31]:
from langchain_openai import ChatOpenAI
# OpenAI chat model built with library defaults (no model name is pinned here);
# this is the model the RAG chain at the end of the notebook pipes into.
LLM_openai = ChatOpenAI()

In [32]:
from langchain_groq import ChatGroq
# Groq-hosted chat model. In the cells visible here it is only exercised by the
# one-off question below; the RAG chain itself uses LLM_openai.
LLM = ChatGroq(model="deepseek-r1-distill-llama-70b")
LLM.invoke("What is the capital of France?")

AIMessage(content='<think>\n\n</think>\n\nThe capital of France is Paris.', additional_kwargs={}, response_metadata={'token_usage': {'completion_tokens': 12, 'prompt_tokens': 10, 'total_tokens': 22, 'completion_time': 0.053891541, 'prompt_time': 0.000187917, 'queue_time': 0.00956878, 'total_time': 0.054079458}, 'model_name': 'deepseek-r1-distill-llama-70b', 'system_fingerprint': 'fp_e98d30d035', 'service_tier': 'on_demand', 'finish_reason': 'stop', 'logprobs': None}, id='run--dee93ecd-2b8d-49f0-8e07-f536ffa78661-0', usage_metadata={'input_tokens': 10, 'output_tokens': 12, 'total_tokens': 22})

In [33]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings
# Embedding model that is later handed to the FAISS vector store.
EMBEDDING=GoogleGenerativeAIEmbeddings(model="models/embedding-001")
# Embed a sample query and show the vector length (768 in the recorded output).
len(EMBEDDING.embed_query("What is the capital of France?"))

768

### 2. Data Ingestion

In [34]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [50]:
# The PDF path is built from the current working directory, so the notebook must be
# started from the folder that contains data/cookbook.pdf.
file_path=os.path.join(os.getcwd(), "data", "cookbook.pdf")
loader= PyPDFLoader(file_path)
# The recorded run yields 34 documents, which matches 'total_pages': 34 in the PDF metadata.
documents = loader.load()
len(documents)

34

In [None]:
# Small documents can be used directly with the vector store without splitting:

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=150, length_function=len)
# split_docs = text_splitter.split_documents(documents)
# len(split_docs)

179

In [None]:
# split_docs[2].metadata

{'producer': 'Adobe PDF Library 15.0',
 'creator': 'Adobe InDesign 14.0 (Macintosh)',
 'creationdate': '2020-01-29T11:11:43-08:00',
 'moddate': '2020-01-29T11:12:21-08:00',
 'trapped': '/False',
 'source': '/Users/arindam/Machine Learning/GenAI_2025/AgenticAI/4_Agentic_Document_Portal_LangGraph_E2E/notebooks/data/cookbook.pdf',
 'total_pages': 34,
 'page': 1,
 'page_label': '2'}

### 3. Store the docs in vector db

1. In-memory vector stores: FAISS (Chroma is another example).
2. On-disk storage: a FAISS index can be persisted to disk; Chroma can as well.
3. Cloud storage: FAISS has no cloud variant; hosted options include Pinecone, Weaviate, Milvus, MongoDB Vector Search, and Astra DB.

In [51]:
from langchain.vectorstores import FAISS

# Build a FAISS index from the loaded documents, embedding them with EMBEDDING.
# NOTE(review): `langchain.vectorstores` is a legacy import path — confirm it is
# still exported by the installed LangChain release.
vectorstore = FAISS.from_documents(documents=documents, embedding=EMBEDDING)

In [39]:
# Query the vector store directly with a recipe question as a retrieval sanity check.
# NOTE(review): the recorded output ranks the cover page (page 0) first, so confirm
# the hits are actually relevant before relying on this index.
vectorstore.similarity_search("How to make Stuffed Cauliflower Parantha?")

[Document(id='5fbc28fc-725e-46fc-b4a7-880676a3fb21', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Macintosh)', 'creationdate': '2020-01-29T11:11:43-08:00', 'moddate': '2020-01-29T11:12:21-08:00', 'trapped': '/False', 'source': '/Users/arindam/Machine Learning/GenAI_2025/AgenticAI/4_Agentic_Document_Portal_LangGraph_E2E/notebooks/data/cookbook.pdf', 'total_pages': 34, 'page': 0, 'page_label': '1'}, page_content='FOOD HERO\n FOOD HERO\nMy Recipes\nBe a \nFood Hero \nCook together. Eat together. Talk together.'),
 Document(id='80c6ed77-085a-4d63-abb2-913db5682685', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Macintosh)', 'creationdate': '2020-01-29T11:11:43-08:00', 'moddate': '2020-01-29T11:12:21-08:00', 'trapped': '/False', 'source': '/Users/arindam/Machine Learning/GenAI_2025/AgenticAI/4_Agentic_Document_Portal_LangGraph_E2E/notebooks/data/cookbook.pdf', 'total_pages': 34, 'page': 28, 'page_label': '29'}, page_cont

In [52]:
# Expose the vector store as a retriever so it can be piped into the RAG chain,
# then run the same recipe question through it as a check.
retriever= vectorstore.as_retriever()
retriever.invoke("How to make Stuffed Cauliflower Parantha?")

[Document(id='7dbd49ec-cfcf-48e8-8e00-aee8d5f959a5', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Macintosh)', 'creationdate': '2020-01-29T11:11:43-08:00', 'moddate': '2020-01-29T11:12:21-08:00', 'trapped': '/False', 'source': '/Users/arindam/Machine Learning/GenAI_2025/AgenticAI/4_Agentic_Document_Portal_LangGraph_E2E/notebooks/data/cookbook.pdf', 'total_pages': 34, 'page': 0, 'page_label': '1'}, page_content='FOOD HERO\n FOOD HERO\nMy Recipes\nBe a \nFood Hero \nCook together. Eat together. Talk together.'),
 Document(id='7d73ff55-5c89-4aa3-b552-496a5e833499', metadata={'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Macintosh)', 'creationdate': '2020-01-29T11:11:43-08:00', 'moddate': '2020-01-29T11:12:21-08:00', 'trapped': '/False', 'source': '/Users/arindam/Machine Learning/GenAI_2025/AgenticAI/4_Agentic_Document_Portal_LangGraph_E2E/notebooks/data/cookbook.pdf', 'total_pages': 34, 'page': 30, 'page_label': '31'}, page_cont

In [53]:
# Raw prompt text for the RAG chain. {context} and {question} are the placeholders
# declared as input variables when this string is wrapped in a PromptTemplate.
prompt = """
    You are a helpful assistant that answers questions based on the provided context.
    if the context does not contain enough information to answer the question, respond with "I don't know".
    Context: {context}
    Question: {question}
    Answer:
"""

In [54]:
# Import from langchain_core (already used elsewhere in this notebook), where
# PromptTemplate is defined, instead of the legacy `langchain.prompts` re-export,
# so the cell does not depend on the umbrella package keeping that alias.
from langchain_core.prompts import PromptTemplate

# Wrap the raw prompt string so the chain can fill in {context} and {question}.
prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt
)

In [44]:
# Display the template's repr to confirm its input variables and template text.
prompt_template

PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template='\n    You are a helpful assistant that answers questions based on the provided context.\n    if the context does not contain enough information to answer the question, respond with "I don\'t know".\n    Context: {context}\n    Question: {question}\n    Answer:\n')

In [55]:
from langchain_core.output_parsers import StrOutputParser
# Last step of the RAG chain; the chain's recorded output is a plain string
# rather than the AIMessage object a bare model call returns.
parser = StrOutputParser()

In [56]:
def format_docs(docs):
    """Merge the text of several documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in input order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    page_texts = (doc.page_content for doc in docs)
    return "\n\n".join(page_texts)

In [57]:
from langchain_core.runnables import RunnablePassthrough
# RAG pipeline, left to right:
#   1. "context" is produced by piping the input through the retriever and then
#      format_docs; "question" is the input routed through RunnablePassthrough;
#   2. both values fill prompt_template;
#   3. the filled prompt goes to LLM_openai (not the Groq `LLM` defined earlier);
#   4. parser is the final step before the result is returned.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt_template
    | LLM_openai
    | parser
)

In [58]:
# End-to-end check: ask a question that should be answerable from the cookbook PDF.
rag_chain.invoke("What is the Nutrition Facts of Lasagna?")

'The Nutrition Facts for Lasagna are as follows:\n- Serving size: 1 cup (196g)\n- Amount per serving: 260 calories\n- % Daily Value: \n  - Total Fat: 14%\n  - Saturated Fat: 23%\n  - Cholesterol: 17%\n  - Sodium: 27%\n  - Total Carbohydrate: 6%\n  - Dietary Fiber: 4%\n  - Sugars: 4%\n  - Protein: 4%\n- Other nutrients: \n  - Vitamin D: 0mcg, 0%\n  - Calcium: 159mg, 10%\n  - Iron: 3mg, 15%\n  - Vitamin A: 45mcg, 5%\n  - Potassium: 497mg, 10%\n  - Vitamin C: 6mg, 7%'