# Install required packages

In [33]:
!pip install langchain langchain-community faiss-cpu sentence-transformers pypdf transformers torch



#Import necessary libraries




In [34]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import textwrap
import torch

# Initialize the embedding model

In [35]:
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={'device': 'cpu'}
)
print("Embeddings model initialized")

Embeddings model initialized


# Load the PDF document

In [36]:
pdf_path = "/content/Human-Nutrition-2020-Edition-1598491699.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()
print(f"Loaded PDF with {len(documents)} pages")

Loaded PDF with 1208 pages


# Split text into chunks

In [37]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_documents(documents)
print(f"Split into {len(texts)} chunks")

Split into 2101 chunks


# Create a vector store

In [38]:
vector_store = FAISS.from_documents(texts, embeddings)
print("Vector store created")

Vector store created


In [39]:
print("\nExample chunks:")
for i, chunk in enumerate(texts[75:77]):
    print(f"\nChunk {i+1} (Page {chunk.metadata['page']}):")
    print(textwrap.fill(chunk.page_content[:200] + "...", width=80))


Example chunks:

Chunk 1 (Page 60):
Volume  Metric System US Customary System Conversions  Milliliter (mL) Teaspoon
(tsp) 1 tsp = 5 mL  Deciliter (dL) Tablespoon (tbsp) 1 tbsp = 3 tsp = 15 mL
Liter (L) Fluid ounce (fl oz) 1 fl oz = 2 t...

Chunk 2 (Page 61):
available in the web-based textbook and not available in the  downloadable
versions (EPUB, Digital PDF, Print_PDF, or  Open Document).  Learning activities
may be used across various mobile  devices, ...


# Initialize the LLM

In [40]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    device_map='auto'
)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    temperature=0.3,
    top_p=0.95,
)

llm = HuggingFacePipeline(pipeline=pipe)
print("LLM initialized")

Device set to use cuda:0


LLM initialized


# Set up retriever

In [41]:
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)

In [42]:
test_query = "What is protein, and what role does it play in the human body?"
retrieved_docs = retriever.get_relevant_documents(test_query)
print("\nTest retrieval results:")
for i, doc in enumerate(retrieved_docs):
    print(f"\nRetrieved Document {i+1} (Page {doc.metadata['page']}):")
    print(textwrap.fill(doc.page_content[:200] + "...", width=80))


Test retrieval results:

Retrieved Document 1 (Page 424):
Proteins are  the  “workhorses”  of the body  and  participate  in many  bodily
functions.  Proteins  come in all  sizes and  shapes and  each is  specifically
structured  for its  particular  funct...

Retrieved Document 2 (Page 404):
Defining Protein  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN
NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Protein makes up approximately 20
percent of the human body and  is present in e...

Retrieved Document 3 (Page 404):
about the c omponents of protein, the impor tant roles that protein  serves wi
thin the bod y, ho w the bod y uses pr otein, the risks and  consequences
associa ted wi th too much or too li ttle prote...

Retrieved Document 4 (Page 53):
Protein  Necessary for tissue formation, cell reparation, and  hormone and
enzyme production. It is essential for  building strong muscles and a healthy
immune system.  Carbohydrates  Provide a ready ...

Retrieved Document 5 (Page

# Create the QA chain

In [43]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)
print("\nQA chain created")


QA chain created


# Make queries

In [44]:
question = "Give information on vitamin K"
result = qa_chain({"query": question})

print("\nFinal QA Result:")
print("\nQuestion:", question)
print("\nAnswer:", result["result"])
print("\nSource Documents:")
for i, doc in enumerate(result["source_documents"], 1):
    print(f"\nDocument {i} (Page {doc.metadata['page']}):")
    print(textwrap.fill(doc.page_content, width=80))

Token indices sequence length is longer than the specified maximum sequence length for this model (1316 > 512). Running this sequence through the model will result in indexing errors



Final QA Result:

Question: Give information on vitamin K

Answer: Vitamin K r efers to a gr oup of fat-soluble vitamins that are similar in chemical structure. Vitamin K is critical for blood function acting as coenzymes which play an essential role in blood coagulation (aka blood clotting). Blood-clotting proteins are continuously circulating in the blood. Upon in jury to a blood v essel, pla telets stick to the dose of vitamin K. This pr actice has basically eliminated vitamin K- dependent bleeding disorders in babies.

Source Documents:

Document 1 (Page 585):
Food Serving Size Vitamin E (mg) Percent Daily Value  Sunflower seeds 1 oz. 7.4
37  Almonds 1 oz. 6.8 34  Sunflower oil 1 Tbsp 5.6 28  Hazelnuts 1 oz. 1 oz. 4.3
22  Peanut butter 2 Tbsp. 2.9 15  Peanuts 1 oz. 1 oz. 2.2 11  Corn oil 1 Tbsp. 1
Tbsp. 1.9 10  Kiwi 1 medium 1.1 6  Tomato 1 medium 0.7 4  Spinach 1 c. raw 0.6 3
Source: Die tary S upplement Fac t She et: Vi tamin E.N ational  Institutes o f
H ealth, Of fice o f Die 

In [45]:
question = "What is protein, and what role does it play in the human body?"
result = qa_chain({"query": question})

print("\nFinal QA Result:")
print("\nQuestion:", question)
print("\nAnswer:", result["result"])
print("\nSource Documents:")
for i, doc in enumerate(result["source_documents"], 1):
    print(f"\nDocument {i} (Page {doc.metadata['page']}):")
    print(textwrap.fill(doc.page_content, width=80))




Final QA Result:

Question: What is protein, and what role does it play in the human body?

Answer: macromolecules composed of amino acids

Source Documents:

Document 1 (Page 424):
Proteins are  the  “workhorses”  of the body  and  participate  in many  bodily
functions.  Proteins  come in all  sizes and  shapes and  each is  specifically
structured  for its  particular  function.  Protein’s Functions in the  Body
UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM AND
HUMAN NUTRITION PROGRAM  Structure and Motion  Figure 6.9 Collagen Structure
Protein’s Functions in the Body  |  383

Document 2 (Page 404):
Defining Protein  UNIVERSITY OF HAWAI‘I AT MĀNOA FOOD SCIENCE AND HUMAN
NUTRITION PROGRAM AND HUMAN NUTRITION PROGRAM  Protein makes up approximately 20
percent of the human body and  is present in e very sing le cell. The w ord
protein is a Gr eek word,  meaning “of utmost importance.” Proteins are called
the workhorses  of life as the y provide the bod y wit