In [3]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS

# Helpers to load PDF documents and split their text into small overlapping chunks on newline boundaries
def split_paragraphs(rwText, chunk_size=200, chunk_overlap=20):
    """Split raw text into overlapping chunks on newline boundaries.

    Args:
        rwText: Raw text to split (for example, the text extracted from one
            PDF page). ``None`` or an empty string yields an empty list.
        chunk_size: Target chunk size handed to ``CharacterTextSplitter``,
            measured with ``len``. Defaults to 200, the value that used to
            be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks handed to
            ``CharacterTextSplitter``. Defaults to 20, the value that used
            to be hard-coded here.

    Returns:
        A list of text chunks in their original order.
    """
    # A page without extractable text has nothing to split; return early so
    # a None/empty value never reaches the splitter.
    if not rwText:
        return []

    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return text_splitter.split_text(rwText)

def load_pdfs(pdfs):
    """Read every page of the given PDFs and return their text as chunks.

    Args:
        pdfs: Iterable of PDF sources accepted by ``PdfReader``.

    Returns:
        A flat list with the chunks produced by ``split_paragraphs`` for
        each page, in file order and then page order.
    """
    collected = []
    for pdf_source in pdfs:
        for pdf_page in PdfReader(pdf_source).pages:
            # Each page is split on its own, so no chunk spans two pages.
            collected.extend(split_paragraphs(pdf_page.extract_text()))
    return collected

list_of_pdfs = ['dietary_supplements.pdf']
text_chunks = load_pdfs(list_of_pdfs)

print(text_chunks[:5])  # Display the first 5 text chunks for verification

# Fail early with a clear message instead of an opaque error from the FAISS
# build when no text could be extracted (e.g. image-only PDFs).
if not text_chunks:
    raise ValueError(f"No text could be extracted from {list_of_pdfs!r}; nothing to index.")

# Create a FAISS vector store from the text chunks.
# Use the maintained `langchain_ollama` integration (the same package the
# notebook already uses for ChatOllama) rather than the legacy
# `langchain.embeddings.OllamaEmbeddings`, whose use triggers the warning
# recorded in this cell's output.
# NOTE(review): the two classes may not produce identical vectors -- rebuild
# any store saved with the legacy class instead of loading it with this one.
from langchain_ollama import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text", base_url="http://localhost:11434")

store = FAISS.from_texts(text_chunks, embeddings)
store.save_local("./myVectorStore")


['International  Journal  of \nEnvironmental Research\nand Public Health\nReview\nDietary Supplements—For Whom? The Current State of\nKnowledge about the Health Effects of Selected\nSupplement Use', 'Supplement Use\nRegina Ewa Wierzejska\n/gid00030/gid00035/gid00032/gid00030/gid00038/gid00001/gid00033/gid00042/gid00045 /gid00001\n/gid00048/gid00043/gid00031/gid00028/gid00047/gid00032/gid00046', 'Citation: Wierzejska, R.E. Dietary\nSupplements—For Whom? The\nCurrent State of Knowledge about the\nHealth Effects of Selected Supplement\nUse. Int. J. Environ. Res. Public Health', '2021 ,18, 8897. https://doi.org/\n10.3390/ijerph18178897\nAcademic Editor: Paul B. Tchounwou\nReceived: 15 July 2021\nAccepted: 21 August 2021\nPublished: 24 August 2021', 'Publisher’s Note: MDPI stays neutral\nwith regard to jurisdictional claims in\npublished maps and institutional afﬁl-\niations.\nCopyright: © 2021 by the author.\nLicensee MDPI, Basel, Switzerland.']


  embeddings = OllamaEmbeddings(model="nomic-embed-text", base_url="http://localhost:11434")


In [None]:
# Reload the FAISS store that an earlier cell saved to disk.
# `embeddings` is the embedding object created when the store was built.
db_name = "./myVectorStore"
vector_store = FAISS.load_local(
    db_name,
    embeddings,
    # NOTE(review): as the flag name says, this opts in to unsafe
    # deserialization -- only load stores you created yourself.
    allow_dangerous_deserialization=True,
)

In [21]:
# Retrieve the 3 chunks most similar to the question.
# `search_type` is an option of `as_retriever`, not of `similarity_search`
# (where it was silently swallowed as an extra keyword), so it is not passed.
question = "how to gain muscle mass?"
docs = vector_store.similarity_search(query=question, k=3)


In [22]:
# Display the documents returned by the similarity search above.
docs

[Document(id='3e896865-2bac-4338-8aec-72bb98cb88bf', metadata={}, page_content='in medicines or their analogs, which had not been tested for toxicology [ 135]. Apart from\nweight loss supplements, preparations for muscle building and sexual potency have also'),
 Document(id='d66beaa0-99b6-4e82-a484-9a96ffc35f8c', metadata={}, page_content='et al. [ 107] demonstrated that weight loss supplements are the most frequently sought di-\netary supplements on the Internet, followed by preparations for muscle building and sexual'),
 Document(id='578c8480-9bbf-42ec-88f4-530972cc076b', metadata={}, page_content='by accelerating excretion from the body or by producing dangerously high concentrations\nin the blood [88,104].\nThe abovementioned data indicate that reasonable supplementation, tailor-made for')]

In [23]:
# Wrap the vector store in a retriever configured for similarity search with
# k=3, then run it on the current question.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=3),
)
retriever.invoke(question)

[Document(id='3e896865-2bac-4338-8aec-72bb98cb88bf', metadata={}, page_content='in medicines or their analogs, which had not been tested for toxicology [ 135]. Apart from\nweight loss supplements, preparations for muscle building and sexual potency have also'),
 Document(id='d66beaa0-99b6-4e82-a484-9a96ffc35f8c', metadata={}, page_content='et al. [ 107] demonstrated that weight loss supplements are the most frequently sought di-\netary supplements on the Internet, followed by preparations for muscle building and sexual'),
 Document(id='578c8480-9bbf-42ec-88f4-530972cc076b', metadata={}, page_content='by accelerating excretion from the body or by producing dangerously high concentrations\nin the blood [88,104].\nThe abovementioned data indicate that reasonable supplementation, tailor-made for')]

In [24]:
# Ask a second question through the same retriever.
# Note: this rebinds the notebook-global `question`; `docs` still holds the
# results for the earlier muscle-mass question.
question = "how to lose weight?"
retriever.invoke(question)

[Document(id='8bf6b215-74a9-4a6c-912e-ced091e0ae7a', metadata={}, page_content='has become one of the greatest challenges of contemporary medicine. A person wishing to\nlose weight needs to undertake several difﬁcult life-changes and practice them consistently'),
 Document(id='82efda77-c649-4d4c-a27a-052d89bdbfdd', metadata={}, page_content='of “Russian roulette” [ 125]. If, despite the lack of evidence, patients wish to attempt to\nlose weight using dietary supplements, they should never purchase these products from'),
 Document(id='c4f76c0f-825a-43fc-9a7f-a9f6adc0d078', metadata={}, page_content='(diet, physical activity, addiction-free). Meanwhile dietary supplements are presented as\na compelling alternative to traditional methods for combatting obesity. Wr óbel-Harmas')]

In [25]:
# RAG with Ollama
from langchain_ollama import ChatOllama 
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough 
from langchain_core.prompts import ChatPromptTemplate

from langchain import hub

# The former `prompt = hub.pull("rlm/rag-prompt")` call was removed: its
# result was overwritten by the custom template in the next cell before any
# use, and it made the notebook depend on fetching a prompt from the hub.



In [26]:
# Custom RAG prompt: answer in bullet points, from the retrieved context only.
# The template text is passed straight to `from_template`, so `prompt` is only
# ever bound to the chat prompt template object.
prompt = ChatPromptTemplate.from_template(
    """
    You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know.
    Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.
    Question: {question} 
    Context: {context} 
    Answer:
"""
)

In [27]:
# Chat model used to generate answers, reached through the Ollama endpoint
# at the given base URL.
llm = ChatOllama(
    model="gemma3:1b",
    base_url="http://localhost:11434",
)

In [28]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents joined with a blank line between consecutive
        documents; an empty string when ``docs`` is empty.
    """
    contents = (item.page_content for item in docs)
    return '\n\n'.join(contents)

# Format the documents retrieved earlier (`docs`) into one context string.
# NOTE(review): `context` is not referenced by any later cell shown here; the
# RAG chain builds its own context via `retriever|format_docs`. This line may
# be a leftover that can be removed -- confirm.
context = format_docs(docs)

In [29]:
# Assemble the RAG pipeline. The value passed to `invoke` (the question) is
# fed to the retriever + formatter to produce "context" and passed through
# unchanged as "question"; the resulting mapping fills the prompt, goes to the
# chat model, and the model output is parsed into a plain string.
rag_chain = (
    {
        "context": retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [30]:
# Run the full RAG chain on the weight-loss question and print the answer.
question = "how to lose weight?"
response = rag_chain.invoke(question)

# print (rather than a bare expression) so newlines in the answer are rendered.
print(response)

Here’s a breakdown of how to lose weight, based solely on the provided context:

*   It’s a significant challenge of contemporary medicine.
*   It requires several difficult life-changes and consistent practice.
*   Dietary supplements are presented as a potential alternative to traditional methods.
*   They should never be purchased from unreliable sources.


In [31]:
# Run the full RAG chain on the muscle-mass question and print the answer.
# NOTE(review): same steps as the previous cell with a different question; a
# small helper taking the question as a parameter would avoid the copy.
question = "how to gain muscle mass?"
response = rag_chain.invoke(question)

# print (rather than a bare expression) so newlines in the answer are rendered.
print(response)

Here's a bulleted answer to the question "how to gain muscle mass?" based on the provided context:

*   Weight loss supplements are the most frequently sought dietary supplements on the internet.
*   Preparations for muscle building and sexual potency are also popular.
*   These supplements are believed to accelerate excretion from the body or produce dangerously high concentrations in the blood.
