### HYBRID SEARCH -> Weaviate
### RERANKING -> Cohere API

PDF source: https://pressbooks.oer.hawaii.edu/humannutrition2/


In [None]:
%pip install weaviate-client

In [None]:
%pip install langchain
%pip install langchain-core langchain-community langhchain-huggingface

In [8]:
import os
from typing import Any

import weaviate
from weaviate.classes.init import Auth

In [13]:
# Read the cluster URL and API key from the environment so the notebook runs
# from a fresh kernel and no secret is ever stored in a cell.
# A missing variable raises KeyError naming the variable that must be set.
WEAVIATE_URL = os.environ["WEAVIATE_URL"]
WEAVIATE_API_KEY = os.environ["WEAVIATE_API_KEY"]

# Open the connection to the Weaviate Cloud cluster.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_API_KEY),
)

# Quick health check: prints True when the cluster is reachable and ready.
print(client.is_ready())

True


In [None]:
%pip install langchain_huggingface

In [14]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace, HuggingFacePipeline
# Text-generation endpoint for the Qwen instruct model hosted on Hugging Face.
llm = HuggingFaceEndpoint(repo_id="Qwen/Qwen3-4B-Instruct-2507", task="text-generation")

# Chat wrapper around the endpoint; `model` is the LLM used by the answer chains below.
model = ChatHuggingFace(llm=llm)

In [15]:
# Show every collection currently defined on the connected Weaviate cluster.
client.collections.list_all()

{'RAG': _CollectionConfigSimple(name='RAG', description='Documents for RAG', generative_config=None, properties=[_Property(name='content', description='The content of the paragraph', data_type=<DataType.TEXT: 'text'>, index_filterable=True, index_range_filters=False, index_searchable=True, nested_properties=None, tokenization=<Tokenization.WORD: 'word'>, vectorizer_config=None, vectorizer='none', vectorizer_configs=None)], references=[], reranker_config=None, vectorizer_config=None, vectorizer=<Vectorizers.NONE: 'none'>, vector_config=None, object_ttl_config=None)}

In [None]:
from weaviate.classes.config import Configure, Property, DataType

# Create the "RAG" collection only when it does not exist yet, so this cell can
# be re-run (Restart & Run All) without failing on an existing collection.
# Vectors are supplied by the client at insert/query time, hence no vectorizer.
if not client.collections.exists("RAG"):
    client.collections.create(
        name="RAG",
        description="Documents for RAG",
        properties=[
            Property(
                name="content",
                data_type=DataType.TEXT,
                description="The content of the paragraph",
            )
        ],
        vectorizer_config=Configure.Vectorizer.none(),
    )

In [49]:
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from pydantic import Field
from sentence_transformers import SentenceTransformer

class WeaviateHybridRetriever(BaseRetriever):
    """LangChain retriever that runs a Weaviate hybrid (keyword + vector) search.

    The query text is embedded client-side with ``embed_model`` and sent to
    Weaviate together with the raw query string.

    Attributes:
        collection: Weaviate collection handle; must expose ``query.hybrid``.
        embed_model: Encoder whose ``encode(text)`` result supports ``tolist()``.
        alpha: Hybrid weighting forwarded to Weaviate (0 = keyword only,
            1 = vector only).
        k: Maximum number of objects to return.
    """

    # ``Any`` (not the builtin function ``any``) is the correct "no validation" annotation.
    collection: Any = Field(exclude=True)
    embed_model: Any = Field(exclude=True)
    alpha: float = 0.5
    k: int = 5

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> list:
        """Return up to ``k`` documents matching ``query``.

        Args:
            query: Natural-language search string.
            run_manager: Optional LangChain callback manager. It is accepted so the
                method matches the ``BaseRetriever`` contract and is not used here.

        Returns:
            A list of ``Document`` objects whose ``page_content`` is the stored
            ``content`` property and whose metadata carries the Weaviate object id.
        """
        query_vector = self.embed_model.encode(query).tolist()

        response = self.collection.query.hybrid(
            query=query,
            vector=query_vector,
            alpha=self.alpha,
            limit=self.k,
        )

        return [
            Document(
                page_content=obj.properties["content"],
                metadata={"id": obj.uuid},
            )
            for obj in response.objects
        ]


In [50]:
from sentence_transformers import SentenceTransformer

# Query encoder. It is the same MiniLM checkpoint the ingestion cell uses to embed
# the PDF pages, so query and document vectors live in the same space.
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Fetch the collection handle here instead of relying on a `collection` variable
# that is only assigned in a later cell; this keeps the cell runnable top to bottom.
retriever = WeaviateHybridRetriever(
    collection=client.collections.get("RAG"),
    embed_model=embed_model,
    alpha=0.5,
    k=5,
)


In [45]:
from google.colab import files
# Open the Colab file-upload widget; `uploaded` holds whatever files.upload() returns.
uploaded = files.upload()

Saving 4.pdf to 4.pdf


In [33]:
# Path of the PDF that gets loaded and indexed below.
# NOTE(review): the upload cell's saved output reads "Saving 4.pdf to 4.pdf" —
# confirm this path actually points at the uploaded file.
filePath = "/content/text.pdf"

In [None]:
%pip install pypdf
%pip install langchain_community

In [21]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [34]:
def loadDocs(path):
  """Load the PDF at `path` with PyPDFLoader and return the loaded documents."""
  return PyPDFLoader(path).load()

In [35]:
# Load the PDF and report how many documents the loader produced.
docs = loadDocs(filePath)
print(len(docs))

1208


In [40]:
# Inspect the first loaded document (metadata and page_content).
docs[0]

Document(metadata={'producer': 'Prince 12.5 (www.princexml.com)', 'creator': 'Pressbooks 5.9.2', 'creationdate': '', 'title': 'Human Nutrition: 2020 Edition', 'source': '/content/text.pdf', 'total_pages': 1208, 'page': 0, 'page_label': '1'}, page_content='Human Nutrition: 2020 Edition')

In [None]:
%pip install sentence-transformers

In [45]:
from sentence_transformers import SentenceTransformer
# Encoder used to embed the PDF pages before they are stored in Weaviate.
embeddingModel = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

collection = client.collections.get("RAG")

# Blank pages carry nothing to retrieve, so they are skipped rather than stored
# as empty objects with a meaningless vector.
texts = [doc.page_content for doc in docs if doc.page_content.strip()]

# Encode all pages in one call instead of one model call per page.
vectors = embeddingModel.encode(texts)

# The batch context manager groups inserts into bulk requests instead of one
# network round-trip per page.
with collection.batch.dynamic() as batch:
    for text, vector in zip(texts, vectors):
        batch.add_object(
            properties={
                "content": text,
            },
            vector=vector.tolist(),
        )

# Surface import failures instead of unconditionally reporting success.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"⚠️ {len(failed_objects)} of {len(texts)} documents failed to import")
else:
    print("✅ PDF documents stored in Weaviate")


✅ PDF documents stored in Weaviate


In [63]:
# Smoke-test the hybrid retriever and build the numbered context string
# that the prompt expects.
results = retriever._get_relevant_documents("Metabolism Overview")
labelled_chunks = [
    f"Context chunk {i+1}:\n{doc.page_content}" for i, doc in enumerate(results)
]
context = "\n\n---\n\n".join(labelled_chunks)
context

'Context chunk 1:\nMetabolism Overview \nMetabolism is defined as the sum of all chemical reactions required \nto suppor t c ellular func tion and henc e the lif e o f an or ganism. \nMetabolism is ei ther c ategorized as c atabolism, r eferring to all \nmetabolic processes involved in molecule breakdown, or anabolism, \nwhich includes all me tabolic processes involved in building bigger \nmolecules. Gener ally, c atabolic pr ocesses r elease ener gy and \nanabolic pr ocesses c onsume ener gy. The o verall goals o f \nmetabolism ar e ener gy tr ansfer and ma tter tr ansport. Ener gy is \ntransformed from food macronutrients into cellular energy, which \nis used to perform cellular work. Metabolism transforms the matter \nof macr onutrients in to substanc es a c ell c an use to gr ow and \nreproduce and also into waste products. For example, enzymes are \nproteins and their job is to c atalyze chemic al r eactions. Ca talyze \nmeans to spe ed-up a chemic al r eaction and r educe the ene

In [56]:
# Output parser placed at the end of the answer chains (prompt | model | parser).
parser = StrOutputParser()

In [57]:
# Module-level question-answering prompt with `context` and `question` slots.
# The answer functions further down build their own local prompt, so this
# template is only used when referenced directly.
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""
    Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you do not have the relevant information needed to provide a verified answer, don't try to make up an answer.
    When providing an answer, aim for clarity and precision. Position yourself as a knowledgeable authority on the topic, but also be mindful to explain the information in a manner that is accessible and comprehensible to those without a technical background.
    Always say "Do you have any more questions pertaining to this instrument?" at the end of the answer.
    {context}
    Question: {question}
    Helpful Answer:
    """,
)

In [67]:
def generateResponse(query):
  """Answer `query` using context retrieved by the hybrid retriever.

  The documents are retrieved for the user's actual question, joined into a
  numbered context block, and passed to the LLM through a QA prompt.

  Args:
    query: The user's question.

  Returns:
    The model's answer as parsed by `parser`.
  """
  # Retrieve for the question that was asked, not a fixed sample query.
  results = retriever.invoke(query)
  context = "\n\n---\n\n".join(
      f"Context chunk {i+1}:\n{doc.page_content}"
      for i, doc in enumerate(results)
      )
  prompt = PromptTemplate(
       template =
       """
       You are an assistant for question-answering tasks.
       Use the following pieces of retrieved context to answer the question.
       If you don't know the answer, just say that you don't know. Use ten sentences maximum and keep the answer concise.
       <context>
       {context}
       </context>
       Question : {question}
       Answer:
       """,
       input_variables=["context", "question"]
       )
  chain = prompt | model | parser
  response = chain.invoke({
      "context": context,
      "question": query})
  return response

In [69]:
# Interactive Q&A loop: re-prompt on blank input, answer real questions, and
# stop as soon as the user declines to continue.
while True:
    query = input("Enter your question related to the document: ")

    if query.strip():
        print("\nGenerating answer...\n")
        answer = generateResponse(query)
        print("Answer:", answer)

        continue_chat = input("Do you want to ask another question? (yes/no): ").strip().lower()
        if continue_chat in ('yes', 'y'):
            continue

        print("\nThank you for using the document Q&A system. Goodbye!")
        break
    else:
        print("Please enter a valid question.\n")

Enter your question related to the document: What is metabolism?

Generating answer...

Answer: Metabolism is the sum of all chemical reactions that support cellular function and sustain life in an organism. It includes catabolism, the breakdown of molecules to release energy, and anabolism, the building of larger molecules that consume energy. Metabolism transforms food macronutrients into cellular energy (ATP) and substances needed for growth and reproduction. Energy is transferred from nutrients to ATP, which powers cellular work. Enzymes catalyze these reactions, speeding them up without being consumed. Metabolic pathways are sequences of enzyme-catalyzed reactions that convert substrates into end products. These pathways are regulated by energy status, hormones, and substrate levels. Glucose, fatty acids, and amino acids serve as fuel sources depending on activity and availability. Aerobic and anaerobic metabolism differ in oxygen use and ATP production, with aerobic metabolism be

In [None]:
%pip install -U langchain langchain-core langchain-community cohere


# RERANKING (COHERE API)


In [None]:
%pip install -U langchain-cohere cohere


In [88]:
from langchain_cohere import CohereRerank

# Cohere reranker configured with the English v3 rerank model and top_n=5.
# NOTE(review): the hybrid retriever in this notebook is built with k=5 as well,
# so the reranker presumably only reorders the same five candidates — consider
# retrieving more candidates than top_n; confirm the intended setup.
reranker = CohereRerank(
    model="rerank-english-v3.0",
    top_n=5,
)

In [84]:
from langchain_core.retrievers import BaseRetriever
from langchain_core.documents import Document
from pydantic import Field

class RerankRetriever(BaseRetriever):
    """Retriever that reranks the results of another retriever.

    Attributes:
        base_retriever: Retriever that produces the candidate documents.
        reranker: Object exposing ``compress_documents(documents=..., query=...)``
            that returns the candidates reordered/filtered by relevance.
    """

    base_retriever: BaseRetriever = Field(exclude=True)
    # ``Any`` (not the builtin function ``any``) is the correct "no validation" annotation.
    reranker: Any = Field(exclude=True)

    def _get_relevant_documents(self, query: str, *, run_manager=None) -> list:
        """Fetch candidates for ``query`` and return them reranked.

        Args:
            query: Natural-language search string.
            run_manager: Optional LangChain callback manager. It is accepted so the
                method matches the ``BaseRetriever`` contract and is not used here.

        Returns:
            The reranked documents as a list, or an empty list when the base
            retriever found nothing.
        """
        # Use the public entry point: calling the private method directly fails
        # for retrievers whose implementation requires ``run_manager``.
        candidates = self.base_retriever.invoke(query)

        # Nothing to rerank; skip the reranker call entirely.
        if not candidates:
            return []

        reranked_docs = self.reranker.compress_documents(
            documents=candidates,
            query=query,
        )

        return list(reranked_docs)




In [91]:
# Wrap the hybrid retriever so that its results are reranked by Cohere.
compression_retriever = RerankRetriever(
    base_retriever=retriever,
    reranker=reranker,
)

# Use a dedicated name for the reranked results: rebinding `docs` would overwrite
# the loaded PDF pages that the ingestion cell iterates over.
reranked_docs = compression_retriever.invoke(
    "Metabolism Overview"
)
reranked_docs

[Document(metadata={'id': _WeaviateUUIDInt('c3cf6032-9c03-4337-8baa-e3a206765088'), 'relevance_score': 0.9995159}, page_content='Metabolism Overview \nMetabolism is defined as the sum of all chemical reactions required \nto suppor t c ellular func tion and henc e the lif e o f an or ganism. \nMetabolism is ei ther c ategorized as c atabolism, r eferring to all \nmetabolic processes involved in molecule breakdown, or anabolism, \nwhich includes all me tabolic processes involved in building bigger \nmolecules. Gener ally, c atabolic pr ocesses r elease ener gy and \nanabolic pr ocesses c onsume ener gy. The o verall goals o f \nmetabolism ar e ener gy tr ansfer and ma tter tr ansport. Ener gy is \ntransformed from food macronutrients into cellular energy, which \nis used to perform cellular work. Metabolism transforms the matter \nof macr onutrients in to substanc es a c ell c an use to gr ow and \nreproduce and also into waste products. For example, enzymes are \nproteins and their job 

In [92]:
# Preview the reranked results. A dedicated name is used so the loaded PDF
# pages held in `docs` are not overwritten.
reranked_docs = compression_retriever.invoke(
    "Metabolism Overview"
)

# Print the first 300 characters of each hit, separated by a rule.
for d in reranked_docs:
    print(d.page_content[:300])
    print("-" * 50)

Metabolism Overview 
Metabolism is defined as the sum of all chemical reactions required 
to suppor t c ellular func tion and henc e the lif e o f an or ganism. 
Metabolism is ei ther c ategorized as c atabolism, r eferring to all 
metabolic processes involved in molecule breakdown, or anabolism, 
w
--------------------------------------------------
Domain; “Bag of sugar” by Evilestmark / Public Domain; “Drink 
milk butterfly” by Glitch / Public Domain 
10. Figure 4.5 Dietary Fiber reused “Apples” by gnokii / Public 
Domain; “Wheat kernel nutrition” by Jon C / CC BY-SA 3.0 
11. Figure 4.8 The Regulation of Glucose reused “Pancreas organ” 
by Za
--------------------------------------------------
Image by 
Allison 
Calabrese / 
CC BY 4.0 
The fuel sources for anaerobic and aerobic metabolism will change 
depending on the amoun t o f nutrien ts a vailable and the t ype o f 
metabolism. Glucose may come from blood g lucose (which is fr om 
dietary c arbohydrates or liv er g lycogen and g l

In [93]:
def generateResponseAfterReranking(query):
  """Answer `query` using context retrieved and then reranked.

  The documents are fetched through `compression_retriever` for the user's
  actual question, joined into a numbered context block, and passed to the LLM
  through a QA prompt.

  Args:
    query: The user's question.

  Returns:
    The model's answer as parsed by `parser`.
  """
  # Retrieve for the question that was asked, not a fixed sample query.
  results = compression_retriever.invoke(query)
  context = "\n\n---\n\n".join(
      f"Context chunk {i+1}:\n{doc.page_content}"
      for i, doc in enumerate(results)
      )
  prompt = PromptTemplate(
       template =
       """
       You are an assistant for question-answering tasks.
       Use the following pieces of retrieved context to answer the question.
       If you don't know the answer, just say that you don't know. Use ten sentences maximum and keep the answer concise.
       <context>
       {context}
       </context>
       Question : {question}
       Answer:
       """,
       input_variables=["context", "question"]
       )
  chain = prompt | model | parser
  response = chain.invoke({
      "context": context,
      "question": query})
  return response

In [None]:
# Interactive Q&A loop for the reranking pipeline: re-prompt on blank input,
# answer real questions, and stop when the user declines to continue.
while True:
    query = input("Enter your question related to the document: ")

    if not query.strip():
        print("Please enter a valid question.\n")
        continue

    print("\nGenerating answer...\n")
    # Use the reranking variant; this section exists to exercise that path.
    answer = generateResponseAfterReranking(query)
    print("Answer:", answer)
    continue_chat = input("Do you want to ask another question? (yes/no): ").strip().lower()

    if continue_chat not in ['yes', 'y']:
        print("\nThank you for using the document Q&A system. Goodbye!")
        break

Enter your question related to the document: What is protein?

Generating answer...

Answer: Protein is one of the three essential macronutrients used by the body for energy, growth, and repair. Although not a major energy source under normal conditions, small amounts of amino acids from protein are used for energy metabolism, especially when dietary energy intake is insufficient or during long endurance exercises. When proteins are broken down, the carbon molecules can be converted into glucose or used in aerobic metabolism to produce ATP. Protein also serves structural and functional roles in the body, such as enzyme production and immune response. While it is not the primary fuel source, it contributes to energy balance and metabolic regulation. The body can use amino acids for energy when glucose and fats are scarce. Protein breakdown releases nitrogen-containing compounds, which are processed and excreted. It is a vital component of cells and tissues. During prolonged activity, in