<a href="https://colab.research.google.com/github/abinavrameshs/LLM-Playground/blob/main/Agentic_RAG_Performing_Hybrid_Search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [106]:
!pip install smolagents pandas langchain langchain-community sentence-transformers datasets python-dotenv rank_bm25 --upgrade -q

In [107]:
! pip install --upgrade  pinecone-client pinecone-text pinecone-notebooks -q

In [108]:
from datasets import load_dataset
from google.colab import userdata
import datasets
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.retrievers import BM25Retriever
import os
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone_text.sparse import BM25Encoder

import nltk
from smolagents import HfApiModel, CodeAgent
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone, ServerlessSpec

In [109]:
# Fetch the NLTK "punkt_tab" tokenizer data (reported as up-to-date when already cached).
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [110]:
# Load the first 100 rows of a single chunk file from the MedRAG PubMed dataset.
ds = load_dataset(
    "MedRAG/pubmed",
    data_files="chunk/pubmed23n0001.jsonl",
    split="train[:100]",
)
ds

Dataset({
    features: ['id', 'title', 'content', 'contents', 'PMID'],
    num_rows: 100
})

In [111]:

index_name = 'hybrid-search-langchain'

# Pinecone client; the API key comes from the Colab secrets store.
pc = Pinecone(api_key=userdata.get('PINECONE_API_KEY'))

# Create the index only when it is not already present.
existing_index_names = pc.list_indexes().names()
if index_name in existing_index_names:
  print('Index already exists')
else:
  pc.create_index(
    name=index_name,
    dimension=384,  # must equal the output size of the dense embedding model
    metric='dotproduct',  # sparse values supported only for dot product
    spec=ServerlessSpec(cloud="aws", region="us-east-1"),
  )
  print('Index created')


Index already exists


In [112]:
# Show the indexes visible to this client, to confirm the target index is there and ready.
pc.list_indexes()

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 384,
              'host': 'hybrid-search-langchain-28adf09.svc.aped-4627-b74a.pinecone.io',
              'metric': 'dotproduct',
              'name': 'hybrid-search-langchain',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}},
             {'deletion_protection': 'disabled',
              'dimension': 768,
              'host': 'abstractive-question-answering-28adf09.svc.gcp-starter.pinecone.io',
              'metric': 'cosine',
              'name': 'abstractive-question-answering',
              'spec': {'pod': {'environment': 'gcp-starter',
                               'pod_type': 'starter',
                               'pods': 1,
                               'replicas': 1,
                               'shards': 1}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [113]:
# Handle to the hybrid-search index, followed by a quick look at what it currently holds.
index = pc.Index(name=index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [114]:
# Dense encoder used for the semantic half of the hybrid search.
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=embedding_model_name)
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [115]:
# Wrap each dataset row in a LangChain Document: `contents` becomes the text,
# while `id`, `title` and `PMID` are carried along as metadata.
source_docs = []
for row in ds:
    row_metadata = {
        "id": row["id"],
        "title": row["title"],
        "PMID": row["PMID"],
    }
    source_docs.append(Document(page_content=row["contents"], metadata=row_metadata))

In [55]:
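# NOTE: chunking is currently disabled, so `source_docs` are indexed as-is
# (the dataset file lives under `chunk/`, so rows are presumably already chunked; confirm).
# text_splitter = RecursiveCharacterTextSplitter(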
#     chunk_size=500,
#     chunk_overlap=50,
#     add_start_index=True,
#     strip_whitespace=True,
#     separators=["\n\n", "\n", ".", " ", ""],
# )
# docs_processed = text_splitter.split_documents(source_docs)

In [116]:
# Sparse encoder for the keyword half of the hybrid search.
# NOTE(review): `fit` presumably replaces the statistics loaded by `default()`;
# confirm whether starting from the default parameters is actually needed.
bm25_encoder = BM25Encoder.default()

# Fit the BM25 statistics on the texts that will be indexed.
corpus_texts = [doc.page_content for doc in source_docs]
bm25_encoder.fit(corpus_texts)

  0%|          | 0/100 [00:00<?, ?it/s]

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7be8cdb18e90>

In [122]:
# Hybrid retriever: dense vectors come from `embeddings`, sparse vectors from `bm25_encoder`.
# The variable keeps its original (misspelt) name so any existing references keep working.
retreiver = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

# Upsert the texts together with their metadata (id / title / PMID) so retrieved
# documents can be traced back to their source. Fresh dicts are passed so that
# `source_docs` is not modified, and None values are dropped.
# NOTE(review): Pinecone presumably rejects null metadata values; confirm.
retreiver.add_texts(
    texts=[doc.page_content for doc in source_docs],
    metadatas=[
        {key: value for key, value in doc.metadata.items() if value is not None}
        for doc in source_docs
    ],
)

  0%|          | 0/4 [00:00<?, ?it/s]

In [123]:
from smolagents import Tool

class HybridRetrieverTool(Tool):
    """Agent tool that answers a query with documents from a Pinecone hybrid search.

    The tool wraps a ``PineconeHybridSearchRetriever`` and formats the documents
    it returns into a single string for the agent to read.
    """

    name = "retriever"
    description = "Uses Hybrid search to retrieve the parts of documentation that could be most relevant to answer your query."
    inputs = {
        "query": {
            "type": "string",
            "description": "The query to perform. This should be semantically close to your target documents. Use the affirmative form rather than a question.",
        }
    }
    output_type = "string"

    def __init__(self, docs=None, retriever=None, **kwargs):
        """Create the tool.

        Args:
            docs: Accepted for backward compatibility and ignored; documents are
                read from the Pinecone index, not from this argument.
            retriever: Optional retriever exposing ``invoke(query)``. When omitted,
                a ``PineconeHybridSearchRetriever`` is built from the notebook-level
                ``embeddings``, ``bm25_encoder`` and ``index`` objects.
            **kwargs: Forwarded to ``Tool.__init__``.
        """
        super().__init__(**kwargs)
        if retriever is None:
            # Fall back to the objects defined in earlier notebook cells.
            retriever = PineconeHybridSearchRetriever(
                embeddings=embeddings,
                sparse_encoder=bm25_encoder,
                index=index,
            )
        self.retriever = retriever

    def forward(self, query: str) -> str:
        """Run the hybrid search for ``query`` and return the hits as one string.

        Args:
            query: Search text; must be a ``str``.

        Returns:
            A header line followed by one ``===== Document i =====`` section per
            retrieved document, containing that document's ``page_content``.

        Raises:
            AssertionError: If ``query`` is not a string.
        """
        assert isinstance(query, str), "Your search query must be a string"

        docs = self.retriever.invoke(query)
        sections = (
            f"\n\n===== Document {i} =====\n" + doc.page_content
            for i, doc in enumerate(docs)
        )
        return "\nRetrieved documents:\n" + "".join(sections)

# Tool instance that is handed to the agent.
retriever_tool = HybridRetrieverTool(docs=source_docs)

In [124]:

# Code-writing agent whose only tool is the hybrid retriever.
agent = CodeAgent(
    tools=[retriever_tool],
    model=HfApiModel(),
    max_steps=4,
    verbosity_level=2,
)

In [125]:
# Ask the agent a question and show its final answer.
question = "Can you tell me about Biochemical studies on camomile components???"
agent_output = agent.run(question)

print("Final output:", agent_output, sep="\n")

Final output:
Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol. (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost.


Links:

- Hybrid Search: https://www.youtube.com/watch?v=CK0ExcCWDP4
- Agentic RAG: https://huggingface.co/docs/smolagents/examples/rag


