## Hybrid Search LangChain

In [15]:
!pip install pinecone pinecone-text pinecone-notebooks

Collecting pinecone
  Downloading pinecone-7.3.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting packaging<25.0,>=24.2 (from pinecone-plugin-assistant<2.0.0,>=1.6.0->pinecone)
  Using cached packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Downloading pinecone-7.3.0-py3-none-any.whl (587 kB)
   ---------------------------------------- 0.0/587.6 kB ? eta -:--:--
   ---------------------------------------- 587.6/587.6 kB 3.0 MB/s eta 0:00:00
Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl (239 kB)
Using cached packaging-24.2-py3-none-any.whl (65 kB)
Installing collected packages: packaging, pinecone-plugin-assistant, pinecone

  Attempting uninstall: packaging

    Found existing installation: packaging 23.2

    Uninstalling packaging-23.2:

      Successfully uninstalled packaging-23.2

   ----------------------------------------

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
chromadb 0.5.23 requires tokenizers<=0.20.3,>=0.13.2, but you have tokenizers 0.21.1 which is incompatible.
langchain-openai 0.0.8 requires langchain-core<0.2.0,>=0.1.27, but you have langchain-core 0.3.66 which is incompatible.


In [12]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# Pinecone credential read from the environment. Stop here with a clear
# message instead of handing None to the Pinecone client in a later cell.
api_key = os.getenv('PINECONE_API_KEY')
if not api_key:
    raise RuntimeError(
        'PINECONE_API_KEY is not set; define it in the environment or in a .env file.'
    )

In [16]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone, ServerlessSpec

# Name of the Pinecone index used throughout this notebook.
index_name = 'hybrid-search-langchain-pinecone'

# Pinecone client, authenticated with the key read from the environment.
pc = Pinecone(api_key=api_key)

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt to create it a second time.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
    pc.create_index(
        name=index_name,
        dimension=384,        # dimension of the dense vectors
        metric='dotproduct',  # sparse values are supported only for dotproduct
        spec=serverless_spec,
    )

In [17]:
# Handle to the index named by `index_name`; it is passed to the retriever below.
index = pc.Index(index_name)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Dense embedding model for the vector half of hybrid search
from langchain_huggingface import HuggingFaceEmbeddings

# os.environ accepts only strings, so assigning os.getenv(...) directly raises
# TypeError when HF_TOKEN is not configured. Set it only when a value exists.
hf_token = os.getenv('HF_TOKEN')
if hf_token is not None:
    os.environ['HF_TOKEN'] = hf_token

embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

In [20]:
from pinecone_text.sparse import BM25Encoder

# `default` is a factory on the class, so call it directly instead of first
# constructing a throwaway BM25Encoder instance.
bm25_encoder = BM25Encoder.default()

In [22]:
# Small demo corpus that is both fitted on and indexed.
sentences = [
    'In 2024, I visited Paris',
    'In 2023, I visited India',
    'In 2022, I visited USA',
]

# File used to persist the fitted encoder values.
bm25_values_path = 'bm25_values.json'

# Fit the sparse encoder on the demo sentences.
bm25_encoder.fit(sentences)

# Store the fitted values in a JSON file ...
bm25_encoder.dump(bm25_values_path)

# ... and load them back into a fresh BM25Encoder object.
bm25_encoder = BM25Encoder().load(bm25_values_path)

100%|██████████| 3/3 [00:00<00:00, 33.12it/s]


In [23]:
# Hybrid retriever: combines the dense embedding model, the sparse BM25
# encoder and the Pinecone index handle.
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
)

In [24]:
# Add the demo sentences to the index through the retriever.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:02<00:00,  2.48s/it]


In [27]:
# Run a query through the hybrid retriever; the cell output shows the matched
# documents, each with a 'score' entry in its metadata.
retriever.invoke('Where did i visited recently')

[Document(metadata={'score': 0.273100853}, page_content='In 2022, I visited USA'),
 Document(metadata={'score': 0.250942707}, page_content='In 2024, I visited Paris'),
 Document(metadata={'score': 0.25071764}, page_content='In 2023, I visited India')]