## Hybrid Search

In [24]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
api_key = os.getenv("PINECONE_API_KEY")

# Fail fast with an actionable message instead of letting a missing or empty
# key surface later inside the Pinecone client.
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your environment or .env file."
    )

In [25]:
from langchain_community.retrievers import PineconeHybridSearchRetriever
from pinecone import Pinecone, ServerlessSpec
# Name of the Pinecone index used throughout this notebook.
index_name = "hybrid-search-langchain-pinecone"

# Instantiate the Pinecone client with the API key read from the environment.
pc = Pinecone(api_key=api_key)

In [26]:
# Create the index only when no index with this name exists yet, so the cell
# can be re-run safely.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Size of the dense vectors; must match the embedding model's output
        # (presumably 768 for the embedder used in this notebook -- confirm).
        dimension=768,
        # Sparse values are supported only with the dot-product metric.
        metric="dotproduct",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

In [27]:
# Obtain a client handle for the index identified by `index_name`.
index = pc.Index(index_name)
index  # echo the handle as the cell output

<pinecone.db_data.index.Index at 0x2b055c6e870>

In [28]:
# Dense side of the hybrid search: Ollama embeddings using "nomic-embed-text".
from langchain_community.embeddings import OllamaEmbeddings

embeddings = OllamaEmbeddings(model="nomic-embed-text")
embeddings  # echo the configured embedder as the cell output

OllamaEmbeddings(base_url='http://localhost:11434', model='nomic-embed-text', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [29]:
# Sparse (keyword) side of the hybrid search: a BM25 encoder.
from pinecone_text.sparse import BM25Encoder

# `default()` is a factory on the class itself, so no instance needs to be
# constructed before calling it.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x2b055bfca10>

In [30]:
# Tiny sample corpus used to fit the sparse encoder.
sentences = [
    "In 2023, I visited Paris",
    "In 2022, I visited New York",
    "In 2021, I visited New Orleans",
]

# Fit the BM25 encoder's term statistics on the sample sentences.
bm25_encoder.fit(sentences)

# Persist the fitted values to a JSON file ...
bm25_encoder.dump("bm25_values.json")

# ... and read them back into a fresh BM25Encoder object.
bm25_encoder = BM25Encoder().load("bm25_values.json")

100%|██████████| 3/3 [00:00<00:00, 3005.95it/s]


In [31]:
# Combine the dense embedder, the BM25 sparse encoder, and the Pinecone index
# into a single hybrid-search retriever.
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

In [32]:
# Echo the retriever so the output shows its configured embeddings, sparse encoder, and index.
retriever

PineconeHybridSearchRetriever(embeddings=OllamaEmbeddings(base_url='http://localhost:11434', model='nomic-embed-text', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x000002B055BB1520>, index=<pinecone.db_data.index.Index object at 0x000002B055C6E870>)

In [33]:
# Index the same corpus the BM25 encoder was fitted on. Reusing `sentences`
# (rather than repeating the literals) keeps the fitted corpus and the stored
# documents from drifting apart.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:10<00:00, 10.26s/it]
