In [1]:
import os

from dotenv import load_dotenv

# Read key/value pairs from a local .env file into the process environment.
load_dotenv()

# Pinecone credential taken from the environment; None when the variable is absent.
api_key = os.environ.get("PINECONE_API_KEY")

In [2]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [3]:
from pinecone import Pinecone, ServerlessSpec

index_name = "hybrid-search-langchain-pinecone"

# Initialize the Pinecone client with the key read from the environment.
pc = Pinecone(api_key=api_key)

# Create the index only when it does not exist yet.
# `list_indexes()` yields index description objects rather than plain strings,
# so testing `index_name not in pc.list_indexes()` is always true and a re-run
# of this cell would call `create_index` on an existing index and fail.
# Compare against the list of names instead.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must match the dense vector size: the all-MiniLM-L6-v2 model used
        # for the embeddings in this notebook produces 384-dimensional vectors.
        dimension=384,
        # Sparse values are supported only with the dotproduct metric.
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [5]:
# Get a handle to the index by name and display its current statistics
# (the last expression of the cell is rendered as the cell output).
index = pc.Index(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'dotproduct',
 'namespaces': {},
 'total_vector_count': 0,
 'vector_type': 'dense'}

In [None]:
### Vector embeddings
from langchain_huggingface import HuggingFaceEmbeddings

# `os.environ` accepts only string values, so assigning `os.getenv(...)`
# directly raises TypeError when HF_TOKEN is not defined. Re-export the token
# only when a value is actually present.
hf_token = os.getenv("HF_TOKEN")
if hf_token is not None:
    os.environ["HF_TOKEN"] = hf_token

# Dense embedding model; its 384-dimensional output must match the index dimension.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
embeddings

HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [10]:
### Sparse encoder

from pinecone_text.sparse import BM25Encoder  ### Encodes text into sparse vectors using BM25 term weighting

# `default()` is a factory that returns a ready-made encoder, so it is called
# on the class itself; going through `BM25Encoder()` first only builds an
# extra instance that is immediately thrown away.
bm25_encoder = BM25Encoder.default()
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x27efd4dbd90>

In [11]:
# Tiny demo corpus that is both fitted on and indexed.
sentences = [
    "In 2023, I visited Paris",
    "In 2022, I visited New York",
    "In 2021, I visited London",
]

# Fit the BM25 encoder on the corpus above.
bm25_encoder.fit(sentences)

# Persist the fitted encoder state to JSON, then rebuild the encoder from that file.
bm25_params_file = "bm25_encoder.json"
bm25_encoder.dump(bm25_params_file)
bm25_encoder = BM25Encoder().load(bm25_params_file)

100%|██████████| 3/3 [00:00<00:00, 53.98it/s]


In [12]:
#### Hybrid retriever: dense embeddings + sparse BM25 encoder over the Pinecone index
retriever = PineconeHybridSearchRetriever(
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
    index=index,
)

In [13]:
# Display the retriever's repr to confirm which embeddings, encoder and index it holds.
retriever

PineconeHybridSearchRetriever(embeddings=HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False), sparse_encoder=<pinecone_text.sparse.bm25_encoder.BM25Encoder object at 0x0000027EFDB03490>, index=<pinecone.db_data.index.Index object at 0x0000027ED553B520>)

In [14]:
# Add the demo sentences through the retriever so they can be searched below.
# NOTE(review): presumably this encodes each text (dense + sparse) and upserts it into `index` — confirm.
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:01<00:00,  1.89s/it]


In [15]:
# Query the retriever; the result is a list of Documents with a relevance score in metadata.
retriever.invoke("Which city did I visit in 2023?")

[Document(metadata={'score': 0.480033755}, page_content='In 2023, I visited Paris'),
 Document(metadata={'score': 0.363877326}, page_content='In 2022, I visited New York'),
 Document(metadata={'score': 0.350818694}, page_content='In 2021, I visited London')]

In [16]:
# A query with no literal year: results are ranked by similarity score only,
# so the ordering should not be read as temporal reasoning about "1st".
retriever.invoke("which city I visited 1st")

[Document(metadata={'score': 0.266758025}, page_content='In 2021, I visited London'),
 Document(metadata={'score': 0.22342664}, page_content='In 2023, I visited Paris'),
 Document(metadata={'score': 0.220567271}, page_content='In 2022, I visited New York')]