In [1]:
%pip install --upgrade --quiet  pinecone-client pinecone-text pinecone-notebooks

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from getpass import getpass

# Keep the Pinecone key out of the notebook source: read it from the
# PINECONE_API_KEY environment variable and, if that is unset or empty,
# prompt for it without echoing the input.
api_key = os.getenv("PINECONE_API_KEY") or getpass("Pinecone API key: ")

In [3]:
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [81]:
# Pinecone setup: build the client and make sure the hybrid-search index exists.

import os

from pinecone import Pinecone, ServerlessSpec

index_name = "langchain-pinecone-hybrid-search"

# Client authenticated with the key held in `api_key`.
pc = Pinecone(api_key=api_key)

# Create the index only when no index with this name exists yet, so the cell
# can be re-run safely.
existing_indexes = pc.list_indexes().names()
if index_name not in existing_indexes:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1536,  # dimensionality of dense model
        metric="dotproduct",  # sparse values supported only for dotproduct
        spec=serverless_spec,
    )

In [82]:
# Now that the index is created, we can use it.
# `pc.Index(index_name)` gives a handle to the index named above; that handle
# is what gets passed to the hybrid retriever further down.

index = pc.Index(index_name)



## Get embeddings and sparse encoders

In [83]:
import os
from getpass import getpass

from dotenv import load_dotenv

# Load variables from a local .env file (if present) into os.environ.
load_dotenv()
#os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")

# Assigning None into os.environ raises TypeError, so only touch the variable
# when it is missing or empty, and then ask for it without echoing the input.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [84]:
# Disabled alternative: Hugging Face sentence-transformer embeddings.
#from langchain_huggingface import HuggingFaceEmbeddings
#from langchain_community.embeddings import HuggingFaceEmbeddings

#embeddings=HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

from langchain_openai import OpenAIEmbeddings

# Dense embedding model, built with the library's default settings
# (presumably it picks up OPENAI_API_KEY from the environment — confirm).
# NOTE(review): the Pinecone index is created with dimension=1536, so the
# embedding model must produce 1536-dimensional vectors. Confirm this holds
# for the default model, and recreate the index with a matching dimension if
# the model is changed (e.g. to the Hugging Face alternative above).
embeddings = OpenAIEmbeddings()

In [85]:
## To encode the text to sparse values you can either choose SPLADE or BM25. For out of domain tasks we recommend using BM25.
# uses TFIDF encoder

from pinecone_text.sparse import BM25Encoder

# or from pinecone_text.sparse import SpladeEncoder if you wish to work with SPLADE

# use default tf-idf values
bm25_encoder = BM25Encoder().default()

In [86]:
# Display the encoder object as the cell output (shows its repr).
bm25_encoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x7fc80b13e380>

In [87]:
# Toy corpus used to fit the sparse encoder.
corpus = ["foo", "bar", "world", "hello"]

# Fit the encoder's tf-idf values on this corpus.
bm25_encoder.fit(corpus)

# Round-trip the fitted values through a JSON file: write them out, then load
# them into a fresh BM25Encoder that replaces the in-memory one.
bm25_params_path = "bm25_values.json"
bm25_encoder.dump(bm25_params_path)
bm25_encoder = BM25Encoder().load(bm25_params_path)

100%|██████████| 4/4 [00:00<00:00, 10987.04it/s]


In [76]:
#tfidf on sentences
#bm25_encoder.fit(corpus)

In [77]:
# store the values to a json file
#bm25_encoder.dump("bm25_values.json")


In [78]:
# load to your BM25Encoder object
#bm25_encoder = BM25Encoder().load("bm25_values.json")

# Load Retriever

In [88]:
# Wire the Pinecone index, the sparse BM25 encoder and the dense embedding
# model into a single hybrid-search retriever.
retriever = PineconeHybridSearchRetriever(
    index=index,
    sparse_encoder=bm25_encoder,
    embeddings=embeddings,
)

## Add texts

In [89]:
# Index the same texts the BM25 encoder was fitted on. Reusing `corpus`
# (defined in the fitting cell) instead of repeating the literal keeps the
# fitted vocabulary and the indexed documents from drifting apart.
retriever.add_texts(corpus)

100%|██████████| 1/1 [00:02<00:00,  2.05s/it]


# Use Retriever

In [90]:
# Run a hybrid query against the index through the retriever.
result = retriever.invoke("foo")
# Show the first returned document; indexing [0] assumes at least one match.
result[0]

Document(page_content='foo')