In [56]:

import os
from dotenv import load_dotenv
load_dotenv()

from pinecone import (
    Pinecone,
    ServerlessSpec,
    CloudProvider,
    AwsRegion,
    VectorType
)

from langchain_huggingface import HuggingFaceEmbeddings
from pinecone_text.sparse import BM25Encoder
from langchain_community.retrievers import PineconeHybridSearchRetriever



In [None]:
# Read the Pinecone API key from the environment instead of hardcoding a
# secret in the notebook. load_dotenv() in the imports cell has already loaded
# the .env file, so a `PINECONE_API_KEY=...` entry there is picked up here.
# Falls back to "" (the previous hardcoded value) when the variable is unset.
api_key = os.getenv("PINECONE_API_KEY", "")

In [58]:
# Create the Pinecone client, authenticated with the key defined in the
# previous cell. Every index operation below goes through `pc`.
pc = Pinecone(api_key=api_key)

In [59]:
# Name of the Pinecone index this notebook creates and then connects to.
index_name = "hybrid-search-langchain-pinecone-v2"

In [60]:
# Create the serverless index only when it does not exist yet, so this cell
# can be re-run (e.g. Restart Kernel -> Run All) without failing because the
# index is already there.
#   - dimension=384 has to match the vector size of the dense embedding model
#     configured further down (all-MiniLM-L6-v2).
#   - metric="dotproduct" is the metric Pinecone requires for sparse-dense
#     (hybrid) queries.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="dotproduct",
        spec=ServerlessSpec(
            cloud=CloudProvider.AWS,
            region=AwsRegion.US_EAST_1,
        ),
    )

# Show the index description as the cell output, whether the index was just
# created or already existed.
pc.describe_index(index_name)

{
    "name": "hybrid-search-langchain-pinecone-v2",
    "metric": "dotproduct",
    "host": "hybrid-search-langchain-pinecone-v2-3od08hd.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [61]:
# Get a handle to the index by name; the retriever below is built on it.
index = pc.Index(index_name)

# Bare expression: display the handle's repr as the cell output.
index

<pinecone.db_data.index.Index at 0x2498080cbf0>

In [62]:
# Hugging Face token, taken from the environment -- make sure HF_TOKEN is set
# in your .env file.
# NOTE(review): HF_TOKEN is not passed to anything in the visible cells;
# confirm whether a later cell relies on it.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Dense embedding model used for the semantic half of the hybrid search.
embeddings = HuggingFaceEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

In [63]:
# Sparse encoder (BM25).
# Start from a plain, unfitted encoder: the next cell fits it on this
# notebook's own corpus. The previous `BM25Encoder().default()` built a
# throwaway instance just to call the static `default()` factory, which loads
# pretrained parameters that `fit()` then overwrites.
bm25_encoder = BM25Encoder()

In [64]:
# Tiny demo corpus: one sentence per trip. The same list is used to fit the
# BM25 encoder here and is added to the index through the retriever later.
sentences = [
    "In 2023, I visited Paris",
    "In 2022, I visited New York",
    "In 2021, I visited New Orleans",
]
# Fit the sparse encoder on the corpus. As the last expression of the cell,
# the call's return value (the encoder itself, per the output below) is shown.
bm25_encoder.fit(sentences)

100%|██████████| 3/3 [00:00<00:00, 3016.76it/s]


<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x249537bea50>

In [65]:
# Write the encoder's state to bm25_values.json in the working directory
# (presumably the fitted BM25 parameters, so they can be reloaded later --
# nothing in the visible cells reads this file back).
bm25_encoder.dump("bm25_values.json")

In [66]:
# Combine the Pinecone index handle, the dense embedding model and the BM25
# sparse encoder into a single hybrid-search retriever.
retriever = PineconeHybridSearchRetriever(
    index=index,
    embeddings=embeddings,
    sparse_encoder=bm25_encoder,
)

In [67]:
# Add the demo sentences to the index through the retriever (the progress bar
# in the output shows a single batch being processed).
retriever.add_texts(sentences)

100%|██████████| 1/1 [00:02<00:00,  2.40s/it]


In [78]:
# Run a natural-language query through the hybrid retriever; the matching
# documents are kept in `results` for inspection in the next cell.
results = retriever.invoke("Which city did i visit first")

In [79]:
# Bare expression: display the retrieved Document objects (each carries a
# 'score' in its metadata, as shown in the output).
results

[Document(metadata={'score': 0.239503518}, page_content='In 2021, I visited New Orleans'),
 Document(metadata={'score': 0.232040539}, page_content='In 2022, I visited New York'),
 Document(metadata={'score': 0.220621794}, page_content='In 2023, I visited Paris')]