In [5]:
import os
from dotenv import load_dotenv
import os
from langchain_community.retrievers import PineconeHybridSearchRetriever

In [6]:
load_dotenv()
PINECONE_API_KEY=os.getenv('PINECONE_API_KEY')

In [16]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Retrieve the Pinecone API key from environment variables
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Define the index name (use only lowercase letters, numbers, and hyphens)
index_name = "hybrid-search-pinecode-langchain"

# Initialize Pinecone with the API key
pc = Pinecone(api_key=PINECONE_API_KEY)

# Check if the index already exists
if index_name not in [index.name for index in pc.list_indexes()]:
    # Create the index if it does not exist
    pc.create_index(
        name=index_name,
        dimension=384,  # Dimension of the dense vector (using Hugging Face embeddings)
        metric="dotproduct",  # Sparse values supported only for dotproduct
        spec=ServerlessSpec(cloud='aws', region="us-east-1"),
    )

print(f"Index '{index_name}' is ready.")

Index 'hybrid-search-pinecode-langchain' is ready.


In [17]:
index=pc.Index(index_name)

In [18]:
index

<pinecone.data.index.Index at 0x234e4e45f50>

In [25]:
## Vector embedding And Sparse Metrix
from langchain.embeddings import HuggingFaceBgeEmbeddings # This HuggingFaceBgeEmbeddings is used to create dense vector
# Initialize the Hugging Face embeddings model
model_name = "all-MiniLM-L6-v2"
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name)
# Example: Embed a sample text
text = "This is a sample sentence."
vector = embeddings.embed_query(text)
print(vector)



[-4.445405920705525e-06, 0.14679904282093048, 0.05518879368901253, 0.017703330144286156, 0.042779624462127686, 0.07864180207252502, 0.05290604382753372, 0.016132432967424393, 0.042034972459077835, 0.016652770340442657, 0.051313821226358414, -0.06861812621355057, -0.011278809048235416, -0.024253731593489647, 0.12017810344696045, 0.009854670614004135, -0.014674332924187183, -0.03764597699046135, -0.032778043299913406, 0.01442735642194748, 0.08596458286046982, 0.07534053176641464, 0.04968412593007088, -0.02382204681634903, -0.02630639262497425, 0.0012724777916446328, 0.025696588680148125, -0.012068831361830235, 0.15250274538993835, -0.028572631999850273, -0.0167866051197052, 0.03342984616756439, 0.07145553827285767, 0.027607416734099388, 0.0741976946592331, 0.03960345312952995, -0.006842474453151226, 0.037319473922252655, 0.06753368675708771, -0.010548585094511509, 0.02467515505850315, -0.015932446345686913, 0.00819754134863615, 0.015393165871500969, -0.004537752363830805, -0.055178549140

In [27]:
len(vector)

384

In [28]:
embeddings

HuggingFaceBgeEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, query_instruction='Represent this question for searching relevant passages: ', embed_instruction='', show_progress=False)

BM25Encoder is responsible for create sparse metrix

In [30]:
from pinecone_text.sparse import BM25Encoder #This used internally TF-IDF Encoder
bm25ecoder=BM25Encoder().default()
bm25ecoder

<pinecone_text.sparse.bm25_encoder.BM25Encoder at 0x23483e65650>

In [34]:
sentenses=[
    "In 2023 I visited peris",
      "In 2022 I visited New York",
       "In 2021 I visited New Orleans"
]
# Applying TF-IDF values on these sentences
bm25ecoder.fit(sentenses)

## Store values to a JSON file
bm25ecoder.dump("bm25_values.json")

#load to your BM25Encoder object
bm25ecoder=BM25Encoder().load("bm25_values.json")

100%|██████████| 3/3 [00:00<00:00, 1510.19it/s]


In [35]:
retriver=PineconeHybridSearchRetriever(embeddings=embeddings,sparse_encoder=bm25ecoder,index=index)

In [37]:
retriver.add_texts(sentenses)

100%|██████████| 1/1 [00:03<00:00,  3.17s/it]


In [38]:
retriver.invoke("which city did I visit last")

[Document(page_content='In 2021 I visited New Orleans'),
 Document(page_content='In 2022 I visited New York'),
 Document(page_content='In 2023 I visited peris')]

In [39]:
retriver.invoke("which city did I visit recent")

[Document(page_content='In 2021 I visited New Orleans'),
 Document(page_content='In 2022 I visited New York'),
 Document(page_content='In 2023 I visited peris')]

In [43]:
query = "most recent city I visited"
results = retriver.get_relevant_documents(query)[:1]

for result in results:
    print(result.page_content)


In 2021 I visited New Orleans


In [45]:
import re

query = "which city did I visit last"
results = retriver.get_relevant_documents(query)

# Extract years and re-rank by most recent
def extract_year(text):
    match = re.search(r'\b(20\d{2})\b', text)
    return int(match.group(0)) if match else 0

sorted_results = sorted(results, key=lambda doc: extract_year(doc.page_content), reverse=True)

# Print the most relevant (most recent) result
if sorted_results:
    print(sorted_results[0].page_content)


In 2023 I visited peris
