In [1]:
import os
api_key = os.environ.get("OPENAI_API_KEY")

In [2]:
!pip install langchain chromadb faiss-cpu openai tiktoken langchain_openai langchain-community wikipedia



# Wikipedia Retriever

In [3]:
from langchain_community.retrievers import WikipediaRetriever

# Inititalize the Retriever (optional : set language and top_k)
retriever = WikipediaRetriever(top_k_results = 2,lang='en')

#Define the Query
query = " The geopolitical history of india and pakistan from te perspective of a chinese"

# Get Relevant Wikipedia Documents
docs = retriever.invoke(query)
print(docs)

[Document(metadata={'title': 'History of Punjab', 'summary': 'The History of Punjab is the history of the Punjab region which is a geopolitical, cultural, and historical region in the northwest of South Asia, comprising the Punjab province in Pakistan and the Punjab state in India. It is believed that the earliest evidence of human habitation in Punjab traces to the Soan valley of the Pothohar, between the Indus and the Jhelum rivers, where Soanian culture developed between 774,000 BC and 11,700 BC. This period goes back to the first interglacial period in the second Ice Age, from which remnants of stone and flint tools have been found.\nThe Punjab region was the site of one of the earliest cradle of civilizations, the Bronze Age Harrapan civilization that flourished from about 3000 B.C. and declined rapidly 1,000 years later, following the Indo-Aryan migrations that overran the region in waves between 1500 and 500 B.C. The migrating Indo-Aryan tribes gave rise to the Iron Age Vedic ci

In [5]:
## Print the Retrieved Content
for i,doc in enumerate(docs):
    print(f"\n-- Result {i+1} ---")
    print(f"Content: \n {doc.page_content}...")  # Truncate for display


-- Result 1 ---
Content: 
 The History of Punjab is the history of the Punjab region which is a geopolitical, cultural, and historical region in the northwest of South Asia, comprising the Punjab province in Pakistan and the Punjab state in India. It is believed that the earliest evidence of human habitation in Punjab traces to the Soan valley of the Pothohar, between the Indus and the Jhelum rivers, where Soanian culture developed between 774,000 BC and 11,700 BC. This period goes back to the first interglacial period in the second Ice Age, from which remnants of stone and flint tools have been found.
The Punjab region was the site of one of the earliest cradle of civilizations, the Bronze Age Harrapan civilization that flourished from about 3000 B.C. and declined rapidly 1,000 years later, following the Indo-Aryan migrations that overran the region in waves between 1500 and 500 B.C. The migrating Indo-Aryan tribes gave rise to the Iron Age Vedic civilization, which lasted till 500 B

# Vector Store Retriever

In [6]:
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import  Document

# Step 1: Your source documents
documents = [
    Document(page_content="LangChain helps developers build LLM applications easily."),
    Document(page_content="Chroma is a vector database optimized for LLM-based search."),
    Document(page_content="Embeddings convert text into high-dimensional vectors."),
    Document(page_content="OpenAI provides powerful embedding models."),
]

# Step 2: Initialize Embedding Model
embedding_model = OpenAIEmbeddings()

# Step 3: Create Chroma vector store in memory
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    collection_name="my_collection"
    
)

# Step 4: Convert vectorstore into a retriever
retriever = vectorstore.as_retriever(search_kwargs={"k":2})

In [8]:
query = "Why ChromaDB is used"
results = retriever.invoke(query)

In [9]:
for i,doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)


--- Result 1 ---
Chroma is a vector database optimized for LLM-based search.

--- Result 2 ---
LangChain helps developers build LLM applications easily.


In [13]:
results = vectorstore.similarity_search(query,k=4)
print(results)

[Document(metadata={}, page_content='Chroma is a vector database optimized for LLM-based search.'), Document(metadata={}, page_content='LangChain helps developers build LLM applications easily.'), Document(metadata={}, page_content='Embeddings convert text into high-dimensional vectors.'), Document(metadata={}, page_content='OpenAI provides powerful embedding models.')]


In [14]:
for i,doc in enumerate(results):
    print(f"\n ---Result {i+1} ---")
    print(doc.page_content)


 ---Result 1 ---
Chroma is a vector database optimized for LLM-based search.

 ---Result 2 ---
LangChain helps developers build LLM applications easily.

 ---Result 3 ---
Embeddings convert text into high-dimensional vectors.

 ---Result 4 ---
OpenAI provides powerful embedding models.


## MMR

In [17]:
# Sample documents
docs = [
    Document(page_content="LangChain makes it easy to work with LLMs."),
    Document(page_content="LangChain is used to build LLM based applications."),
    Document(page_content="Chroma is used to store and search document embeddings."),
    Document(page_content="Embeddings are vector representations of text."),
    Document(page_content="MMR helps you get diverse results when doing similarity search."),
    Document(page_content="LangChain supports Chroma, FAISS, Pinecone, and more."),
]

from langchain_community.vectorstores import FAISS

# Initialize OpenAI embeddings
embedding_model = OpenAIEmbeddings()

# Step 2: Create the FAISS vector store from documents
vectorstore = FAISS.from_documents(
    documents=docs,
    embedding=embedding_model
)

# Enable MMR in the retriever
retriever = vectorstore.as_retriever(
    search_type="mmr",                   # <-- This enables MMR
    search_kwargs={"k": 3, "lambda_mult": 0.5}  # k = top results, lambda_mult = relevance-diversity balance
)

In [18]:
query = "What is langchain?"
results = retriever.invoke(query)

In [19]:
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)


--- Result 1 ---
LangChain is used to build LLM based applications.

--- Result 2 ---
Embeddings are vector representations of text.

--- Result 3 ---
LangChain supports Chroma, FAISS, Pinecone, and more.


## Multiquery Retriever

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever


# Relevant health & wellness documents
all_docs = [
    Document(page_content="Regular walking boosts heart health and can reduce symptoms of depression.", metadata={"source": "H1"}),
    Document(page_content="Consuming leafy greens and fruits helps detox the body and improve longevity.", metadata={"source": "H2"}),
    Document(page_content="Deep sleep is crucial for cellular repair and emotional regulation.", metadata={"source": "H3"}),
    Document(page_content="Mindfulness and controlled breathing lower cortisol and improve mental clarity.", metadata={"source": "H4"}),
    Document(page_content="Drinking sufficient water throughout the day helps maintain metabolism and energy.", metadata={"source": "H5"}),
    Document(page_content="The solar energy system in modern homes helps balance electricity demand.", metadata={"source": "I1"}),
    Document(page_content="Python balances readability with power, making it a popular system design language.", metadata={"source": "I2"}),
    Document(page_content="Photosynthesis enables plants to produce energy by converting sunlight.", metadata={"source": "I3"}),
    Document(page_content="The 2022 FIFA World Cup was held in Qatar and drew global energy and excitement.", metadata={"source": "I4"}),
    Document(page_content="Black holes bend spacetime and store immense gravitational energy.", metadata={"source": "I5"}),
]

# Initialize OpenAI embeddings
embedding_model = OpenAIEmbeddings()

# Create FAISS vector store
vectorstore = FAISS.from_documents(documents=all_docs, embedding=embedding_model)

multiquery_retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
    llm=ChatOpenAI(model="gpt-3.5-turbo")
)

# Query
query = "How to improve energy levels and maintain balance?"

# Retrieve results
similarity_results = similarity_retriever.invoke(query)
multiquery_results= multiquery_retriever.invoke(query)

for i, doc in enumerate(similarity_results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)

print("*"*150)

for i, doc in enumerate(multiquery_results):
    print(f"\n--- Result {i+1} ---")
    print(doc.page_content)






ModuleNotFoundError: No module named 'langchain.retrievers'

In [23]:

import os
from dotenv import load_dotenv

load_dotenv()  


from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser


# Documents
all_docs = [
    Document(page_content="Regular walking boosts heart health and can reduce symptoms of depression.", metadata={"source": "H1"}),
    Document(page_content="Consuming leafy greens and fruits helps detox the body and improve longevity.", metadata={"source": "H2"}),
    Document(page_content="Deep sleep is crucial for cellular repair and emotional regulation.", metadata={"source": "H3"}),
    Document(page_content="Mindfulness and controlled breathing lower cortisol and improve mental clarity.", metadata={"source": "H4"}),
    Document(page_content="Drinking sufficient water throughout the day helps maintain metabolism and energy.", metadata={"source": "H5"}),
    Document(page_content="The solar energy system in modern homes helps balance electricity demand.", metadata={"source": "I1"}),
    Document(page_content="Python balances readability with power, making it a popular system design language.", metadata={"source": "I2"}),
    Document(page_content="Photosynthesis enables plants to produce energy by converting sunlight.", metadata={"source": "I3"}),
    Document(page_content="The 2022 FIFA World Cup was held in Qatar and drew global energy and excitement.", metadata={"source": "I4"}),
    Document(page_content="Black holes bend spacetime and store immense gravitational energy.", metadata={"source": "I5"}),
]


# Vector Store (FAISS)
embeddings = OpenAIEmbeddings()

vectorstore = FAISS.from_documents(
    documents=all_docs,
    embedding=embeddings
)


# Query
query = "How to improve energy levels and maintain balance?"


# 1Ô∏è‚É£ Simple Similarity Retrieval
similarity_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
similarity_results = similarity_retriever.invoke(query)


# 2Ô∏è‚É£ Multi-Query (LCEL replacement)
prompt = PromptTemplate.from_template(
    "Generate 3 different search queries related to:\n{question}"
)

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
parser = StrOutputParser()
query_chain = prompt | llm | parser

generated_queries = query_chain.invoke({"question": query}).split("\n")


multiquery_results = []
for q in generated_queries:
    multiquery_results.extend(
        vectorstore.similarity_search(q, k=3)
    )



# Output
print("üîπ Similarity Search Results")
for i, doc in enumerate(similarity_results, 1):
    print(f"\n--- Result {i} ---")
    print(doc.page_content)

print("\n" + "=" * 120)

print("üîπ Multi-Query Search Results")
for i, doc in enumerate(multiquery_results, 1):
    print(f"\n--- Result {i} ---")
    print(doc.page_content)


üîπ Similarity Search Results

--- Result 1 ---
Drinking sufficient water throughout the day helps maintain metabolism and energy.

--- Result 2 ---
Mindfulness and controlled breathing lower cortisol and improve mental clarity.

--- Result 3 ---
Regular walking boosts heart health and can reduce symptoms of depression.

--- Result 4 ---
Deep sleep is crucial for cellular repair and emotional regulation.

--- Result 5 ---
Consuming leafy greens and fruits helps detox the body and improve longevity.

üîπ Multi-Query Search Results

--- Result 1 ---
Drinking sufficient water throughout the day helps maintain metabolism and energy.

--- Result 2 ---
Consuming leafy greens and fruits helps detox the body and improve longevity.

--- Result 3 ---
Regular walking boosts heart health and can reduce symptoms of depression.

--- Result 4 ---
Drinking sufficient water throughout the day helps maintain metabolism and energy.

--- Result 5 ---
Consuming leafy greens and fruits helps detox the bod

## ContextualCompressionRetriever

In [30]:
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import StrOutputParser

# 1. Setup Data & Vectorstore
docs = [
    Document(page_content="The Grand Canyon is one of the most visited natural wonders. Photosynthesis is the process by which green plants convert sunlight into energy. The rocks date back millions of years.", metadata={"source": "Doc1"}),
    Document(page_content="In medieval Europe, castles were built for defense. The chlorophyll in plant cells captures sunlight during photosynthesis. Knights wore armor made of metal.", metadata={"source": "Doc2"}),
    Document(page_content="Basketball was invented by Dr. James Naismith. It was originally played with a soccer ball. NBA is now a global league.", metadata={"source": "Doc3"}),
    Document(page_content="The history of cinema began in the late 1800s. Silent films were the earliest form. Photosynthesis does not occur in animal cells.", metadata={"source": "Doc4"})
]

vectorstore = FAISS.from_documents(docs, OpenAIEmbeddings())
base_retriever = vectorstore.as_retriever(search_kwargs={'k': 3})

# 2. Define the Extraction Logic (The "Compressor" Replacement)
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=0)

extraction_prompt = PromptTemplate.from_template(
    "Given the following document, extract only the parts relevant to the question: '{query}'\n\n"
    "Document: {page_content}\n\n"
    "If no part is relevant, respond with 'NO_RELEVANT_INFO'."
)

def compress_docs(input_data):
    """Custom function to filter and shorten documents based on the query."""
    query = input_data["query"]
    documents = input_data["documents"]
    
    compressed_docs = []
    for doc in documents:
        # Run extraction for each document
        chain = extraction_prompt | llm | StrOutputParser()
        response = chain.invoke({"query": query, "page_content": doc.page_content})
        
        if "NO_RELEVANT_INFO" not in response:
            compressed_docs.append(Document(page_content=response, metadata=doc.metadata))
            
    return compressed_docs

# 3. The LCEL Chain
# This replaces ContextualCompressionRetriever
compression_chain = (
    {"query": RunnablePassthrough(), "documents": base_retriever}
    | RunnableLambda(compress_docs)
)

# 4. Execution
query = 'What is photosynthesis?'
results = compression_chain.invoke(query)

# Output
for i, doc in enumerate(results):
    print(f'\n--- Result {i+1} (Source: {doc.metadata["source"]}) ---')
    print(doc.page_content)


--- Result 1 (Source: Doc1) ---
Photosynthesis is the process by which green plants convert sunlight into energy.

--- Result 2 (Source: Doc2) ---
The chlorophyll in plant cells captures sunlight during photosynthesis.
