In [25]:
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_cohere import CohereRerank

from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, SystemMessage
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
chunks = [
    # Tesla - Financial & Production
    "Tesla reported record quarterly revenue of $25.2 billion in Q3 2024.",
    "Tesla's automotive gross margin improved to 19.3% this quarter.",
    "Tesla Cybertruck production ramp begins in 2024 with initial deliveries.",
    "Tesla announced plans to expand Gigafactory production capacity.",
    "Tesla stock price reached new highs following earnings announcement.",
    "Tesla's energy storage business grew 40% year-over-year.",
    "Tesla continues to lead in electric vehicle market share globally.",
    "Tesla Model Y became the best-selling vehicle worldwide.",
    "Tesla reported strong free cash flow generation of $7.5 billion.",
    "Tesla's Full Self-Driving revenue increased significantly.",
    
    # Microsoft - Development & Acquisitions
    "Microsoft acquired GitHub for $7.5 billion in 2018.",
    "Microsoft's cloud revenue Azure grew 29% year-over-year.",
    "Microsoft announced new AI features for Visual Studio Code.",
    "Microsoft Teams integration with GitHub enhances developer workflow.",
    "Microsoft's developer tools division sees strong adoption.",
    "Microsoft acquired Activision Blizzard for $68.7 billion.",
    "Microsoft's productivity suite gained 50 million new users.",
    "Microsoft announced new Surface devices for developers.",
    "Microsoft's AI Copilot features expand to more development tools.",
    "Microsoft's enterprise solutions drive revenue growth.",
    
    # NVIDIA - AI & Hardware
    "NVIDIA's data center revenue reached $47.5 billion annually.",
    "NVIDIA's H100 GPUs see unprecedented demand for AI training.",
    "NVIDIA announced next-generation Blackwell architecture.",
    "NVIDIA's gaming revenue declined due to crypto market changes.",
    "NVIDIA's automotive AI platform partnerships expanded.",
    "NVIDIA's AI chip shortage affects cloud providers.",
    "NVIDIA stock valuation exceeds $2 trillion market cap.",
    "NVIDIA's CUDA platform dominates AI development.",
    "NVIDIA announced new AI inference chips for edge computing.",
    "NVIDIA's partnership with major cloud providers strengthens.",
    
    # Google/Alphabet - AI & Cloud
    "Google's AI investments total over $100 billion in recent years.",
    "Google Cloud revenue grew 35% reaching $8.4 billion quarterly.",
    "Google announced Gemini AI model competing with GPT-4.",
    "Google's search advertising revenue remains strong at $59 billion.",
    "Google's Workspace products integrate advanced AI features.",
    "Google announced quantum computing breakthroughs.",
    "Google's autonomous vehicle division Waymo expands operations.",
    "Google's AI research published breakthrough papers.",
    "Google's cloud AI services see enterprise adoption.",
    "Google faces regulatory scrutiny over AI dominance.",
    
    # Noisy/Less Relevant Chunks
    "The Tesla coil was invented by Nikola Tesla in 1891.",
    "Microsoft Excel spreadsheet formulas can be complex for beginners.",
    "NVIDIA Shield TV streaming device gets software update.",
    "Google Maps navigation improved with real-time traffic data.",
    "Production delays affected multiple manufacturing sectors.",
    "Financial markets showed volatility during earnings season.",
    "Revenue recognition standards changed for software companies.",
    "Hardware components face supply chain constraints globally.",
    "Development tools market grows with remote work trends.",
    "AI research requires significant computational resources.",
    "Quarterly reports show mixed results across tech sector.",
    "Stock market analysts upgrade technology sector ratings.",
    "Cloud computing adoption accelerates in enterprise market.",
    "Data center construction increases globally.",
    "Semiconductor shortage impacts various industries.",
    "Electric vehicle charging infrastructure expands rapidly.",
    "Software development productivity tools gain popularity.",
    "Machine learning frameworks become more accessible.",
    "Enterprise software licensing models evolve.",
    "Technology conferences showcase latest innovations."
]

print(f"Created {len(chunks)} sample chunks for demonstration")

Created 60 sample chunks for demonstration


In [6]:
# Convert to Document objects for LangChain
documents = [Document(page_content=chunk, metadata={"source": f"chunk_{i}"}) for i, chunk in enumerate(chunks)]

print("Sample Data:")
for i, chunk in enumerate(chunks, 1):
    print(f"{i}. {chunk}")

print("\n" + "="*80)

Sample Data:
1. Tesla reported record quarterly revenue of $25.2 billion in Q3 2024.
2. Tesla's automotive gross margin improved to 19.3% this quarter.
3. Tesla Cybertruck production ramp begins in 2024 with initial deliveries.
4. Tesla announced plans to expand Gigafactory production capacity.
5. Tesla stock price reached new highs following earnings announcement.
6. Tesla's energy storage business grew 40% year-over-year.
7. Tesla continues to lead in electric vehicle market share globally.
8. Tesla Model Y became the best-selling vehicle worldwide.
9. Tesla reported strong free cash flow generation of $7.5 billion.
10. Tesla's Full Self-Driving revenue increased significantly.
11. Microsoft acquired GitHub for $7.5 billion in 2018.
12. Microsoft's cloud revenue Azure grew 29% year-over-year.
13. Microsoft announced new AI features for Visual Studio Code.
14. Microsoft Teams integration with GitHub enhances developer workflow.
15. Microsoft's developer tools division sees strong adop

# Vector Retriever

In [7]:
model = HuggingFaceEmbeddings(model="intfloat/e5-large-v2")
print(f"Creating a sample vectorstore....")
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=model,
    persist_directory="db/dummy_db_reranker",
    collection_metadata={"hnsw:space": "cosine"}
)

Creating a sample vectorstore....


In [14]:
vector_retriever = vectorstore.as_retriever(search_kwargs={"k": 15})
query = "How does Tesla earn money"

relevant_docs = vector_retriever.invoke(query)

In [15]:
for doc in relevant_docs:
    print(doc)

page_content='Tesla reported strong free cash flow generation of $7.5 billion.' metadata={'source': 'chunk_8'}
page_content='Tesla's automotive gross margin improved to 19.3% this quarter.' metadata={'source': 'chunk_1'}
page_content='Tesla's Full Self-Driving revenue increased significantly.' metadata={'source': 'chunk_9'}
page_content='Tesla stock price reached new highs following earnings announcement.' metadata={'source': 'chunk_4'}
page_content='Tesla reported record quarterly revenue of $25.2 billion in Q3 2024.' metadata={'source': 'chunk_0'}
page_content='Tesla's energy storage business grew 40% year-over-year.' metadata={'source': 'chunk_5'}
page_content='Tesla continues to lead in electric vehicle market share globally.' metadata={'source': 'chunk_6'}
page_content='Tesla announced plans to expand Gigafactory production capacity.' metadata={'source': 'chunk_3'}
page_content='Tesla Model Y became the best-selling vehicle worldwide.' metadata={'source': 'chunk_7'}
page_content='

# BM25 (Keyword Search)

In [16]:
bm25_retriever = BM25Retriever.from_documents(documents)
bm25_retriever.k = 15

In [17]:
query = "Cybertruck"
test_docs = bm25_retriever.invoke(query)

In [18]:
for doc in test_docs:
    print(doc)

page_content='Tesla Cybertruck production ramp begins in 2024 with initial deliveries.' metadata={'source': 'chunk_2'}
page_content='Technology conferences showcase latest innovations.' metadata={'source': 'chunk_59'}
page_content='Microsoft acquired Activision Blizzard for $68.7 billion.' metadata={'source': 'chunk_15'}
page_content='NVIDIA's CUDA platform dominates AI development.' metadata={'source': 'chunk_27'}
page_content='NVIDIA stock valuation exceeds $2 trillion market cap.' metadata={'source': 'chunk_26'}
page_content='NVIDIA's AI chip shortage affects cloud providers.' metadata={'source': 'chunk_25'}
page_content='NVIDIA's automotive AI platform partnerships expanded.' metadata={'source': 'chunk_24'}
page_content='NVIDIA's gaming revenue declined due to crypto market changes.' metadata={'source': 'chunk_23'}
page_content='NVIDIA announced next-generation Blackwell architecture.' metadata={'source': 'chunk_22'}
page_content='NVIDIA's H100 GPUs see unprecedented demand for AI 

# Combining the two (Hybrid Search)

In [19]:
hybrid_retriever = EnsembleRetriever(
    retrievers=[vector_retriever, bm25_retriever],
    weights=[0.7, 0.3]
)

In [20]:
query = "company performance Tesla"
retrieved_chunks = hybrid_retriever.invoke(query)
for i, doc in enumerate(retrieved_chunks, 1):
    print(f"Chunk {i} - {doc.page_content}")

Chunk 1 - Tesla reported strong free cash flow generation of $7.5 billion.
Chunk 2 - Tesla stock price reached new highs following earnings announcement.
Chunk 3 - Tesla continues to lead in electric vehicle market share globally.
Chunk 4 - Tesla announced plans to expand Gigafactory production capacity.
Chunk 5 - Tesla reported record quarterly revenue of $25.2 billion in Q3 2024.
Chunk 6 - Tesla Model Y became the best-selling vehicle worldwide.
Chunk 7 - Tesla Cybertruck production ramp begins in 2024 with initial deliveries.
Chunk 8 - The Tesla coil was invented by Nikola Tesla in 1891.
Chunk 9 - NVIDIA stock valuation exceeds $2 trillion market cap.
Chunk 10 - Tesla's energy storage business grew 40% year-over-year.
Chunk 11 - Tesla's automotive gross margin improved to 19.3% this quarter.
Chunk 12 - Tesla's Full Self-Driving revenue increased significantly.
Chunk 13 - Quarterly reports show mixed results across tech sector.
Chunk 14 - Microsoft's cloud revenue Azure grew 29% year

# RERANKER

In [26]:
reranker = CohereRerank(model="rerank-english-v3.0", top_n=10)
reranked_docs = reranker.compress_documents(retrieved_chunks, query)

In [27]:
for i, doc in enumerate(reranked_docs, 1):
    print(f"Chunk {i} - {doc.page_content}")

Chunk 1 - Tesla reported record quarterly revenue of $25.2 billion in Q3 2024.
Chunk 2 - Tesla's energy storage business grew 40% year-over-year.
Chunk 3 - Tesla reported strong free cash flow generation of $7.5 billion.
Chunk 4 - Tesla's automotive gross margin improved to 19.3% this quarter.
Chunk 5 - Tesla continues to lead in electric vehicle market share globally.
Chunk 6 - Tesla's Full Self-Driving revenue increased significantly.
Chunk 7 - Tesla stock price reached new highs following earnings announcement.
Chunk 8 - Tesla announced plans to expand Gigafactory production capacity.
Chunk 9 - Tesla Model Y became the best-selling vehicle worldwide.
Chunk 10 - Electric vehicle charging infrastructure expands rapidly.


In [28]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

combined_input = f"""Based on the following documents, please answer this question : {query}

Documents : 
{chr(10).join([f" - {doc.page_content}" for doc in reranked_docs])}

Please provide a clear, helpful answer using only the information from these documents. If you can't find the answer in the documents, say "I don't have enough information to answer the question".
"""

In [30]:
combined_input

'Based on the following documents, please answer this question : company performance Tesla\n\nDocuments : \n - Tesla reported record quarterly revenue of $25.2 billion in Q3 2024.\n - Tesla\'s energy storage business grew 40% year-over-year.\n - Tesla reported strong free cash flow generation of $7.5 billion.\n - Tesla\'s automotive gross margin improved to 19.3% this quarter.\n - Tesla continues to lead in electric vehicle market share globally.\n - Tesla\'s Full Self-Driving revenue increased significantly.\n - Tesla stock price reached new highs following earnings announcement.\n - Tesla announced plans to expand Gigafactory production capacity.\n - Tesla Model Y became the best-selling vehicle worldwide.\n - Electric vehicle charging infrastructure expands rapidly.\n\nPlease provide a clear, helpful answer using only the information from these documents. If you can\'t find the answer in the documents, say "I don\'t have enough information to answer the question".\n'

In [29]:
llm = HuggingFaceEndpoint(
    model="meta-llama/Llama-3.1-8B-Instruct", # you have to make sure that this model has an InferenceProvider on the HuggingFace Website.
    task="text-generation",
    max_new_tokens=200,
    temperature=0.7,
    provider="auto"
)

model = ChatHuggingFace(llm=llm)


messages = [
    SystemMessage(content="You are a helpful assistant that answers questions based on the provided documents. If you can't find the answer in the documents, say 'I don't have enough information to answer the question'."),
    HumanMessage(content=combined_input)
]

response = model.invoke(messages)

print(response.content)

Based on the provided documents, I can answer the following questions about Tesla's company performance:

1. What was Tesla's record quarterly revenue in Q3 2024?
   - Tesla reported record quarterly revenue of $25.2 billion in Q3 2024.

2. How did Tesla's energy storage business grow year-over-year?
   - Tesla's energy storage business grew 40% year-over-year.

3. What was Tesla's free cash flow generation in Q3 2024?
   - Tesla reported strong free cash flow generation of $7.5 billion.

4. What was Tesla's automotive gross margin in Q3 2024?
   - Tesla's automotive gross margin improved to 19.3% this quarter.

5. What position does Tesla hold in the global electric vehicle market?
   - Tesla continues to lead in electric vehicle market share globally.

6. What happened to Tesla's Full Self-Driving revenue?
   - Tesla's Full Self-Driving revenue increased significantly
