In [20]:
from langchain_community.retrievers import BM25Retriever
from langchain_classic.retrievers import EnsembleRetriever
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_core.documents import Document
from langchain_core.messages import HumanMessage, SystemMessage
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Hugging Face API token needed by
# the hosted endpoint used later in the notebook — confirm.
load_dotenv()

True

In [3]:
# Toy corpus for comparing retrieval strategies. It deliberately mixes:
#   * exact keyword hits (e.g. "Cybertruck", "Tesla"),
#   * a paraphrase with no shared keywords (the "tech giant" / "code repository" sentence),
#   * homonyms (Apple/apple, Python/python, Java/Java, Orange/Orange).
# Order matters: each chunk's position becomes its `chunk_<i>` source id.
chunks = [
    "Microsoft acquired GitHub for 7.5 billion dollars in 2018.",
    "Tesla Cybertruck production ramp begins in 2024.",
    "Google is a large technology company with global operations.",
    "Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles. Tesla announced new manufacturing facilities.",
    "SpaceX develops Starship rockets for Mars missions.",
    "The tech giant acquired the code repository platform for software development.",
    "NVIDIA designs Starship architecture for their new GPUs.",
    "Tesla Tesla Tesla financial quarterly results improved significantly.",
    "Cybertruck reservations exceeded company expectations.",
    "Microsoft is a large technology company with global operations.",
    "Apple announced new iPhone features for developers.",
    "The apple orchard harvest was excellent this year.",
    "Python programming language is widely used in AI.",
    "The python snake can grow up to 20 feet long.",
    "Java coffee beans are imported from Indonesia.",
    "Java programming requires understanding of object-oriented concepts.",
    "Orange juice sales increased during winter months.",
    "Orange County reported new housing developments.",
]

In [4]:
# Wrap every raw string in a LangChain Document, tagging it with a
# `chunk_<zero-based position>` source id so results can be traced back.
documents = []
for index, text in enumerate(chunks):
    documents.append(
        Document(page_content=text, metadata={"source": f"chunk_{index}"})
    )

# Echo the corpus with 1-based numbering for easy reference while reading.
print("Sample Data:")
for number, text in enumerate(chunks, start=1):
    print(f"{number}. {text}")

separator = "=" * 80
print("\n" + separator)

Sample Data:
1. Microsoft acquired GitHub for 7.5 billion dollars in 2018.
2. Tesla Cybertruck production ramp begins in 2024.
3. Google is a large technology company with global operations.
4. Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles. Tesla announced new manufacturing facilities.
5. SpaceX develops Starship rockets for Mars missions.
6. The tech giant acquired the code repository platform for software development.
7. NVIDIA designs Starship architecture for their new GPUs.
8. Tesla Tesla Tesla financial quarterly results improved significantly.
9. Cybertruck reservations exceeded company expectations.
10. Microsoft is a large technology company with global operations.
11. Apple announced new iPhone features for developers.
12. The apple orchard harvest was excellent this year.
13. Python programming language is widely used in AI.
14. The python snake can grow up to 20 feet long.
15. Java coffee beans are imported from Indonesia.
16. Java pr

# Vector Retriever

In [6]:
# Embedding model used to vectorise the chunks. It gets its own name so that
# it is not confused with the chat model bound to `model` later in the notebook.
# NOTE(review): the E5 model card recommends prefixing inputs with "query: " /
# "passage: "; that is not done here — confirm whether it matters for this demo.
embedding_model = HuggingFaceEmbeddings(model="intfloat/e5-large-v2")

print("Creating a sample vectorstore....")
# The collection is persisted on disk. Without stable ids, every re-run of this
# cell appends a fresh copy of all chunks and the top-k searches start returning
# duplicates. Reusing each document's `source` metadata as its id makes the
# write overwrite the existing entry instead, so the cell is safe to re-run.
# Entries written by earlier runs under auto-generated ids are not cleaned up
# by this; delete `db/dummy_db` once if it already contains duplicates.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=[doc.metadata["source"] for doc in documents],
    persist_directory="db/dummy_db",
    collection_metadata={"hnsw:space": "cosine"},
)

Creating a sample vectorstore....


In [7]:
# Expose the vector store through the retriever interface, capped at 3 results.
vector_retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))

# A query whose wording does not appear verbatim in any chunk.
query = "space exploration company"
relevant_docs = vector_retriever.invoke(query)

In [8]:
# Show each Document returned by the vector retriever (content and metadata).
for hit in relevant_docs:
    print(hit)

page_content='SpaceX develops Starship rockets for Mars missions.' metadata={'source': 'chunk_4'}
page_content='Google is a large technology company with global operations.' metadata={'source': 'chunk_2'}
page_content='NVIDIA designs Starship architecture for their new GPUs.' metadata={'source': 'chunk_6'}


# BM25 (Keyword Search)

In [9]:
# Keyword (BM25) retriever over the same documents, limited to 3 results.
# `k` is forwarded by the factory to the retriever's constructor, so it does
# not have to be set on the instance afterwards.
bm25_retriever = BM25Retriever.from_documents(documents, k=3)

In [12]:
# Single-keyword query used to exercise the BM25 retriever.
query = "Cybertruck"
test_docs = bm25_retriever.invoke(query)

In [13]:
# Show each Document returned by the BM25 retriever (content and metadata).
for match in test_docs:
    print(match)

page_content='Cybertruck reservations exceeded company expectations.' metadata={'source': 'chunk_8'}
page_content='Tesla Cybertruck production ramp begins in 2024.' metadata={'source': 'chunk_1'}
page_content='Orange juice sales increased during winter months.' metadata={'source': 'chunk_16'}


# Combining the two (Hybrid Search)

In [14]:
# Combine the vector and BM25 retrievers; both contribute with equal weight.
# The weights are positional: first entry -> vector, second -> BM25.
hybrid_retriever = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[vector_retriever, bm25_retriever],
)

In [16]:
# `query` and `retrieved_chunks` are reused by the prompt-building cell below.
query = "company performance Tesla"
retrieved_chunks = hybrid_retriever.invoke(query)

for rank, chunk_doc in enumerate(retrieved_chunks, start=1):
    print(f"Chunk {rank} - {chunk_doc.page_content}")

Chunk 1 - Tesla Tesla Tesla financial quarterly results improved significantly.
Chunk 2 - Tesla reported strong quarterly results. Tesla continues to lead in electric vehicles. Tesla announced new manufacturing facilities.
Chunk 3 - Cybertruck reservations exceeded company expectations.


In [None]:
from langchain_huggingface import HuggingFaceEndpoint, ChatHuggingFace

# Render each retrieved chunk as a " - <text>" bullet, one per line.
document_lines = "\n".join(f" - {doc.page_content}" for doc in retrieved_chunks)

# User prompt: the question, the supporting documents, then the answering rules.
combined_input = (
    f"Based on the following documents, please answer this question : {query}\n"
    "\n"
    "Documents : \n"
    f"{document_lines}\n"
    "\n"
    "Please provide a clear, helpful answer using only the information from these documents. "
    "If you can't find the answer in the documents, say \"I don't have enough information to answer the question\".\n"
)

In [21]:
# Hosted text-generation endpoint.
# Make sure the chosen model has an Inference Provider listed on the
# Hugging Face website before running this cell.
llm = HuggingFaceEndpoint(
    model="meta-llama/Llama-3.1-8B-Instruct",
    provider="auto",
    task="text-generation",
    temperature=0.7,
    max_new_tokens=200,
)

# Chat wrapper around the endpoint; it is invoked with a list of messages below.
model = ChatHuggingFace(llm=llm)

# System instructions repeat the "answer only from the documents" rule that
# the user prompt also states.
system_prompt = "You are a helpful assistant that answers questions based on the provided documents. If you can't find the answer in the documents, say 'I don't have enough information to answer the question'."

messages = [
    SystemMessage(content=system_prompt),
    HumanMessage(content=combined_input),
]

response = model.invoke(messages)
print(response.content)

Based on the provided documents, here's the information about Tesla's company performance:

1. Tesla's financial quarterly results improved significantly.
2. Tesla continues to lead in electric vehicles.
3. Cybertruck reservations exceeded company expectations.

Overall, the information suggests that Tesla is performing well, with improvements in financial results and strong sales of electric vehicles, including the Cybertruck.
