In [4]:
import os
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())
mistral_api_key = os.environ["MISTRAL_API_KEY"]
tavily_api_key = os.environ["TAVILY_API_KEY"]


In [3]:
from langchain_mistralai import ChatMistralAI

chatModel = ChatMistralAI(
    mistral_api_key=mistral_api_key,
    model="mistral-large-latest",
    temperature=1,
    max_retries=2,
    # other params...
)

In [5]:
# for RAG - Import PDF

from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader('./data/kafka_definite_guide.pdf')

loaded_data = loader.load_and_split()

In [6]:
# Insert PDF to Vector DB
vectorstore = Chroma.from_documents(documents=loaded_data, embedding=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2"))


  from tqdm.autonotebook import tqdm, trange


In [7]:
# Find top 5 Search from Vector DB

retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


In [15]:
# template to send, TOP 5 search from Vector DB for LLM as context and Question.

from langchain_core.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context. If context is not available then you reply based on your information:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [12]:
# To call LLM 

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [16]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | chatModel
    | StrOutputParser()
)

In [17]:
query= "what is the use of Apache Kafka"
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)

Based on the provided context, Apache Kafka is used for the following purposes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


:

1. **Messaging System**: Kafka behaves as a message queue, providing ordering guarantees on the messages it stores. This means it can be used to pass messages between different systems or components in a reliable and ordered manner.

2. **Data Streaming**: Kafka is used for streaming data between systems. It allows for the integration of data from various sources and can handle high throughput and low latency.

3. **Distributed Systems**: Kafka is useful in distributed systems for its ability to handle large volumes of data and provide robust data pipelines. It is designed to work in a distributed environment and provides features like replication and fault tolerance.

4. **Real-time Analytics**: Kafka's ability to process and analyze data in real-time makes it a powerful tool for applications that require immediate insights and responses based on live data.

5. **Log Aggregation**: Kafka can be used to aggregate logs from multiple sources, providing a centralized way to store and p

Failed to multipart ingest runs: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('400 Client Error: Bad Request for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Empty request"}')
Failed to multipart ingest runs: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('400 Client Error: Bad Request for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Empty request"}')
Failed to multipart ingest runs: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('400 Client Error: Bad Request for url: https://api.smith.langchain.com/runs/multipart', '{"detail":"Empty request"}')
Failed to multipart ingest runs: langsmith.utils.LangSmithError: Failed to POST https://api.smith.langchain.com/runs/multipart in LangSmith API. HTTPError('400 Client Error: Bad 

In [18]:
query= "Who is Binod Suman"
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)

Based on the provided context, there is no information about a person named Binod Suman. Therefore, I cannot answer who Binod Suman is.

In [19]:
# addig Taveli for web search in case, information not present in provided PDF

from langchain_community.tools.tavily_search import TavilySearchResults

online_search_tool = [TavilySearchResults(tavily_api_key = tavily_api_key, max_results=5)]



In [20]:
llm_with_tools = chatModel.bind_tools(online_search_tool)

In [21]:
query= "Who is Binod Suman"
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)

I cannot find any information about Binod Suman in the provided context. Based on my general knowledge, I am also unable to provide any information about this individual.

In [32]:
# Need to enhance template, to support online search tool.

from langchain_core.prompts import ChatPromptTemplate

template = """Answer the question based only on the following context. If context is not available then use search online tavily tool:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

In [33]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm_with_tools
    | StrOutputParser()
)

In [36]:
query= "How is weather today in Bangalore?"
for chunk in rag_chain.stream(query):
    print(chunk, end="", flush=True)

In [38]:
aa = llm_with_tools.invoke("How is weather today in Bangalore?")

In [40]:
aa.content

''

In [41]:
from langchain_community.retrievers import TavilySearchAPIRetriever

In [42]:
tavily_retriever = TavilySearchAPIRetriever(k=3)

In [46]:
query = "How is weather today in Bangalore?";

aaa = retriever.invoke(query);
aaa

[Document(metadata={'page': 20, 'source': './data/kafka_definite_guide.pdf'}, page_content='support for all the extra time writing, and long hours\nrunning to clear his head, keeps him going.\nRajini would like to thank her husband, Manjunath, and\nson, T arun, for their unwavering support and\nencouragement, for spending weekends reviewing the early\ndrafts, and for always being there for her .\nKrit shares his love and gratitude with his wife, Cecilia, and\ntwo children, Lucas and Lizabeth. Their love and support\nmake every day a joy , and he wouldn’t be able to pursue his\npassions without them. He also wants to thank his mom,\nCindy P etty, for instilling in Krit a desire to always be the\nbest version of himself .'),
 Document(metadata={'page': 3, 'source': './data/kafka_definite_guide.pdf'}, page_content='Kafka: The Deﬁnitive\nGuide\nSECOND EDITION\nReal-Time Data and Stream Processing at\nScale\nGwen Shapira, Todd Palino, Rajini\nSivaram, and Krit Petty'),
 Document(metadata={'