# Search engine with tools and agents
## Arxiv -- research papers
## Tools Creation

In [1]:
from langchain_community.tools import ArxivQueryRun,WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper,ArxivAPIWrapper

In [None]:
# used inbuilt tool of wikipedia
api_wrapper_wiki = WikipediaAPIWrapper(top_k_results = 2, doc_content_chars_max=500) # fetch top 2 results with 500 characters
# wrapper to call wikipedia,acts as interface between outside world actual wiki documents called by WikipediaQueryRun and our python code
wiki = WikipediaQueryRun(api_wrapper = api_wrapper_wiki) # wikipedia tool
wiki.name

'wikipedia'

In [3]:
# used inbuilt tool of arxiv
api_wrapper_arxiv = ArxivAPIWrapper(top_k_results = 1, doc_content_chars_max=250) # fetch top 1 results with 250 characters
# wrapper to call arxiv,acts as interface between outside world actual arxiv documents called by ArxivQueryRun and our python code
arxiv = ArxivQueryRun(api_wrapper = api_wrapper_arxiv) # arxiv tool
arxiv.name

'arxiv'

In [None]:
tools = [wiki,arxiv] # to run these tools I need llm model and chat prompt template[imported from hub below]
# generally recommended to create prompt template
# from langchain.chains.combine_documents import create_stuff_documents_chain
# from langchain.chains import create_retrieval_chain

In [None]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

import os
from dotenv import load_dotenv
load_dotenv()

# Set environment variables for LangSmith tracking and LangChain project
os.environ["LANGCHAIN_API_KEY"] = os.getenv("LANGCHAIN_API_KEY")
os.environ["LANGCHAIN_PROJECT"] = os.getenv("SIMPLE_LANGCHAIN_PROJECT")
os.environ["LANGCHAIN_TRACING_V2"] = "true"

# open source model from groq
os.environ["GROQ_API_KEY"]=os.getenv("GROQ_API_KEY")
groq_api_key = os.getenv("GROQ_API_KEY")
llm = ChatGroq(model="llama-3.1-8b-instant", groq_api_key=groq_api_key)

# hugging face embeddings
os.environ["HUGGING_FACE_API_KEY"] = os.getenv("HUGGING_FACE_API_KEY")
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

In [None]:
#1. load,chunk(as every llm has context size) and index the contents of the blog to create a retriever, 
# use this webpage as my external data source
loader = WebBaseLoader("https://docs.langchain.com/langsmith/home")
docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents=text_splitter.split_documents(docs)
vectorstore = FAISS.from_documents(documents=documents,embedding=embeddings)
retriever = vectorstore.as_retriever()
retriever 

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001C780132590>, search_kwargs={})

In [None]:
#convert this "retriever" into tool
#retriever must be an object with a .get_relevant_documents() method (e.g., from FAISS, Chroma, Pinecone, etc.).
# The name must be a short identifier (no spaces).
#The description should tell the LLM when to use the tool.

from langchain.tools.retriever import create_retriever_tool
retriever_tool = create_retriever_tool(
    retriever,
    name="langsmith-search",
    description="Search any information about langsmith")
#create custom tools - [Rag tool]
retriever_tool.name

'langsmith-search'

In [11]:
tools=[wiki,arxiv,retriever_tool]
tools

[WikipediaQueryRun(api_wrapper=WikipediaAPIWrapper(wiki_client=<module 'wikipedia' from 'd:\\study\\udemy\\langchain\\venv\\lib\\site-packages\\wikipedia\\__init__.py'>, top_k_results=2, lang='en', load_all_available_meta=False, doc_content_chars_max=500)),
 ArxivQueryRun(api_wrapper=ArxivAPIWrapper(arxiv_search=<class 'arxiv.Search'>, arxiv_exceptions=(<class 'arxiv.ArxivError'>, <class 'arxiv.UnexpectedEmptyPageError'>, <class 'arxiv.HTTPError'>), top_k_results=1, ARXIV_MAX_QUERY_LENGTH=300, continue_on_failure=False, load_max_docs=100, load_all_available_meta=False, doc_content_chars_max=250)),
 Tool(name='langsmith-search', description='Search any information about langsmith', args_schema=<class 'langchain_core.tools.retriever.RetrieverInput'>, func=functools.partial(<function _get_relevant_documents at 0x000001C785755C60>, retriever=VectorStoreRetriever(tags=['FAISS', 'HuggingFaceEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000001C780132590>,

In [None]:
# run all these tools with agents and llm models
# tools, llm->agent executor

In [None]:
# hub - different types of prompts are already created and stored in hub 
from langchain import hub
prompt = hub.pull("hwchase17/openai-functions-agent", include_model=True) # pull one of the type of prompt which is already created
prompt.messages

[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='You are a helpful assistant'), additional_kwargs={}),
 MessagesPlaceholder(variable_name='chat_history', optional=True),
 HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={}),
 MessagesPlaceholder(variable_name='agent_scratchpad')]

In [13]:
# agents
from langchain.agents import create_openai_tools_agent
agent = create_openai_tools_agent(llm,tools,prompt)
agent

RunnableAssign(mapper={
  agent_scratchpad: RunnableLambda(lambda x: format_to_openai_tool_messages(x['intermediate_steps']))
})
| ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag(tag='HumanMessageChunk')], typing.Annotated[langchain_core.messages

In [None]:
#agent executor
from langchain.agents import AgentExecutor
agent_executor = AgentExecutor(agent=agent,tools=tools,verbose=True)
agent_executor

AgentExecutor(verbose=True, agent=RunnableMultiActionAgent(runnable=RunnableAssign(mapper={
  agent_scratchpad: RunnableLambda(lambda x: format_to_openai_tool_messages(x['intermediate_steps']))
})
| ChatPromptTemplate(input_variables=['agent_scratchpad', 'input'], optional_variables=['chat_history'], input_types={'chat_history': list[typing.Annotated[typing.Union[typing.Annotated[langchain_core.messages.ai.AIMessage, Tag(tag='ai')], typing.Annotated[langchain_core.messages.human.HumanMessage, Tag(tag='human')], typing.Annotated[langchain_core.messages.chat.ChatMessage, Tag(tag='chat')], typing.Annotated[langchain_core.messages.system.SystemMessage, Tag(tag='system')], typing.Annotated[langchain_core.messages.function.FunctionMessage, Tag(tag='function')], typing.Annotated[langchain_core.messages.tool.ToolMessage, Tag(tag='tool')], typing.Annotated[langchain_core.messages.ai.AIMessageChunk, Tag(tag='AIMessageChunk')], typing.Annotated[langchain_core.messages.human.HumanMessageChunk, Tag

In [None]:
# our description is given above while creating retriever tool "Search any information about langsmith"
#invoking our retriever tool `langsmith-search`
agent_executor.invoke({"input":"What is langsmith"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `langsmith-search` with `{'query': 'what is langsmith'}`


[0m[38;5;200m[1;3mGet started with LangSmith - Docs by LangChainDocs by LangChain home pagePythonSearch...⌘KLangSmithPlatform for LLM observability and evaluationOverviewQuickstartsTrace an applicationEvaluate an applicationTest promptsAPI & SDKsAPI referencePython SDKJS/TS SDKPricingPlansPricing FAQDocs by LangChain home pagePythonSearch...⌘KGitHubForumForumSearch...NavigationGet started with LangSmithGet startedObservabilityEvaluationPrompt engineeringSelf-hostingAdministrationGet startedObservabilityEvaluationPrompt engineeringSelf-hostingAdministrationGitHubForumGet started with LangSmithCopy pageCopy pageLangSmith is a platform for building production-grade LLM applications. Monitor and evaluate your application, so you can ship quickly and with confidence.
LangSmith is framework agnostic — you can use it with or without LangChain’s open source fram

{'input': 'What is langsmith',
 'output': 'Some of the key use cases of Langsmith include:\n\n*   Document analysis and summarization\n*   Chatbots\n*   Code analysis\n*   Trace an application\n*   Evaluate an application\n*   Test prompts\n\nThese use cases largely overlap with those of language models in general, and Langsmith provides a platform for building production-grade LLM applications.'}

In [19]:
agent_executor.invoke({"input":"What is machine learning"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `wikipedia` with `{'query': 'machine learning'}`


[0m[36;1m[1;3mPage: Machine learning
Summary: Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalise to unseen data, and thus perform tasks without explicit instructions. Within a subdiscipline in machine learning, advances in the field of deep learning have allowed neural networks, a class of statistical algorithms, to surpass many previous machine learning approaches in performance.
ML fi[0m[32;1m[1;3m
Invoking: `wikipedia` with `{'query': 'Machine learning algorithms'}`


[0m[36;1m[1;3mPage: Machine learning
Summary: Machine learning (ML) is a field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalise to unseen data, and thus perform tasks withou

{'input': 'What is machine learning',
 'output': 'The requested information on machine learning appears to be a mix of Wikipedia pages and LangSmith platform features. Here is the information organized:\n\nMachine Learning:\n- A field of study in artificial intelligence concerned with the development and study of statistical algorithms that can learn from data and generalise to unseen data\n- Allows tasks to be performed without explicit instructions\n\nMachine Learning Algorithms:\n- Statistical algorithms that can learn from data and generalise to unseen data\n\nMachine Learning Applications:\n- Include predictive maintenance, image and speech recognition, and more\n\nLangSmith Platform Features:\n- Allows for observability and evaluation of LLM applications\n- Provides features for prompt engineering, self-hosting, and administration\n- Framework agnostic, can be used with or without LangChain’s open source frameworks'}

In [20]:
agent_executor.invoke({"input":"what is research paper all about  1706.03762"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `arxiv` with `{'query': '1706.03762'}`


[0m[33;1m[1;3mPublished: 2023-08-02
Title: Attention Is All You Need
Authors: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin
Summary: The dominant sequence transduction models are based on c[0m[32;1m[1;3m
Invoking: `wikipedia` with `{'query': 'research paper'}`


[0m



  lis = BeautifulSoup(html).find_all('li')


[36;1m[1;3mPage: Research paper mill
Summary: In research, a paper mill is a business that publishes poor quality or completely fraudulent journal papers that seem to resemble genuine research, as well as sells authorship on such papers. 
In some cases, paper mills are sophisticated operations that sell authorship positions on legitimate (but poor quality) research, but in many cases the papers contain fraudulent data and can be heavily plagiarized or otherwise unprofessional. According to a report from Na[0m[32;1m[1;3mIt seems the previous response was cut short. The function call for arxiv returned information about a specific paper, but since the query "1706.03762" is a specific ID for a research paper on arxiv, it would be more relevant to search for the paper title or a summary of the paper itself. The arxiv function is used to answer general questions about research papers, but it is not ideal for finding specific information about a particular paper. A better approach would

{'input': 'what is research paper all about  1706.03762',
 'output': 'It seems the previous response was cut short. The function call for arxiv returned information about a specific paper, but since the query "1706.03762" is a specific ID for a research paper on arxiv, it would be more relevant to search for the paper title or a summary of the paper itself. The arxiv function is used to answer general questions about research papers, but it is not ideal for finding specific information about a particular paper. A better approach would be to use the arxiv function with the title of the paper instead of the ID.'}