In [15]:
# Load the OpenAI API key through the project-local `helper` module.
# NOTE(review): OPENAI_API_KEY is not referenced by any later cell visible
# here; presumably the helper (or the environment) is what actually makes the
# key available to the OpenAI client — confirm.
from helper import get_openai_api_key
OPENAI_API_KEY = get_openai_api_key()

In [16]:
# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel runs its own
# loop while the library calls below drive async code — confirm.
import nest_asyncio
nest_asyncio.apply()

In [17]:
# OpenReview download locations for the papers used in this notebook.
# The two lists are presumed to be positional (urls[i] is the source of
# papers[i]), so they must stay the same length and in the same order.
urls = [
    "https://openreview.net/pdf?id=VtmBAGCN7o",
    "https://openreview.net/pdf?id=6PmJoRfdaK",
    # Third slot pairs with loftq.pdf. It previously repeated the fifth URL
    # (id=hSyW5go0v8), leaving LoftQ without a source link.
    # NOTE(review): confirm this is the LoftQ submission ID.
    "https://openreview.net/pdf?id=LzPWWPAdY4",
    "https://openreview.net/pdf?id=VTF8yNQM66",
    "https://openreview.net/pdf?id=hSyW5go0v8",
    "https://openreview.net/pdf?id=9WD9KwssyT",
    "https://openreview.net/pdf?id=yV6fD7LYkF",
    "https://openreview.net/pdf?id=hnrB5YHoYu"
]

# Local PDF filenames; each one is later handed to get_doc_tools, and its stem
# (name without ".pdf") is used as the tool name suffix.
papers = [
    "metagpt.pdf",
    "longlora.pdf",
    "loftq.pdf",
    "swebench.pdf",
    "selfrag.pdf",
    "zipformer.pdf",
    "values.pdf",
    "finetune_fair_diffusion.pdf"
]

In [18]:
from utils import get_doc_tools
from pathlib import Path

# Map each PDF filename to its [vector_tool, summary_tool] pair.
# get_doc_tools receives the filename plus the filename's stem (no extension).
paper_to_tools_dict = {}
for paper in papers:
    print(f"Getting tools for paper: {paper}")
    paper_stem = Path(paper).stem
    tool_pair = get_doc_tools(paper, paper_stem)
    # Unpacking enforces that exactly two tools come back for every paper.
    vector_tool, summary_tool = tool_pair
    paper_to_tools_dict[paper] = [vector_tool, summary_tool]

Getting tools for paper: metagpt.pdf
Getting tools for paper: longlora.pdf
Getting tools for paper: loftq.pdf
Getting tools for paper: swebench.pdf
Getting tools for paper: selfrag.pdf
Getting tools for paper: zipformer.pdf
Getting tools for paper: values.pdf
Getting tools for paper: finetune_fair_diffusion.pdf


In [19]:
# Flatten the per-paper tool pairs into one list, following the order of `papers`.
all_tools = []
for paper in papers:
    all_tools.extend(paper_to_tools_dict[paper])

In [20]:
from llama_index.llms.openai import OpenAI

# LLM instance that is passed to the agent worker further down (llm=llm).
llm = OpenAI(model="gpt-3.5-turbo")

In [21]:
# Sanity check: every paper contributes a vector tool and a summary tool,
# so the count should be 2 * len(papers).
len(all_tools)

16

In [22]:
# Index the tool objects themselves (backed by a VectorStoreIndex) so that a
# retriever can later pick out tools relevant to a query.
from llama_index.core import VectorStoreIndex
from llama_index.core.objects import ObjectIndex

obj_index = ObjectIndex.from_objects(all_tools, index_cls=VectorStoreIndex)

In [23]:
# Retriever over the tool index; similarity_top_k=3 caps each lookup at the
# three best-matching tools.
obj_retriever = obj_index.as_retriever(similarity_top_k=3)

In [24]:
# Exercise the retriever on its own, with a question that names two of the
# papers, before it is handed to the agent.
tools = obj_retriever.retrieve(
    "Tell me about the eval dataset used in MetaGPT and SWE-Bench"
)

In [25]:
# Inspect the metadata (name and description) of the third retrieved tool.
# NOTE(review): in the recorded output this is
# summary_tool_finetune_fair_diffusion, which matches neither paper named in
# the query — the third slot is not guaranteed to be relevant.
tools[2].metadata

ToolMetadata(description='Useful for summarization questions related to finetune_fair_diffusion', name='summary_tool_finetune_fair_diffusion', fn_schema=<class 'llama_index.core.tools.types.DefaultToolFnSchema'>, return_direct=False)

In [26]:
from llama_index.core.agent import FunctionCallingAgentWorker
from llama_index.core.agent import AgentRunner

# Build the agent from a tool *retriever* rather than a fixed tool list, so the
# tools offered to the LLM come from obj_retriever instead of all of all_tools.
# The backslashes inside the system prompt are line continuations within the
# string literal; do not reformat the prompt without checking the resulting text.
# verbose=True makes the worker log each function call and its output.
agent_worker = FunctionCallingAgentWorker.from_tools(
    tool_retriever=obj_retriever,
    llm=llm, 
    system_prompt=""" \
You are an agent designed to answer queries over a set of given papers.
Please always use the tools provided to answer a question. Do not rely on prior knowledge.\

""",
    verbose=True
)
# Runner that wraps the worker and exposes the query interface used below.
agent = AgentRunner(agent_worker)

In [27]:
# Two-part question about a single paper. The answer is stored in `response`
# and is not displayed by this cell; the output below is the verbose tool-call log.
response = agent.query(
    "Tell me about the evaluation dataset used in LongLoRA, "
    "and then tell me about the evaluation results"
)

Added user message to memory: Tell me about the evaluation dataset used in LongLoRA, and then tell me about the evaluation results
=== Calling Function ===
Calling function: summary_tool_longlora with args: {"input": "evaluation dataset"}
=== Function Output ===
The evaluation datasets mentioned in the provided context include PG19, the cleaned Arxiv Math proof-pile dataset, LongBench, LEval, and LongAlpaca-12k. Additionally, the proposed dataset called SAFECONV is designed for evaluating conversation safety.
=== Calling Function ===
Calling function: summary_tool_longlora with args: {"input": "evaluation results"}
=== Function Output ===
The evaluation results demonstrate that the models achieve better perplexity with longer context sizes and show that different attention patterns have varying impacts on model performance during fine-tuning. The models are fine-tuned via the next token prediction objective and achieve promising results on extremely large settings. Additionally, the ev

In [28]:
# Cross-paper comparison question. Rebinds `response`, replacing the result of
# the previous query; the output below is the verbose tool-call log.
response = agent.query(
    "Tell me about the evaluation dataset used "
    "in MetaGPT and compare it against SWE-Bench"
)

Added user message to memory: Tell me about the evaluation dataset used in MetaGPT and compare it against SWE-Bench
=== Calling Function ===
Calling function: summary_tool_metagpt with args: {"input": "evaluation dataset"}
=== Function Output ===
The evaluation dataset used in the research project includes three benchmarks: HumanEval, MBPP, and SoftwareDev. HumanEval consists of 164 handwritten programming tasks, MBPP comprises 427 Python tasks, and SoftwareDev includes 70 diverse software development tasks covering various scopes like mini-games, image processing algorithms, and data visualization. The evaluation metrics for HumanEval and MBPP include Pass @ k for functional accuracy, while SoftwareDev evaluation includes metrics like Executability, Cost, Code Statistics, Productivity, and Human Revision Cost. The dataset provides insights into the performance and characteristics of the generated code for different software development tasks, offering a comprehensive view of the softw