In [11]:
!pip install crewai crewai_tools arxiv



In [3]:
import os
from dotenv import load_dotenv

# Read a .env file, if one is found, into the process environment (python-dotenv).
# NOTE(review): presumably this supplies the LLM API key the agents need — confirm.
load_dotenv()

False

In [4]:
import arxiv
import time
import datetime

from typing import Type, List
from pydantic import BaseModel, Field 
from crewai.tools import BaseTool 
from crewai import Agent, Task, Crew

* 'fields' has been removed


## Define the Tools

In [23]:
class SearchArxivPapersInput(BaseModel):
    """Input schema for SearchArxivPapersTool.

    Both fields are required and typed as ``datetime.date``; the field
    descriptions are part of the schema exposed for the tool.
    """
    # Lower bound of the submission window to search.
    start_date: datetime.date = Field(..., description="Start date to fetch papers for.")
    # Upper bound of the submission window to search.
    end_date: datetime.date = Field(..., description="End date to fetch papers for.")

class SearchArxivPapersTool(BaseTool):
    """Tool that searches arXiv for papers submitted within a date window.

    For each configured category it issues one arXiv query restricted by
    ``submittedDate`` and returns the matching papers as plain dictionaries.
    """
    name: str = "search_arxiv_papers"
    description: str = "Searches all ArXiv papers from selected categories submitted from the start date to end date."
    args_schema: Type[BaseModel] = SearchArxivPapersInput

    def _run(self, start_date: datetime.date, end_date: datetime.date = None) -> List[dict]:
        """Fetch papers submitted between ``start_date`` and ``end_date``.

        Args:
            start_date: Lower bound of the submission window.
            end_date: Upper bound of the submission window. When omitted, the
                window ends one day after ``start_date``.

        Returns:
            A list with one dict per paper, holding the keys ``title``,
            ``authors`` (list of names), ``summary``, ``published``, ``url``
            and ``comment``.
        """
        arxiv_ai_categories = ["cs.AI"]
        # Broader alternative:
        # arxiv_ai_categories = ["cs.AI", "cs.LG", "cs.CV", "cs.CL", "cs.RO"]

        # Resolve the default end date while start_date is still a date object;
        # formatting it to a string first would make the timedelta addition fail.
        if end_date is None:
            end_date = start_date + datetime.timedelta(days=1)

        # Timestamp format used in the submittedDate range filter (YYYYMMDDHHMM).
        # A plain date renders its time part as 0000.
        timestamp_format = '%Y%m%d%H%M'
        window_start = start_date.strftime(timestamp_format)
        window_end = end_date.strftime(timestamp_format)

        # Initialize the ArXiv client
        client = arxiv.Client(
            page_size=100,  # Fetch up to 100 results per page
            delay_seconds=3  # Delay between API requests to respect rate limits
        )

        all_papers = []
        for category in arxiv_ai_categories:
            print(f"Searching papers for category: {category}")

            search_query = f"cat:{category} AND submittedDate:[{window_start} TO {window_end}]"

            search = arxiv.Search(
                query=search_query,
                sort_by=arxiv.SortCriterion.SubmittedDate,
                max_results=20  # Cap on the number of results fetched per category
            )

            # Collect results for the category. No extra sleep is needed here:
            # the client already spaces out its API requests via delay_seconds.
            category_papers = [
                {
                    'title': result.title,
                    'authors': [author.name for author in result.authors],
                    'summary': result.summary,
                    'published': result.published,
                    'url': result.entry_id,
                    'comment': result.comment,
                }
                for result in client.results(search)
            ]

            print(f"Searched {len(category_papers)} papers from {category}")
            all_papers.extend(category_papers)

        return all_papers


In [24]:
# Tool instance that is handed to the researcher agent through its `tools` list.
arxiv_search_tool = SearchArxivPapersTool()

## Create our Agents 

In [25]:
# Agent 1: ArXiv Researcher
# Adjacent string literals are concatenated verbatim, so every fragment except
# the last ends with a space to keep the sentences apart in the final prompt.
# {start_date} / {end_date} are placeholders filled from the kickoff inputs.
researcher = Agent(
    role="Senior Researcher",
    goal=(
        "Find the top 10 papers from the search results from ArXiv between dates {start_date} and {end_date}. "
        "Rank them appropriately."
    ),
    backstory=(
        "You are a senior researcher with a deep understanding of all topics in AI and AI research. "
        "You can identify the best research papers based on the title, abstract and comments. "
        "Give higher priority to papers that have CVPR, ECCV, ICCV, ICLR, ICML, NeurIPS, ICRA, IROS, ACL, EMNLP mentioned in the comments. "
        "Ignore papers which mention workshop in the comments."
    ),
    verbose=True,
    tools=[arxiv_search_tool],
)

# Agent 2: Frontend Engineer
# Takes no tools; it only turns the researcher's findings into an HTML report.
frontend_engineer = Agent(
    role="Senior Frontend & AI Engineer",
    goal="Compile the results into a HTML file.",
    # Trailing space on the first fragment keeps the two sentences apart once
    # the adjacent literals are concatenated.
    backstory=(
        "You are a competent frontend engineer writing HTML, CSS and Markdown with decades of experience. "
        "You have also been working with AI for decades and understand it well."
    ),
    verbose=True,
)

## Create our Tasks


In [31]:
# Task for ArXiv Researcher
# The expected-output bullets are separated by explicit newlines; without them
# the adjacent literals collapse into a single run-on line in the prompt.
research_task = Task(
    description=(
        "Find the top 10 research papers from the search results from ArXiv between dates: {start_date} and {end_date}."
    ),
    expected_output=(
        "A list of top 10 research papers with the following information in the following format:\n"
        "- Title\n"
        "- Authors\n"
        "- Abstract\n"
        "- Link to the paper"
    ),
    agent=researcher,
    human_input=True,  # Pause for human feedback on the result
)

# Task for Frontend Engineer
# Receives the research task's output as context and writes the report to disk.
# Explicit newlines keep the heading and each bullet on its own line once the
# adjacent string literals are concatenated.
reporting_task = Task(
    description="Compile the results into a detailed report in HTML file format.",
    expected_output=(
        "An HTML file with the results in the following format:\n"
        "Top 10 AI Research Papers published between dates {start_date} and {end_date}\n"
        "- Title (which on clicking opens the paper in a new tab)\n"
        "- Authors\n"
        "- Short summary of the abstract (2-4 sentences)"
    ),
    agent=frontend_engineer,
    context=[research_task],
    output_file="./report.html",
    human_input=True,  # Pause for human feedback on the result
)

## Create the Crew

In [32]:
# Bundle both agents with their tasks; tasks are listed in pipeline order
# (research first, then the HTML report that uses it as context).
arxiv_research_crew = Crew(
    agents=[researcher, frontend_engineer],
    tasks=[research_task, reporting_task],
    verbose=True,
)

Overriding of current TracerProvider is not allowed


## Run the Crew

In [33]:
# Date window substituted into the {start_date} / {end_date} placeholders of
# the agent and task prompts.
crew_inputs = dict(start_date="2025-06-17", end_date="2025-06-18")
result = arxiv_research_crew.kickoff(inputs=crew_inputs)

[1m[95m# Agent:[00m [1m[92mSenior Researcher[00m
[95m## Task:[00m [92mFind the top 10 research papers from the search results from ArXiv between dates: 2025-06-17 and 2025-06-18.[00m
Searching papers for category: cs.AI
Searched 20 papers from cs.AI


[1m[95m# Agent:[00m [1m[92mSenior Researcher[00m
[95m## Thought:[00m [92mI need to search for research papers on ArXiv from the specified date range of 2025-06-17 to 2025-06-18. I will use the tool `search_arxiv_papers` to gather this information.[00m
[95m## Using tool:[00m [92msearch_arxiv_papers[00m
[95m## Tool Input:[00m [92m
"{\"start_date\": \"2025-06-17\", \"end_date\": \"2025-06-18\"}"[00m
[95m## Tool Output:[00m [92m
[{'title': 'Optimal Embedding Learning Rate in LLMs: The Effect of Vocabulary Size', 'authors': ['Soufiane Hayou', 'Liyuan Liu'], 'summary': 'Pretraining large language models is a costly process. To make this process\nmore efficient, several methods have been proposed to optimize model\

 looks good


Human feedback:  looks good
[1m[95m# Agent:[00m [1m[92mSenior Frontend & AI Engineer[00m
[95m## Task:[00m [92mCompile the results into a detailed report in HTML file format.[00m


[1m[95m# Agent:[00m [1m[92mSenior Frontend & AI Engineer[00m
[95m## Final Answer:[00m [92m
```html
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Top 10 AI Research Papers (June 17-18, 2025)</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f9f9f9;
        }
        
        h1 {
            color: #333;
        }
        
        .paper {
            border: 1px solid #ccc;
            border-radius: 5px;
            margin: 10px 0;
            padding: 15px;
            background-color: #fff;
        }
        
        .title {
            font-size: 1.5em;
            color: #007bff;
        }
   

 good


Human feedback:  good
