In [3]:
from dotenv import load_dotenv
load_dotenv()


True

In [4]:

# Prepare for the data
# Jina website - Fetch Web Page – 
# The agent should be able to fetch the content of a web page given its URL.


# Agent
# Save Summary – The agent should be able to save a summary of the page it processed.
# Search – The agent should be able to perform a search for relevant or related information.

# Orchestration agent to orchestrate which agent to use

In [5]:
# Which framework you chose (if any)
# Which LLM provider you used

# If you have time, 
# # I suggest trying to implement some of the things yourself before using a framework. 
# You can refer to lectures and ToyAIKit code for details.

In [6]:
# Prep data agent
import requests
from requests.exceptions import RequestException
from typing import Optional


# Make the function with docstring, and type -> Easily parsed by the agent 
def fetch_url(url: str) -> Optional[str]:
    """
    Fetch the textual content of a webpage.

    Args:
        url (str): The target URL to fetch content from.

    Returns:
        Optional[str]: The decoded HTML/text content of the fetched page if successful,
        or None if an error occurred.

    Raises:
        ValueError: If the provided URL is empty or invalid.
    """
    if not url or not isinstance(url, str):
        raise ValueError("The 'url' parameter must be a non-empty string.")

    jina_reader_base_url = "https://r.jina.ai/"
    jina_reader_url = jina_reader_base_url + url.lstrip("/")

    try:
        response = requests.get(jina_reader_url, timeout=10)
        response.raise_for_status()  # Raises HTTPError for bad status codes
        return response.content.decode("utf-8")
    except RequestException as e:
        # Catch all network-related errors (e.g., ConnectionError, Timeout, HTTPError)
        print(f"Error fetching URL '{jina_reader_url}': {e}")
        return None
    except UnicodeDecodeError:
        print(f"Error decoding response from '{jina_reader_url}'.")
        return None

In [7]:
result = fetch_url('https://en.wikipedia.org/wiki/LeBron_James')

In [8]:
len(result) / 2

279547.5

In [9]:
from pydantic_ai import Agent
from pydantic import BaseModel, Field

In [10]:
from pydantic_ai.messages import FunctionToolCallEvent

async def print_function_calls(ctx, event):
    # Detect nested streams
    if hasattr(event, "__aiter__"):
        async for sub in event:
            await print_function_calls(ctx, sub)
        return

    if isinstance(event, FunctionToolCallEvent):
        print("TOOL CALL:", event.part.tool_name, event.part.args)

In [12]:
# tool calling monitoring 

from pydantic_ai.messages import FunctionToolCallEvent

class NamedCallback:

    def __init__(self, agent):
        self.agent_name = agent.name

    async def print_function_calls(self, ctx, event):
        # Detect nested streams
        if hasattr(event, "__aiter__"):
            async for sub in event:
                await self.print_function_calls(ctx, sub)
            return

        if isinstance(event, FunctionToolCallEvent):
            tool_name = event.part.tool_name
            args = event.part.args
            print(f"TOOL CALL ({self.agent_name}): {tool_name}({args})")

    async def __call__(self, ctx, event):
        return await self.print_function_calls(ctx, event)

In [13]:
# need to save the summary and the content

# index tool
from minsearch import AppendableIndex

index = AppendableIndex(
    text_fields=["title", "summary", "details"],
    keyword_fields=["category"]
)



In [14]:
from typing import Any, Dict, List

class SearchTools:

    def __init__(self, index):
        self.index = index
    
    def search(self, query: str, num_results: int = 5) -> List[Dict[str, Any]]:
        """
        Search for documents in the index related to the given query.

        Args:
            query (str): The search query string.
            num_results (int, optional): The maximum number of search results to return. Default is 5.

        Returns:
            List[Dict[str, Any]]: 
                A list of search results, where each result is represented as a dictionary containing 
                the document’s metadata and content fields (e.g., title, summary, details).
                
        Example:
            >>> tools.search("LeBron James")
            [
                {"title": "LeBron James", "summary": "NBA player...", "details": "..."},
                {"title": "Michael Jordan", "summary": "Former NBA player...", "details": "..."}
            ]
        """
        boost = {"title": 2.0, "summary": 1.0, "details": 0.5}
        
        results = self.index.search(
            query=query,
            boost_dict=boost,
            num_results=num_results,
        )
        return results

    def add_entry(self, title: str, category: str, summary: str, details: str, url: str) -> None:
        """
        Add a new document entry to the index for future search and retrieval.

        Args:
            title (str): The title of the document or topic.
            category (str): The category or type of content (e.g., 'Wikipedia', 'News', 'UserNote').
            summary (str): A short summary of the content.
            details (str): Detailed information or extended description.

        Returns:
            None

        Example:
            >>> tools.add_entry(
            ...     title="LeBron James",
            ...     category="Wikipedia",
            ...     summary="American professional basketball player.",
            ...     details="Born in 1984, LeBron has played for the Cavaliers, Heat, and Lakers..."
            ... )
        """
        doc = {
            "title": title,
            "category": category,
            "summary": summary,
            "details": details,
            "section": "user_added"
        }
        self.index.append(doc)


tools = SearchTools(index)

In [56]:

class WikipediaSummaryOutput(BaseModel):
    """
    A single, verifiable citation to a transcript snippet or video segment.
    Must correspond to a real snippet returned by the `search()` tool.
    """
    title: str = Field(..., description="The title of the Wikipedia page, e.g., 'LeBron James'.")
    category: str = Field(..., description="What is the category of this Wikpedia page, e.g, ['Person', 'Idea', 'Company', 'Method']")
    summary: str = Field(..., description="A concise summary of the main topic and key facts from the page.")
    details: str = Field(..., description="A more detailed overview that includes important background, achievements, or related information not covered in the short summary.")
    user_query: str = Field(..., description="User input query.")
    results: str = Field(...,
        description=(
            "Answer based on a list of search results. If no relevant data is found, return an empty list."
        )
    )


# class SearchOutput(BaseModel):
#     """
#     Structured output for the Search Agent.

#     This model defines the expected response after the agent searches the local database.
#     It clearly indicates whether relevant information was found and provides the matching entries if any.
#     """
#     user_query: str = Field(..., description="User input query.")
#     results: str = Field(...,
#         description=(
#             "Answer based on a list of search results containing the fields 'title', 'summary', and 'details'. If no relevant data is found, return an empty list."
#         )
#     )

summary_instruction = """
You are the orchestrator agent that manages the workflow of searching, fetching, summarizing, and storing knowledge.

**Primary Rule:**  
You must ALWAYS begin by using the `tools.search` tool before doing anything else.
- If the user's query can be found from the database, retreive the summary from the documents.
- If the user's query can be answered from the search results, answer it directly.  
- Do not proceed to fetching or summarizing if the answer can be provided from the search results.


**Workflow Steps:**

1. **Search the database**
   - Use the `tools.search` tool to check if relevant knowledge exists for the user's query.  
   - If relevant content is found:
       - Answer the user's question using this content.
       - Return the summary and details from the search.
       - Mark the task as complete.  
   - If the search result does not contain relevant information, proceed to step 2.

2. **Fetch Wikipedia content and summarize**
   - Use the `fetch_url` tool to retrieve raw Wikipedia content for the topic or URL.  
   - Pass the fetched content to the summary agent to generate a concise summary and detailed context.  
   - Use `tools.add_entry` to add the summarized knowledge to the database for future queries.  

3. **Output**
   - Return a short, clear summary of the results.
   - Indicate whether the answer came from the database or was newly summarized.  

   
**Rules & Constraints**
- Only perform **one search at the beginning**.  
- Do not fabricate information; only use retrieved or summarized content.  
- Ensure all outputs are structured for downstream processing and database storage.
"""


summary_agent = Agent(
    name='Summarizer',
    instructions=summary_instruction,
    tools=[tools.search, fetch_url, tools.add_entry],
    model='gpt-4o-mini',
    output_type=WikipediaSummaryOutput
)
callback = NamedCallback(summary_agent)


In [57]:
question = "What is this page about? https://en.wikipedia.org/wiki/Capybara"

result = await summary_agent.run(
        user_prompt=question,
        event_stream_handler=callback,
    )

TOOL CALL (Summarizer): search({"query":"Capybara","num_results":5})


In [59]:
result

AgentRunResult(output=WikipediaSummaryOutput(title='Capybara', category='Wikipedia', summary='The capybara is the largest living rodent, native to South America. It is a semiaquatic mammal known for its social behavior, often found in groups near water bodies.', details='The capybara (_Hydrochoerus hydrochaeris_) can grow up to 134 cm in length and weigh between 35 to 66 kg. It has a heavy, barrel-shaped body, reddish-brown fur, and is excellent at swimming. Capybaras are herbivores, primarily consuming grasses and aquatic plants. They are social animals typically living in groups of 10-20 but can form larger groups during dry seasons. Although not endangered, they are hunted for their meat and skin. The species adapts well to urbanization, found in many zoos and parks.', user_query='What is this page about? https://en.wikipedia.org/wiki/Capybara', results="[{'title':'Capybara','category':'Wikipedia','summary':'The capybara is the largest living rodent, native to South America. It is a

In [60]:
result.output.results     

"[{'title':'Capybara','category':'Wikipedia','summary':'The capybara is the largest living rodent, native to South America. It is a semiaquatic mammal known for its social behavior, often found in groups near water bodies.','details':'The capybara (_Hydrochoerus hydrochaeris_) can grow up to 134 cm in length and weigh between 35 to 66 kg. It has a heavy, barrel-shaped body, reddish-brown fur, and is excellent at swimming. Capybaras are herbivores, primarily consuming grasses and aquatic plants. They are social animals typically living in groups of 10-20 but can form larger groups during dry seasons. Although not endangered, they are hunted for their meat and skin. The species adapts well to urbanization, found in many zoos and parks.'}]"

In [52]:
index.search("Capybara")

[{'title': 'Capybara',
  'category': 'Wikipedia',
  'summary': 'The capybara is the largest living rodent, native to South America. It is a semiaquatic mammal known for its social behavior, often found in groups near water bodies.',
  'details': 'The capybara (_Hydrochoerus hydrochaeris_) can grow up to 134 cm in length and weigh between 35 to 66 kg. It has a heavy, barrel-shaped body, reddish-brown fur, and is excellent at swimming. Capybaras are herbivores, primarily consuming grasses and aquatic plants. They are social animals typically living in groups of 10-20 but can form larger groups during dry seasons. Although not endangered, they are hunted for their meat and skin. The species adapts well to urbanization, found in many zoos and parks.',
  'section': 'user_added'},
 {'title': 'Capybara',
  'category': 'Wikipedia',
  'summary': 'The capybara, or greater capybara (_Hydrochoerus hydrochaeris_), is the largest living rodent, native to South America. It is social and can be found 

In [53]:
question = "https://en.wikipedia.org/wiki/Hydrochoerus"
result = results = await summary_agent.run(
        user_prompt=question,
        event_stream_handler=callback,
    )

TOOL CALL (Summarizer): fetch_url({"url":"https://en.wikipedia.org/wiki/Hydrochoerus"})
TOOL CALL (Summarizer): add_entry({"title":"Hydrochoerus","category":"Wikipedia","summary":"Hydrochoerus is a genus of rodents that includes the capybara, the largest living rodent, and its close relatives. They are semiaquatic and social animals found mainly in South America.","details":"The genus Hydrochoerus contains two living species, the capybara (_Hydrochoerus hydrochaeris_) and the lesser capybara (_Hydrochoerus isthmius_), as well as three extinct species. Capybaras are semiaquatic and can be found in lakes, rivers, and swamps, primarily in South America. They are known for their social behavior, living in groups of up to 100, and feeding mainly on grasses. The genus name derives from Greek words meaning 'water pig', reflecting their habitat and appearance. The species range includes South America and parts of Central America, while their extinct relatives lived in various areas including A

In [43]:
print(result.output.title)
print(result.output.category)
print(result.output.summary)
print(result.output.details)

Hydrochoerus
Wikipedia
The genus Hydrochoerus contains two living species, the capybara and the lesser capybara, and three extinct species. Capybaras are the largest living rodents and are semiaquatic, found near lakes and rivers in South America.
Hydrochoerus, which translates to 'water pig' in Ancient Greek, includes the capybara (_Hydrochoerus hydrochaeris_), the largest rodent, and the lesser capybara (_Hydrochoerus isthmius_). They are known for their social behavior, living in groups of up to 100 and communicating through vocalizations. Their diet consists primarily of grasses, and they have a gestation period of 130–150 days with litters of 2 to 8 young. Fossil records show that extinct species of Hydrochoerus were present in North America during the Pliocene to Pleistocene eras.


In [54]:
question = "What are threats to capybara populations?"

result = await summary_agent.run(
        user_prompt=question,
        event_stream_handler=callback,
    )

TOOL CALL (Summarizer): search({"query":"threats to capybara populations"})


In [55]:
result.output.results     

'Capybaras face several threats to their populations, including habitat loss and hunting, but currently, their populations are considered stable.'