# Day 4: Agents and Tools

Today we are going to explore the ways we can enable agents to use tools.

In [12]:
#Let's add the markdown download code here
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name, branch='main', timeout=60):
    """
    Download and parse all markdown files from a GitHub repository.

    The repository is fetched as one zip archive from codeload.github.com;
    every ``.md`` / ``.mdx`` entry (case-insensitive) is parsed with
    ``frontmatter`` and returned as a dictionary.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name
        branch: Branch to download (defaults to 'main')
        timeout: Seconds ``requests`` waits on the connection / between
            received bytes before giving up (not a total download limit)

    Returns:
        List of dictionaries, one per markdown file: the result of
        ``frontmatter``'s ``to_dict()`` plus a 'filename' key holding the
        entry's path inside the archive

    Raises:
        Exception: If the download does not return HTTP 200
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/{branch}'
    # A timeout keeps a stalled connection from hanging the notebook forever
    resp = requests.get(url, timeout=timeout)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []

    # The context manager closes the archive even if an unexpected error
    # escapes the per-file handling below.
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename

            # Only markdown sources are of interest
            if not filename.lower().endswith(('.md', '.mdx')):
                continue

            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
            except Exception as e:
                # Best effort: report the unreadable file and keep going
                print(f"Error processing {filename}: {e}")
                continue

    return repository_data

In [13]:
print("Hello from course!")
# Download the three repositories used in this notebook (one HTTP request each).
balsam_faq = read_repo_data('Zesky665', 'balsam')
evidently_docs = read_repo_data('evidentlyai', 'docs')
dtc_faq = read_repo_data('DataTalksClub', 'faq')
    
# NOTE(review): the "FAQ documents" line reports balsam_faq; the size of dtc_faq is not printed.
print(f"FAQ documents: {len(balsam_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")
    

Hello from course!
FAQ documents: 3
Evidently documents: 95


In [4]:
# Now we can start chunking
# There are multiple ways of chunking a document.
# Here they are in order of complexity:
# - Simple Chunking
# - Token Based Chunking
# - Semantic Chunking
# - Paragraph Splitting
# - Section Splitting
# - AI-powered Splitting

In [14]:
# The most commonly used simple chunking method is sliding window, which is chunking with overlap.
def sliding_window(seq, size, step):
    """
    Cut a sequence into overlapping windows.

    Args:
        seq: Any sliceable sequence with a length (string, list, ...)
        size: Maximum length of each window; must be positive
        step: Distance between consecutive window starts; must be positive

    Returns:
        List of dicts with 'start' (offset into seq) and 'chunk' (the slice).
        The last window may be shorter than ``size``.

    Raises:
        ValueError: If size or step is not positive
    """
    if step <= 0 or size <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    for offset in range(0, total, step):
        end = offset + size
        windows.append({'start': offset, 'chunk': seq[offset:end]})
        if end >= total:
            # This window already reaches the end of the sequence
            break

    return windows

In [15]:
# Sliding-window chunking of every Evidently document:
# 2000-character windows that start every 1000 characters.
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    # Pop (rather than just read) the body so each chunk record carries only
    # its own window plus the document metadata, not the whole document again.
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        # Attach the remaining document fields (e.g. 'filename') to the chunk
        chunk.update(doc_copy)
    evidently_chunks.extend(chunks)

In [16]:
# Section Splitting
# Warm-up: split one sample document (index 45) into paragraphs,
# treating any run of blank / whitespace-only lines as the separator.
import re
text = evidently_docs[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())

In [17]:
import re

def split_markdown_by_level(text, level=2):
    """
    Break a markdown document into sections at headers of one level.

    Each returned section starts with its header line. Text that appears
    before the first matching header is not returned.

    :param text: Markdown text as a string
    :param level: Header level to split on
    :return: List of sections as strings
    """
    # Two capture groups: the marker ("## " for level 2) and the title text
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pieces = re.compile(header_pattern, re.MULTILINE).split(text)

    # split() with two groups yields
    # [preamble, marker, title, body, marker, title, body, ...]
    markers = pieces[1::3]
    titles = pieces[2::3]
    bodies = pieces[3::3]

    sections = []
    for marker, title, body in zip(markers, titles, bodies):
        heading = (marker + title).strip()
        body = body.strip()
        # A header with no text under it becomes a section of its own
        sections.append(f'{heading}\n\n{body}' if body else heading)

    return sections

In [18]:
# Rebuild evidently_chunks, this time with one record per level-2 markdown section.
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    # Remove the body from the copy so only metadata is duplicated per section
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [19]:
# LLM Based Chunking
import os
from mistralai import Mistral

# Read the API key from the environment; this raises KeyError when
# MISTRAL_API_KEY is not set.
api_key=os.environ["MISTRAL_API_KEY"]
model = "mistral-large-latest"

# Client object used by llm() in this cell
client = Mistral(api_key=api_key)

def llm(prompt, model="mistral-large-latest"):
    """
    Send a single user message to the Mistral chat API and return the reply.

    Args:
        prompt: Text sent as the content of the user message
        model: Name of the Mistral model to query

    Returns:
        The content of the first choice's message. The same text is also
        printed as a side effect.
    """
    chat_response = client.chat.complete(
        model=model,
        messages=[
            {
                "role": "user",
                # Forward the caller's prompt instead of a fixed sample question
                "content": prompt,
            },
        ]
    )
    answer = chat_response.choices[0].message.content
    print(answer)
    return answer



KeyError: 'MISTRAL_API_KEY'

In [20]:
# Prompt used for LLM-based chunking. The {document} placeholder is filled
# with str.format(); the model is asked to separate sections with '---'.
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

In [21]:
def intelligent_chunking(text):
    """
    Ask the LLM to split a document into self-contained sections.

    Args:
        text: Full document text; substituted into ``prompt_template``

    Returns:
        List of non-empty section strings with surrounding whitespace removed
    """
    import re  # local import so this cell does not depend on an earlier one

    # Send the filled-in prompt (with the document), not the bare template
    prompt = prompt_template.format(document=text)
    response = llm(prompt)

    # Split only on lines that consist of '---', as requested in the prompt.
    # A plain str.split('---') would also cut through markdown table rules
    # such as '|---|---|' inside a section.
    sections = re.split(r'^[ \t]*---[ \t\r]*$', response, flags=re.MULTILINE)
    return [s.strip() for s in sections if s.strip()]

In [22]:
from tqdm.auto import tqdm

# Rebuild evidently_chunks with LLM-produced sections.
# intelligent_chunking() issues one llm() call per document, so this loop
# makes as many API requests as there are documents.
evidently_chunks = []

for doc in tqdm(evidently_docs):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

  0%|          | 0/95 [00:00<?, ?it/s]

NameError: name 'llm' is not defined

In [None]:
# Search
# Text, Vector and Semantic

In [23]:
# Download the Evidently docs again and rebuild the sliding-window chunks
# (2000-character windows starting every 1000 characters) used by the
# search indexes below.
evidently_docs = read_repo_data('evidentlyai', 'docs')

evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    # Drop the full body from the metadata that is merged into each chunk
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    evidently_chunks.extend(chunks)

In [24]:
# Text Search
from minsearch import Index

# Text index over the Evidently chunks; text_fields names the record keys
# passed to minsearch as text fields, and no keyword fields are configured.
index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)

index.fit(evidently_chunks)

<minsearch.minsearch.Index at 0x115ffec90>

In [25]:
# Example lookup against the Evidently text index
query = 'What should be in a test dataset for AI evaluation?'
results = index.search(query)

In [26]:
# Vector Search
from sentence_transformers import SentenceTransformer
# Sentence-embedding model shared by all vector-search cells below
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

# NOTE(review): dtc_faq was already downloaded in the first data cell; this repeats the request.
dtc_faq = read_repo_data('DataTalksClub', 'faq')

# Keep only the entries whose path contains 'data-engineering'
de_dtc_faq = [d for d in dtc_faq if 'data-engineering' in d['filename']]

# Text index over the filtered FAQ (used later by text_search / hybrid search)
faq_index = Index(
    text_fields=["question", "content"],
    keyword_fields=[]
)

faq_index.fit(de_dtc_faq)

# Embed one sample record: question and answer joined into a single string
record = de_dtc_faq[2]
text = record['question'] + ' ' + record['content']
v_doc = embedding_model.encode(text)

In [27]:
# Embed a sample user question with the same model as the document
query = 'I just found out about the course. Can I enroll now?'
v_query = embedding_model.encode(query)

In [28]:
# Dot product of the query and document embeddings.
# NOTE(review): presumably equals cosine similarity for this '-cos-' model — confirm the vectors are normalized.
similarity = v_query.dot(v_doc)

In [30]:
from tqdm.auto import tqdm
import numpy as np

# Embed every FAQ record (question + answer text), one encode() call per record
faq_embeddings = []

for d in tqdm(de_dtc_faq):
    text = d['question'] + ' ' + d['content']
    v = embedding_model.encode(text)
    faq_embeddings.append(v)

# Stack the per-record vectors into a single array for VectorSearch.fit()
faq_embeddings = np.array(faq_embeddings)

  0%|          | 0/449 [00:00<?, ?it/s]

In [31]:
from minsearch import VectorSearch

# Vector index over the FAQ: embeddings and their source records are passed together
faq_vindex = VectorSearch()
faq_vindex.fit(faq_embeddings, de_dtc_faq)

<minsearch.vector.VectorSearch at 0x150cb93a0>

In [32]:
# Example vector lookup: embed the question, then search the FAQ vector index
query = 'Can I join the course now?'
q = embedding_model.encode(query)
results = faq_vindex.search(q)

In [33]:
# Embed every Evidently chunk (one encode() call per chunk) and index the vectors
evidently_embeddings = []

for d in tqdm(evidently_chunks):
    v = embedding_model.encode(d['chunk'])
    evidently_embeddings.append(v)

evidently_embeddings = np.array(evidently_embeddings)

evidently_vindex = VectorSearch()
evidently_vindex.fit(evidently_embeddings, evidently_chunks)

  0%|          | 0/575 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x1305b7e30>

In [34]:
# Hybrid Search
# Run the same question through both FAQ indexes and combine the hits.
query = 'Can I join the course now?'

text_results = faq_index.search(query, num_results=5)

q = embedding_model.encode(query)
vector_results = faq_vindex.search(q, num_results=5)

# Plain list concatenation: a record returned by both searches appears twice
final_results = text_results + vector_results

In [35]:
def text_search(query):
    """Query the FAQ text index and return its results (at most 5 requested)."""
    hits = faq_index.search(query, num_results=5)
    return hits

def vector_search(query):
    """Embed ``query`` and search the FAQ vector index (at most 5 results requested)."""
    query_vector = embedding_model.encode(query)
    hits = faq_vindex.search(query_vector, num_results=5)
    return hits

def hybrid_search(query):
    """
    Merge text-search and vector-search hits for ``query``.

    Text hits come first, followed by vector hits. When several hits share a
    'filename', only the first one encountered is kept.
    """
    # dicts preserve insertion order, and setdefault() never overwrites,
    # so the first hit per filename wins and the original order is kept.
    first_by_filename = {}
    for hit in text_search(query) + vector_search(query):
        first_by_filename.setdefault(hit['filename'], hit)

    return list(first_by_filename.values())

In [65]:
# Agents and Tool Use
import os
from mistralai import Mistral

# Never store credentials in the notebook: read the key from the environment
# and fall back to an interactive prompt when it is not set.
# NOTE: any key that was ever saved in this notebook or its history must be
# treated as leaked and rotated.
api_key = os.environ.get("MISTRAL_API_KEY")
if not api_key:
    from getpass import getpass
    api_key = getpass("MISTRAL_API_KEY: ")

model = "mistral-small-latest"

client = Mistral(api_key=api_key)

def llm(prompt, model="mistral-small-latest"):
    """
    Send ``prompt`` as a single user message to the Mistral chat API.

    Prints the content of the first choice's message and returns it.
    """
    user_message = {
        "role": "user",
        "content": prompt,
    }
    chat_response = client.chat.complete(model=model, messages=[user_message])
    reply = chat_response.choices[0].message.content
    print(reply)
    return reply

In [66]:
# Ask the model directly, with no tools and no course data available to it
user_prompt = "I just discovered the course, can I join now?"
resp = llm(user_prompt, model)
resp

Of course! You can join the course now. Here’s what you can do next:

1. **Enroll**: If the course is on a platform like Coursera, Udemy, or edX, simply click the "Enroll" or "Join Now" button.
2. **Access Materials**: Once enrolled, you’ll get immediate access to the course content (videos, readings, assignments, etc.).
3. **Start Learning**: Begin with the first module or lesson—most courses are self-paced, so you can go at your own speed.

If you have any questions about the course structure, prerequisites, or payment, feel free to ask! Welcome aboard—happy learning! 🎓


'Of course! You can join the course now. Here’s what you can do next:\n\n1. **Enroll**: If the course is on a platform like Coursera, Udemy, or edX, simply click the "Enroll" or "Join Now" button.\n2. **Access Materials**: Once enrolled, you’ll get immediate access to the course content (videos, readings, assignments, etc.).\n3. **Start Learning**: Begin with the first module or lesson—most courses are self-paced, so you can go at your own speed.\n\nIf you have any questions about the course structure, prerequisites, or payment, feel free to ask! Welcome aboard—happy learning! 🎓'

In [67]:
# NOTE(review): duplicates the earlier definition of text_search in this notebook.
def text_search(query):
    """Query the FAQ text index and return its results (at most 5 requested)."""
    hits = faq_index.search(query, num_results=5)
    return hits

In [68]:
# Tool description passed to the chat API: a function named "text_search"
# that takes one required string argument, "query" (matches text_search()).
text_search_tool = {
    "type": "function",
    "function": {
        "name": "text_search",
        "description": "Search the FAQ database",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query text to look up in the course FAQ."
                }
            },
            "required": ["query"],
            "additionalProperties": False
        }
    }
}

In [69]:
# Minimal system prompt for the first tool-calling experiment
system_prompt = """
You are a helpful assistant for a course.
"""

question = "I just discovered the course, can I join now?"

# Conversation so far: system instructions followed by the user's question
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

tools = [text_search_tool]

# NOTE(review): tool_choice="any" presumably forces the model to call a tool — confirm in the Mistral docs.
chat_response = client.chat.complete(
        model= model,
        messages = chat_messages,
        tools = tools,
        tool_choice = "any",
        parallel_tool_calls = False,
    )

In [70]:
chat_response

ChatCompletionResponse(id='b4c0d3ddc3d5433680f7431ad606c6c8', object='chat.completion', model='mistral-small-latest', usage=UsageInfo(prompt_tokens=104, completion_tokens=16, total_tokens=120, prompt_audio_seconds=Unset()), created=1759444129, choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content='', tool_calls=[ToolCall(function=FunctionCall(name='text_search', arguments='{"query": "can I join the course now"}'), id='zl9Jjin3B', type=None, index=0)], prefix=False, role='assistant'), finish_reason='tool_calls')])

In [71]:
import json

# Take the first tool call requested by the model
call = chat_response.choices[0].message.tool_calls[0]

# The arguments arrive as a JSON string; decode them and call the function
arguments = json.loads(call.function.arguments)
result = text_search(**arguments)

# Package the serialized result; only call_output["output"] is read by the
# follow-up request later in this notebook.
call_output = {
    "type": "function_call_output",
    "call_id": call.id,
    "output": json.dumps(result),
}

In [72]:
call_output

{'type': 'function_call_output',
 'call_id': 'zl9Jjin3B',
 'output': '[{"id": "3f1424af17", "question": "Course: Can I still join the course after the start date?", "sort_order": 3, "content": "Yes, even if you don\'t register, you\'re still eligible to submit the homework.\\n\\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don\'t leave everything for the last minute.", "filename": "faq-main/_questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md"}, {"id": "9e508f2212", "question": "Course: When does the course start?", "sort_order": 1, "content": "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\\n\\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\\n- D

In [75]:
# NOTE: this cell mutates chat_messages in place, so running it twice
# appends the assistant/tool pair twice.

# Append the assistant's message with tool calls
chat_messages.append({
    "role": "assistant",
    "tool_calls": [call]
})

# Append the tool result
chat_messages.append({
    "role": "tool",
    "name": call.function.name,
    "content": call_output["output"],
    "tool_call_id": call.id
})

# Second request: the conversation now includes the search results
chat_response = client.chat.complete(
        model= model,
        messages = chat_messages,
        tools = tools,
        tool_choice = "auto",
        parallel_tool_calls = False,
    )

print(chat_response)

id='c48ace4bd12a412b8b6fe8ef53c21976' object='chat.completion' model='mistral-small-latest' usage=UsageInfo(prompt_tokens=2624, completion_tokens=50, total_tokens=2674, prompt_audio_seconds=Unset()) created=1759444228 choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content="Yes, even if you don't register, you're still eligible to submit the homework.\n\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.", tool_calls=None, prefix=False, role='assistant'), finish_reason='stop')]


In [76]:
# System prompt
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

chat_response = client.chat.complete(
        model= model,
        messages = chat_messages,
        tools = tools,
        tool_choice = "auto",
        parallel_tool_calls = False,
    )

In [77]:
chat_response

ChatCompletionResponse(id='296c142f8ac440459fa59eb4f6cae741', object='chat.completion', model='mistral-small-latest', usage=UsageInfo(prompt_tokens=104, completion_tokens=16, total_tokens=120, prompt_audio_seconds=Unset()), created=1759445822, choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content='', tool_calls=[ToolCall(function=FunctionCall(name='text_search', arguments='{"query": "can I join the course now"}'), id='v2oNBJdjZ', type=None, index=0)], prefix=False, role='assistant'), finish_reason='tool_calls')])

In [79]:
# System prompt variant that tells the model to search before answering.
# NOTE(review): the next cell reassigns system_prompt, so this value is replaced when the notebook runs top to bottom.
system_prompt = """
You are a helpful assistant for a course.

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.
If the search doesn't return relevant results, let the user know and provide general guidance.
"""

In [80]:
# System prompt variant that encourages repeated searches; this is the value
# passed as `instructions` when the agent is created below.
system_prompt = """
You are a helpful assistant for a course.

Always search for relevant information before answering.
If the first search doesn't give you enough information, try different search terms.

Make multiple searches if needed to provide comprehensive answers.
"""

In [81]:
from typing import List, Any

def text_search(query: str) -> List[Any]:
    """
    Look up a query in the FAQ text index.

    Args:
        query (str): Text to search for.

    Returns:
        List[Any]: The search results from the FAQ index (at most 5 requested).
    """
    hits = faq_index.search(query, num_results=5)
    return hits

In [87]:
from pydantic_ai import Agent
import os
# Export the key through the environment.
# NOTE(review): presumably pydantic_ai's Mistral provider reads MISTRAL_API_KEY from there — confirm.
os.environ['MISTRAL_API_KEY'] = api_key

# Agent wired with the current system prompt and text_search as its only tool
agent = Agent(
    name="faq_agent",
    instructions=system_prompt,
    tools=[text_search],
    model='mistral:mistral-small-latest'
)

In [88]:
question = "I just discovered the course, can I join now?"

# Top-level await: relies on the notebook kernel's support for awaiting in a cell
result = await agent.run(user_prompt=question)

In [89]:
result

AgentRunResult(output="Yes, you can still join the course after the start date. You can submit homework even if you don't register, but be aware of the deadlines for homework and final projects. Don't leave everything for the last minute.")

In [90]:
result.new_messages()

[ModelRequest(parts=[UserPromptPart(content='I just discovered the course, can I join now?', timestamp=datetime.datetime(2025, 10, 2, 23, 5, 25, 669076, tzinfo=datetime.timezone.utc))], instructions="You are a helpful assistant for a course.\n\nAlways search for relevant information before answering.\nIf the first search doesn't give you enough information, try different search terms.\n\nMake multiple searches if needed to provide comprehensive answers."),
 ModelResponse(parts=[ToolCallPart(tool_name='text_search', args='{"query": "can I join the course now"}', tool_call_id='1Yo0fJswk')], usage=RequestUsage(input_tokens=179, output_tokens=16), model_name='mistral-small-latest', timestamp=datetime.datetime(2025, 10, 2, 23, 5, 25, tzinfo=TzInfo(UTC)), provider_name='mistral', provider_details={'finish_reason': 'tool_calls'}, provider_response_id='103cc7f4ee7543eaad28fbb1395481f8', finish_reason='tool_call'),
 ModelRequest(parts=[ToolReturnPart(tool_name='text_search', content=[{'id': '3f