# Day 5: Evaluation
Today we are going to explore the ways we can evaluate our agents.

In [1]:
#Let's add the markdown download code here
import io
import zipfile
import requests
import frontmatter

def read_repo_data(repo_owner, repo_name):
    """
    Download and parse all markdown files from a GitHub repository.

    Args:
        repo_owner: GitHub username or organization
        repo_name: Repository name

    Returns:
        List of dictionaries containing file content and metadata
    """
    prefix = 'https://codeload.github.com'
    url = f'{prefix}/{repo_owner}/{repo_name}/zip/refs/heads/main'
    resp = requests.get(url)

    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    zf = zipfile.ZipFile(io.BytesIO(resp.content))

    for file_info in zf.infolist():
        filename = file_info.filename
        filename_lower = filename.lower()

        if not (filename_lower.endswith('.md')
            or filename_lower.endswith('.mdx')):
            continue

        try:
            with zf.open(file_info) as f_in:
                content = f_in.read().decode('utf-8', errors='ignore')
                post = frontmatter.loads(content)
                data = post.to_dict()
                data['filename'] = filename
                repository_data.append(data)
        except Exception as e:
            print(f"Error processing {filename}: {e}")
            continue

    zf.close()
    return repository_data

In [3]:
print("Hello from course!")
balsam_faq = read_repo_data('Zesky665', 'balsam')
evidently_docs = read_repo_data('evidentlyai', 'docs')
dtc_faq = read_repo_data('DataTalksClub', 'faq')
    
print(f"FAQ documents: {len(balsam_faq)}")
print(f"Evidently documents: {len(evidently_docs)}")
    

Hello from course!
FAQ documents: 3
Evidently documents: 95


In [4]:
# Now we can start chunking
# There are multiple ways of chunking a document.
# Here they are in order of complexity:
# - Simple Chunking
# - Token Based Chunking
# - Sematinc Chunking
# - Paragrapgh Splitting
# - Section Splitting
# - AI-powered Splitting

In [4]:
# The most commonly used simple chunking method is sliding window, which is chunking with overlap.
def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    n = len(seq)
    result = []
    for i in range(0, n, step):
        chunk = seq[i:i+size]
        result.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break

    return result

In [5]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy['content']
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    evidently_chunks.extend(chunks)

In [6]:
# Section Splitting
import re
text = evidently_docs[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())

In [7]:
import re

def split_markdown_by_level(text, level=2):
    """
    Split markdown text by a specific header level.

    :param text: Markdown text as a string
    :param level: Header level to split on
    :return: List of sections as strings
    """
    # This regex matches markdown headers
    # For level 2, it matches lines starting with "## "
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # Split and keep the headers
    parts = pattern.split(text)

    sections = []
    for i in range(1, len(parts), 3):
        # We step by 3 because regex.split() with
        # capturing groups returns:
        # [before_match, group1, group2, after_match, ...]
        # here group1 is "## ", group2 is the header text
        header = parts[i] + parts[i+1]  # "## " + "Title"
        header = header.strip()

        # Get the content after this header
        content = ""
        if i+2 < len(parts):
            content = parts[i+2].strip()

        if content:
            section = f'{header}\n\n{content}'
        else:
            section = header
        sections.append(section)

    return sections

In [8]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [10]:
# LLM Based Chunking
import os
from mistralai import Mistral

# api_key=os.environ["MISTRAL_API_KEY"]
model = "mistral-large-latest"

client = Mistral(api_key=api_key)

def llm(prompt, model="mistral-large-latest"):
    chat_response = client.chat.complete(
        model= model,
        messages = [
            {
                "role": "user",
                "content": "What is the best French cheese?",
            },
        ]
    )
    print(chat_response.choices[0].message.content)
    return chat_response.choices[0].message.content



In [11]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()

In [12]:
def intelligent_chunking(text):
    prompt = prompt_template.format(document=text)
    response = llm(prompt_template)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections

In [13]:
from tqdm.auto import tqdm

evidently_chunks = []

for doc in tqdm(evidently_docs):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

  0%|          | 0/95 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Search
# Text, Vector and Sematic

In [14]:
evidently_docs = read_repo_data('evidentlyai', 'docs')

evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy)
    evidently_chunks.extend(chunks)

In [15]:
# Text Search
from minsearch import Index

index = Index(
    text_fields=["chunk", "title", "description", "filename"],
    keyword_fields=[]
)

index.fit(evidently_chunks)

<minsearch.minsearch.Index at 0x1115f3b60>

In [16]:
query = 'What should be in a test dataset for AI evaluation?'
results = index.search(query)

In [19]:
# Vector Search
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

dtc_faq = read_repo_data('DataTalksClub', 'faq')

de_dtc_faq = [d for d in dtc_faq if 'data-engineering' in d['filename']]

faq_index = Index(
    text_fields=["question", "content"],
    keyword_fields=[]
)

faq_index.fit(de_dtc_faq)

record = de_dtc_faq[2]
text = record['question'] + ' ' + record['content']
v_doc = embedding_model.encode(text)

In [20]:
query = 'I just found out about the course. Can I enroll now?'
v_query = embedding_model.encode(query)

In [21]:
similarity = v_query.dot(v_doc)

In [22]:
from tqdm.auto import tqdm
import numpy as np

faq_embeddings = []

for d in tqdm(de_dtc_faq):
    text = d['question'] + ' ' + d['content']
    v = embedding_model.encode(text)
    faq_embeddings.append(v)

faq_embeddings = np.array(faq_embeddings)

  0%|          | 0/449 [00:00<?, ?it/s]

In [23]:
from minsearch import VectorSearch

faq_vindex = VectorSearch()
faq_vindex.fit(faq_embeddings, de_dtc_faq)

<minsearch.vector.VectorSearch at 0x323864b00>

In [24]:
query = 'Can I join the course now?'
q = embedding_model.encode(query)
results = faq_vindex.search(q)

In [25]:
evidently_embeddings = []

for d in tqdm(evidently_chunks):
    v = embedding_model.encode(d['chunk'])
    evidently_embeddings.append(v)

evidently_embeddings = np.array(evidently_embeddings)

evidently_vindex = VectorSearch()
evidently_vindex.fit(evidently_embeddings, evidently_chunks)

  0%|          | 0/575 [00:00<?, ?it/s]

<minsearch.vector.VectorSearch at 0x323858cb0>

In [26]:
# Hybrid Search
query = 'Can I join the course now?'

text_results = faq_index.search(query, num_results=5)

q = embedding_model.encode(query)
vector_results = faq_vindex.search(q, num_results=5)

final_results = text_results + vector_results

In [27]:
def text_search(query):
    return faq_index.search(query, num_results=5)

def vector_search(query):
    q = embedding_model.encode(query)
    return faq_vindex.search(q, num_results=5)

def hybrid_search(query):
    text_results = text_search(query)
    vector_results = vector_search(query)

    # Combine and deduplicate results
    seen_ids = set()
    combined_results = []

    for result in text_results + vector_results:
        if result['filename'] not in seen_ids:
            seen_ids.add(result['filename'])
            combined_results.append(result)

    return combined_results

In [29]:
# Agents and Tool Use
import os
from mistralai import Mistral

# api_key= os.environ["MISTRAL_API_KEY"]
model = "mistral-small-latest"

client = Mistral(api_key=api_key)

def llm(prompt, model="mistral-small-latest"):
    chat_response = client.chat.complete(
        model= model,
        messages = [
            {
                "role": "user",
                "content": prompt,
            },
        ]
    )
    print(chat_response.choices[0].message.content)
    return chat_response.choices[0].message.content

In [30]:
user_prompt = "I just discovered the course, can I join now?"
resp = llm(user_prompt, model)
resp

Yes! You can join the course now. Most online courses allow you to enroll at any time, and you can start learning immediately. However, here are a few things to check:

1. **Course Start Date** – Some courses have fixed start dates (especially if they're instructor-led or cohort-based). If that's the case, you may need to wait for the next intake.
2. **Self-Paced vs. Scheduled** – If the course is self-paced, you can begin right away. If it's scheduled, check the next available session.
3. **Enrollment Deadline** – Some courses have a cutoff date for joining.

If you're unsure, check the course description or contact the course provider for details.


"Yes! You can join the course now. Most online courses allow you to enroll at any time, and you can start learning immediately. However, here are a few things to check:\n\n1. **Course Start Date** – Some courses have fixed start dates (especially if they're instructor-led or cohort-based). If that's the case, you may need to wait for the next intake.\n2. **Self-Paced vs. Scheduled** – If the course is self-paced, you can begin right away. If it's scheduled, check the next available session.\n3. **Enrollment Deadline** – Some courses have a cutoff date for joining.\n\nIf you're unsure, check the course description or contact the course provider for details."

In [31]:
def text_search(query):
    return faq_index.search(query, num_results=5)

In [32]:
text_search_tool = {
    "type": "function",
    "function": {
        "name": "text_search",
        "description": "Search the FAQ database",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query text to look up in the course FAQ."
                }
            },
            "required": ["query"],
            "additionalProperties": False
        }
    }
}

In [33]:
system_prompt = """
You are a helpful assistant for a course.
"""

question = "I just discovered the course, can I join now?"

chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

tools = [text_search_tool]

chat_response = client.chat.complete(
        model= model,
        messages = chat_messages,
        tools = tools,
        tool_choice = "any",
        parallel_tool_calls = False,
    )

In [34]:
chat_response

ChatCompletionResponse(id='83e53a81a7504ca3a985bbc30a2f99bf', object='chat.completion', model='mistral-small-latest', usage=UsageInfo(prompt_tokens=104, completion_tokens=16, total_tokens=120, prompt_audio_seconds=Unset()), created=1759521877, choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content='', tool_calls=[ToolCall(function=FunctionCall(name='text_search', arguments='{"query": "can I join the course now"}'), id='fUxnup9U2', type=None, index=0)], prefix=False, role='assistant'), finish_reason='tool_calls')])

In [35]:
import json

call = chat_response.choices[0].message.tool_calls[0]

arguments = json.loads(call.function.arguments)
result = text_search(**arguments)

call_output = {
    "type": "function_call_output",
    "call_id": call.id,
    "output": json.dumps(result),
}

In [36]:
call_output

{'type': 'function_call_output',
 'call_id': 'fUxnup9U2',
 'output': '[{"id": "3f1424af17", "question": "Course: Can I still join the course after the start date?", "sort_order": 3, "content": "Yes, even if you don\'t register, you\'re still eligible to submit the homework.\\n\\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don\'t leave everything for the last minute.", "filename": "faq-main/_questions/data-engineering-zoomcamp/general/003_3f1424af17_course-can-i-still-join-the-course-after-the-start.md"}, {"id": "9e508f2212", "question": "Course: When does the course start?", "sort_order": 1, "content": "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\\n\\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\\n- D

In [37]:
# Append the assistant's message with tool calls
chat_messages.append({
    "role": "assistant",
    "tool_calls": [call]
})

# Append the tool result
chat_messages.append({
    "role": "tool",
    "name": call.function.name,
    "content": call_output["output"],
    "tool_call_id": call.id
})

chat_response = client.chat.complete(
        model= model,
        messages = chat_messages,
        tools = tools,
        tool_choice = "auto",
        parallel_tool_calls = False,
    )

print(chat_response)

id='2af5cf5bd98f4a939682f365ba60b563' object='chat.completion' model='mistral-small-latest' usage=UsageInfo(prompt_tokens=946, completion_tokens=50, total_tokens=996, prompt_audio_seconds=Unset()) created=1759521881 choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content="Yes, even if you don't register, you're still eligible to submit the homework.\n\nBe aware, however, that there will be deadlines for turning in homeworks and the final projects. So don't leave everything for the last minute.", tool_calls=None, prefix=False, role='assistant'), finish_reason='stop')]


In [38]:
# System prompt
chat_messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": question}
]

chat_response = client.chat.complete(
        model= model,
        messages = chat_messages,
        tools = tools,
        tool_choice = "auto",
        parallel_tool_calls = False,
    )

In [39]:
chat_response

ChatCompletionResponse(id='7014c320879f408cafd86b3f2ed0ff5d', object='chat.completion', model='mistral-small-latest', usage=UsageInfo(prompt_tokens=104, completion_tokens=16, total_tokens=120, prompt_audio_seconds=Unset()), created=1759521884, choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content='', tool_calls=[ToolCall(function=FunctionCall(name='text_search', arguments='{"query": "can I join the course now"}'), id='fX8tZUjTf', type=None, index=0)], prefix=False, role='assistant'), finish_reason='tool_calls')])

In [40]:
system_prompt = """
You are a helpful assistant for a course.

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.
If the search doesn't return relevant results, let the user know and provide general guidance.
"""

In [41]:
system_prompt = """
You are a helpful assistant for a course.

Always search for relevant information before answering.
If the first search doesn't give you enough information, try different search terms.

Make multiple searches if needed to provide comprehensive answers.
"""

In [42]:
from typing import List, Any

def text_search(query: str) -> List[Any]:
    """
    Perform a text-based search on the FAQ index.

    Args:
        query (str): The search query string.

    Returns:
        List[Any]: A list of up to 5 search results returned by the FAQ index.
    """
    return faq_index.search(query, num_results=5)

In [44]:
from pydantic_ai import Agent
import os
os.environ['MISTRAL_API_KEY'] = api_key

agent = Agent(
    name="faq_agent",
    instructions=system_prompt,
    tools=[text_search],
    model='mistral:mistral-small-latest'
)

In [45]:
question = "I just discovered the course, can I join now?"

result = await agent.run(user_prompt=question)

In [46]:
result

AgentRunResult(output="Yes, you can still join the course after the start date. You can submit homework even if you don't register, but be aware of the deadlines for homework and final projects. Don't leave everything for the last minute.")

In [47]:
result.new_messages()

[ModelRequest(parts=[UserPromptPart(content='I just discovered the course, can I join now?', timestamp=datetime.datetime(2025, 10, 3, 20, 5, 6, 369912, tzinfo=datetime.timezone.utc))], instructions="You are a helpful assistant for a course.\n\nAlways search for relevant information before answering.\nIf the first search doesn't give you enough information, try different search terms.\n\nMake multiple searches if needed to provide comprehensive answers."),
 ModelResponse(parts=[ToolCallPart(tool_name='text_search', args='{"query": "can I join the course now"}', tool_call_id='X6DOnnioH')], usage=RequestUsage(input_tokens=179, output_tokens=16), model_name='mistral-small-latest', timestamp=datetime.datetime(2025, 10, 3, 20, 5, 6, tzinfo=TzInfo(UTC)), provider_name='mistral', provider_details={'finish_reason': 'tool_calls'}, provider_response_id='5dbb23aafd1b433bbcfe14f15bd22435', finish_reason='tool_call'),
 ModelRequest(parts=[ToolReturnPart(tool_name='text_search', content=[{'id': '3f14

In [48]:
# Logging

In [116]:
from typing import List, Any
from pydantic_ai import Agent


def text_search(query: str) -> List[Any]:
    """
    Perform a text-based search on the FAQ index.

    Args:
        query (str): The search query string.

    Returns:
        List[Any]: A list of up to 5 search results returned by the FAQ index.
    """
    return faq_index.search(query, num_results=5)


system_prompt = """
You are a helpful assistant for a  course.

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.
If the search doesn't return relevant results, let the user know and provide general guidance.
"""

from pydantic_ai import Agent

agent = Agent(
    name="faq_agent",
    instructions=system_prompt,
    tools=[text_search],
    model='mistral:mistral-small-latest'
)

In [117]:
question = "how do I install Kafka in Python?"
result = await agent.run(user_prompt=question)

In [118]:
from pydantic_ai.messages import ModelMessagesTypeAdapter


def log_entry(agent, messages, source="user"):
    tools = []

    for ts in agent.toolsets:
        tools.extend(ts.tools.keys())

    dict_messages = ModelMessagesTypeAdapter.dump_python(messages)

    return {
        "agent_name": agent.name,
        "system_prompt": agent._instructions,
        "provider": agent.model.system,
        "model": agent.model.model_name,
        "tools": tools,
        "messages": dict_messages,
        "source": source
    }

In [120]:
import json
import secrets
from pathlib import Path
from datetime import datetime


LOG_DIR = Path('logs')
LOG_DIR.mkdir(exist_ok=True)


def serializer(obj):
    if isinstance(obj, datetime):
        return obj.isoformat()
    raise TypeError(f"Type {type(obj)} not serializable")


def log_interaction_to_file(agent, messages, source='user'):
    entry = log_entry(agent, messages, source)

    ts = entry['messages'][-1]['timestamp']
    ts_str = ts.strftime("%Y%m%d_%H%M%S")
    rand_hex = secrets.token_hex(3)

    filename = f"{agent.name}_{ts_str}_{rand_hex}.json"
    filepath = LOG_DIR / filename

    with filepath.open("w", encoding="utf-8") as f_out:
        json.dump(entry, f_out, indent=2, default=serializer)

    return filepath

In [121]:
question = input()
result = await agent.run(user_prompt=question)
print(result.output)
log_interaction_to_file(agent, result.new_messages())

I apologize, but I couldn't find specific information on how to run Flink in Python from the course materials. However, I can provide you with general guidance on the topic.

Apache Flink is a powerful framework for distributed stream processing, and while it is primarily written in Java and Scala, it also has support for Python through PyFlink. Here are the general steps to run Flink in Python:

1. **Install PyFlink**: You can install PyFlink using pip. Open your terminal or command prompt and run:
   ```bash
   pip install apache-flink
   ```

2. **Set Up Your Environment**: Make sure you have Java installed, as Flink requires it. You can download and install Java from the [Oracle website](https://www.oracle.com/java/technologies/javase-downloads.html) or use an open-source version like OpenJDK.

3. **Write Your Flink Program**: Create a Python script for your Flink program. Here is a simple example:
   ```python
   from pyflink.datastream import StreamExecutionEnvironment
   from py

PosixPath('logs/faq_agent_20251003_202910_e92585.json')

In [122]:
system_prompt = """
You are a helpful assistant for a course.

Use the search tool to find relevant information from the course materials before answering questions.

If you can find specific information through search, use it to provide accurate answers.

Always include references by citing the filename of the source material you used.
When citing the reference, replace "faq-main" by the full path to the GitHub repository: "https://github.com/DataTalksClub/faq/blob/main/"
Format: [LINK TITLE](FULL_GITHUB_LINK)

If the search doesn't return relevant results, let the user know and provide general guidance.
""".strip()

# Create another version of agent, let's call it faq_agent_v2
agent = Agent(
    name="faq_agent_v2",
    instructions=system_prompt,
    tools=[text_search],
    model='mistral:mistral-small-latest'
)

In [123]:
evaluation_prompt = """
Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).
We also include the entire log (<LOG>) for analysis.

For each item, check if the condition is met.

Checklist:

- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)
- instructions_avoid: The agent avoided doing things it was told not to do
- answer_relevant: The response directly addresses the user's question
- answer_clear: The answer is clear and correct
- answer_citations: The response includes proper citations or sources when required
- completeness: The response is complete and covers all key aspects of the request
- tool_call_search: Is the search tool invoked?

Output true/false for each check and provide a short explanation for your judgment.
""".strip()

In [124]:
# Create another version of agent, let's call it faq_agent_v2
agent = Agent(
    name="faq_agent_v3",
    instructions=evaluation_prompt,
    tools=[text_search],
    model='mistral:mistral-small-latest'
)

In [125]:
from pydantic import BaseModel

class EvaluationCheck(BaseModel):
    check_name: str
    justification: str
    check_pass: bool

class EvaluationChecklist(BaseModel):
    checklist: list[EvaluationCheck]
    summary: str

In [126]:
eval_agent = Agent(
    name='eval_agent',
    model='mistral:mistral-small-latest',
    instructions=evaluation_prompt,
    output_type=EvaluationChecklist
)

In [127]:
user_prompt_format = """
<INSTRUCTIONS>{instructions}</INSTRUCTIONS>
<QUESTION>{question}</QUESTION>
<ANSWER>{answer}</ANSWER>
<LOG>{log}</LOG>
""".strip()

In [128]:
def load_log_file(log_file):
    with open(log_file, 'r') as f_in:
        log_data = json.load(f_in)
        log_data['log_file'] = log_file
        return log_data

In [129]:
log_record = load_log_file('/Users/zhare/Documents/GitHub/ai_agent_crash_course/aihero/Day_5/logs/faq_agent_20251003_202910_e92585.json')

instructions = log_record['system_prompt']
question = log_record['messages'][0]['parts'][0]['content']
answer = log_record['messages'][-1]['parts'][0]['content']
log = json.dumps(log_record['messages'])

user_prompt = user_prompt_format.format(
    instructions=instructions,
    question=question,
    answer=answer,
    log=log
)

In [130]:
result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)

checklist = result.output
print(checklist.summary)

for check in checklist.checklist:
    print(check)

The agent's response is highly evaluated. It followed instructions, provided a relevant, clear, and complete answer, included proper citations, and used the search tool effectively.
check_name='instructions_follow' justification='The agent used the search tool to find relevant information from the course materials before answering the question.' check_pass=True
check_name='instructions_avoid' justification='The agent did not do anything it was told not to do.' check_pass=True
check_name='answer_relevant' justification="The response directly addresses the user's question about running Flink in Python." check_pass=True
check_name='answer_clear' justification='The answer is clear and provides a step-by-step guide on how to run Flink in Python.' check_pass=True
check_name='answer_citations' justification='The response includes links to official documentation for further reference.' check_pass=True
check_name='completeness' justification='The response covers all key aspects of the request, 

In [131]:
def simplify_log_messages(messages):
    log_simplified = []

    for m in messages:
        parts = []

        for original_part in m['parts']:
            part = original_part.copy()
            kind = part['part_kind']

            if kind == 'user-prompt':
                del part['timestamp']
            if kind == 'tool-call':
                del part['tool_call_id']
            if kind == 'tool-return':
                del part['tool_call_id']
                del part['metadata']
                del part['timestamp']
                # Replace actual search results with placeholder to save tokens
                part['content'] = 'RETURN_RESULTS_REDACTED'
            if kind == 'text':
                del part['id']

            parts.append(part)

        message = {
            'kind': m['kind'],
            'parts': parts
        }

        log_simplified.append(message)
    return log_simplified

In [133]:
async def evaluate_log_record(eval_agent, log_record):
    messages = log_record['messages']

    instructions = log_record['system_prompt']
    question = messages[0]['parts'][0]['content']
    answer = messages[-1]['parts'][0]['content']

    log_simplified = simplify_log_messages(messages)
    log = json.dumps(log_simplified)

    user_prompt = user_prompt_format.format(
        instructions=instructions,
        question=question,
        answer=answer,
        log=log
    )

    result = await eval_agent.run(user_prompt, output_type=EvaluationChecklist)
    return result.output


log_record = load_log_file('/Users/zhare/Documents/GitHub/ai_agent_crash_course/aihero/Day_5/logs/faq_agent_20251003_202910_e92585.json')
eval1 = await evaluate_log_record(eval_agent, log_record)

In [134]:
question_generation_prompt = """
You are helping to create test questions for an AI agent that answers questions about a data engineering course.

Based on the provided FAQ content, generate realistic questions that students might ask.

The questions should:

- Be natural and varied in style
- Range from simple to complex
- Include both specific technical questions and general course questions

Generate one question for each record.
""".strip()

class QuestionsList(BaseModel):
    questions: list[str]

question_generator = Agent(
    name="question_generator",
    instructions=question_generation_prompt,
    model='mistral:mistral-small-latest',
    output_type=QuestionsList
)

In [135]:
import random

sample = random.sample(de_dtc_faq, 10)
prompt_docs = [d['content'] for d in sample]
prompt = json.dumps(prompt_docs)

result = await question_generator.run(prompt)
questions = result.output.questions

In [136]:
from tqdm.auto import tqdm

for q in tqdm(questions):
    print(q)

    result = await agent.run(user_prompt=q)
    print(result.output)

    log_interaction_to_file(
        agent,
        result.new_messages(),
        source='ai-generated'
    )

    print()

  0%|          | 0/10 [00:00<?, ?it/s]

How do I create a new branch to edit in dbt? Are there any resources available to guide me through this process?
To create a new branch in dbt (data build tool), you'll typically want to use Git, the version control system, rather than dbt itself. Here's a step-by-step guide:

1. **Open your terminal or command prompt**: Navigate to your dbt project directory.

2. **Check your current branch**: Before creating a new branch, it's good practice to check which branch you're currently on. You can do this by using the command `git branch`. The current branch will be highlighted with an asterisk (*).

3. **Create a new branch**: To create a new branch, use the command `git checkout -b <branch-name>`. Replace `<branch-name>` with the name you want to give to your new branch. This command will create a new branch and switch you to it.

4. **Push the new branch to the remote repository**: If you want to push your new branch to the remote repository (e.g., GitHub, GitLab, Bitbucket), use the com

In [143]:
eval_set = []

for log_file in LOG_DIR.glob('*.json'):
    if 'faq_agent_v3' not in log_file.name:
        continue

    log_record = load_log_file(log_file)
    if log_record['source'] != 'ai-generated':
        continue

    eval_set.append(log_record)

In [144]:
eval_set

[{'agent_name': 'faq_agent_v3',
  'system_prompt': ["Use this checklist to evaluate the quality of an AI agent's answer (<ANSWER>) to a user question (<QUESTION>).\nWe also include the entire log (<LOG>) for analysis.\n\nFor each item, check if the condition is met.\n\nChecklist:\n\n- instructions_follow: The agent followed the user's instructions (in <INSTRUCTIONS>)\n- instructions_avoid: The agent avoided doing things it was told not to do\n- answer_relevant: The response directly addresses the user's question\n- answer_clear: The answer is clear and correct\n- answer_citations: The response includes proper citations or sources when required\n- completeness: The response is complete and covers all key aspects of the request\n- tool_call_search: Is the search tool invoked?\n\nOutput true/false for each check and provide a short explanation for your judgment."],
  'provider': 'mistral',
  'model': 'mistral-small-latest',
  'tools': ['text_search'],
  'messages': [{'parts': [{'content':

In [145]:
eval_results = []

for log_record in tqdm(eval_set):
    eval_result = await evaluate_log_record(eval_agent, log_record)
    eval_results.append((log_record, eval_result))

  0%|          | 0/10 [00:00<?, ?it/s]

In [146]:
rows = []

for log_record, eval_result in eval_results:
    messages = log_record['messages']

    row = {
        'file': log_record['log_file'].name,
        'question': messages[0]['parts'][0]['content'],
        'answer': messages[-1]['parts'][0]['content'],
    }

    checks = {c.check_name: c.check_pass for c in eval_result.checklist}
    row.update(checks)

    rows.append(row)

In [147]:
import pandas as pd

df_evals = pd.DataFrame(rows)

In [148]:
df_evals.mean(numeric_only=True)

instructions_follow    1.0
instructions_avoid     0.9
answer_relevant        1.0
answer_clear           1.0
answer_citations       0.5
completeness           0.8
tool_call_search       0.2
dtype: float64