In [1]:
import sys
# Put the parent directory first on the import path so that modules located
# one level above this notebook can be imported.
sys.path.insert(0, '..')

In [2]:
import main
# Module-level agent object exposed by `main`; the cells below call `agent.run(...)` on it.
agent = main.agent

In [3]:
import pandas as pd

# Load the ground-truth question set from CSV.
df_ground_truth = pd.read_csv('ground_truth_evidently.csv')
# Convert to a list with one dict per CSV row (column name -> value).
ground_truth = df_ground_truth.to_dict(orient='records')

In [5]:
# Pick a single ground-truth record to use as a quick manual test case.
q = ground_truth[10]
print(q)

{'question': 'column type defaults Evidently', 'summary_answer': 'The article lists default column types applied during automated mapping when no explicit `DataDefinition` is provided, enhancing user understanding.', 'difficulty': 'beginner', 'intent': 'text', 'filename': 'docs/library/data_definition.mdx'}


In [6]:
# Smoke test: run the agent on one question and print the formatted article.
# (Top-level `await` works here because the cell runs inside IPython/Jupyter.)

result = await agent.run(q['question'])
print(result.output.format_article())

# Evidently's Default Column Types and Configurations

## Default Column Types in Evaluations

In Evidently, when creating evaluations, the library uses reasonable defaults for various column types. These defaults allow users to run evaluations with minimal setup, including supporting both structured tabular data and workflows for machine learning (ML) and large language models (LLMs). The system is designed around the concept of presets that include pre-configured column types, metrics, and evaluations, making it easier for users to focus on specific analysis without requiring extensive configuration. Users can also customize these defaults based on their particular datasets and evaluation needs.

### References

- Evidently Overview (https://github.com/evidentlyai/docs/blob/main/faq/why_evidently.mdx)

- Evaluations in Evidently (https://github.com/evidentlyai/docs/blob/main/metrics/introduction.mdx)



## Configuration of Column Types

Evidently allows for a modular approach to data

In [7]:
# Work with a random sample of 50 questions instead of the entire dataset.
import random
# Seed the global RNG so that, for the same ground_truth list, re-running
# this cell selects the same 50 records.
random.seed(1)

# random.sample draws without replacement; it raises ValueError if
# ground_truth holds fewer than 50 records.
ground_truth_sample = random.sample(ground_truth, 50)

In [8]:
# Persist the sampled questions to 'sample.bin' (pickle format) —
# presumably so later runs can reuse exactly the same sample; confirm with the author.
import pickle

with open('sample.bin', 'wb') as f_out:
    pickle.dump(ground_truth_sample, f_out)

The plan is to evaluate the agent against all ground truth data.

But what if it breaks while evaluating? It'd be a pity if it broke at 80% with a network error (a timeout or something like that) and we had to re-run the whole thing.

So let's put things into a try/except block:

In [9]:
import traceback

async def run_agent(q):
    """Run the agent on one ground-truth record without aborting the batch on failure.

    Args:
        q: ground-truth record (dict); its 'question' value is sent to the agent.

    Returns:
        ``(q, result)`` when ``agent.run`` succeeds, or ``(None, None)`` when it
        raises. In the failure case the error and traceback are printed so the
        problem stays visible in the cell output.
    """
    try:
        result = await agent.run(q['question'])
        return (q, result)
    except Exception:
        # Deliberately `except Exception` instead of a bare `except:`.
        # asyncio.CancelledError and KeyboardInterrupt derive from BaseException,
        # so they still propagate and a running evaluation can be interrupted
        # or cancelled instead of being silently reported as a failed question.
        print(f'error processing {q}')
        traceback.print_exc()
        return (None, None)

### Parallel Processing Setup

To efficiently process multiple queries, we'll run them asynchronously, with a cap on the number of requests that are in flight at the same time.

In [10]:
import asyncio
from tqdm.auto import tqdm

async def map_progress(seq, f, max_concurrency=6):
    """Apply the async function ``f`` to every element of ``seq``, showing a progress bar.

    At most ``max_concurrency`` calls of ``f`` run at the same time. Results
    are collected as calls finish, so the returned list is in completion
    order, not in the order of ``seq``.
    """
    limiter = asyncio.Semaphore(max_concurrency)

    async def bounded_call(item):
        # Wait for a free slot before invoking f.
        async with limiter:
            return await f(item)

    # One coroutine per element, yielded back in the order they finish.
    finished = asyncio.as_completed([bounded_call(item) for item in seq])

    # The progress bar advances once per completed call.
    return [await next_done for next_done in tqdm(finished, total=len(seq))]

### Initial evaluation of the sample dataset

In [11]:
# Run the agent over the sampled questions (map_progress defaults to at most
# 6 concurrent calls). Each entry is the tuple returned by run_agent.
all_results = await map_progress(ground_truth_sample, run_agent)

  0%|          | 0/50 [00:00<?, ?it/s]

error processing {'question': 'validating SQL syntax', 'summary_answer': 'Lists the IsValidSQL() function, which validates if submitted SQL queries are syntactically correct without executing them.', 'difficulty': 'intermediate', 'intent': 'code', 'filename': 'metrics/all_descriptors.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'using trace_event decorator', 'summary_answer': 'You can use the `trace_event` decorator to collect traces for specific functions, with examples provided for logging various function arguments.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'docs/platform/tracing_setup.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'automatic column mapping Evidently', 'summary_answer': 'The article details how to use an empty `DataDefinition()` for automatic mapping of columns by type and name when creating a `Dataset` object.', 'difficulty': 'intermediate', 'intent': 'text', 'filename': 'docs/library/data_definition.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'data drift monitoring evidently', 'summary_answer': 'The Evidently library includes features to detect data drift, allowing users to monitor changes in data distribution and maintain model reliability.', 'difficulty': 'intermediate', 'intent': 'code', 'filename': 'docs/library/overview.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'Evidently Dataset setup example', 'summary_answer': 'It provides a step-by-step setup for creating a `Dataset` object using a `DataDefinition`, including code snippets to guide implementation.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'docs/library/data_definition.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'importance of precision-recall curve', 'summary_answer': 'The article elaborates on the precision-recall curve and its relevance in understanding the balance between precision and recall for different classification thresholds.', 'difficulty': 'beginner', 'intent': 'text', 'filename': 'metrics/explainer_classification.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

### Processing Results for Analysis

When we run it on many queries, we can spot some problems. For example, for some queries the agent is making too many search queries.

In [12]:
# create a helper function to simplify the message structure

import json

def simplify_messages(messages):
    """Flatten agent messages into one list of small dicts, one per message part.

    Every returned dict has a 'kind' key (the part's ``part_kind``). In addition:

    * 'user-prompt' and 'text' parts carry 'content';
    * 'tool-call' parts carry 'tool_name' and 'args' (always a parsed object);
    * 'tool-return' parts and calls to the 'final_result' tool are dropped;
    * any other kind is kept with only the 'kind' key.

    Args:
        messages: iterable of objects exposing ``.parts``; each part exposes
            ``.part_kind`` plus the attributes listed above for its kind.

    Returns:
        list[dict]: the simplified parts, in their original order.
    """
    messages_simplified = []

    for m in messages:
        for original_part in m.parts:
            kind = original_part.part_kind

            if kind == 'tool-return':
                # Tool output is not needed for the analysis.
                continue

            part = {'kind': kind}

            if kind in ('user-prompt', 'text'):
                part['content'] = original_part.content
            elif kind == 'tool-call':
                if original_part.tool_name == 'final_result':
                    # The structured final answer is not a real search/tool call.
                    continue

                part['tool_name'] = original_part.tool_name

                # Tool-call args may arrive as a JSON string, as an already
                # parsed dict, or be missing/empty; json.loads only accepts
                # the first form, so normalise all of them here.
                raw_args = original_part.args
                if isinstance(raw_args, (str, bytes, bytearray)):
                    part['args'] = json.loads(raw_args) if raw_args.strip() else {}
                elif raw_args is None:
                    part['args'] = {}
                else:
                    part['args'] = raw_args

            messages_simplified.append(part)

    return messages_simplified

In [13]:
# Now let's count the number of tool calls to understand agent behavior

def count_tool_calls(messages):
    """Return the number of simplified message parts whose 'kind' is 'tool-call'."""
    return sum(1 for part in messages if part['kind'] == 'tool-call')

In [14]:
# process all the records

def process_result(q, result):
    """Build one log row (a dict) from a question record and the agent's result.

    The row holds the question text, the formatted answer, the simplified
    messages, the number of tool calls among them, and the untouched
    question record and result object for later inspection.
    """
    question_text = q['question']
    article = result.output.format_article()
    simplified = simplify_messages(result.new_messages())

    # Key order is kept as-is: it determines the column order of the DataFrame
    # built from these rows.
    return {
        'question': question_text,
        'answer': article,
        'messages': simplified,
        'num_tool_calls': count_tool_calls(simplified),
        'original_question': q,
        'original_result': result,
    }


# Collect one processed row per question the agent answered successfully.
rows = []

for q, result in all_results:
    # run_agent returns (None, None) when the agent call raised; skip those.
    if result is None:
        continue

    row = process_result(q, result)
    rows.append(row)

In [15]:
# One DataFrame row per processed result; columns come from the keys set in process_result.
df_logs = pd.DataFrame(rows)

### Identifying Performance Issues

During our analysis, we discovered a problem: when the agent can't find something, it keeps searching and searching.

We need to stop it and have it say explicitly: "I can't find the information you're asking for". To address this, we'll ask it to limit its search to 6 queries. If it still can't find anything, we'll ask it to simply say so.