In [1]:
import sys
# Put the parent directory first on the module search path
# (presumably so that the `import main` in the next cell resolves — confirm).
sys.path.insert(0, '..')

In [2]:
import main
# Module-level shorthand for the agent object exposed by `main`;
# later cells call `agent.run(...)` on it.
agent = main.agent

In [3]:
import pandas as pd

# Load the ground-truth CSV and convert it to a list of dicts, one dict per row.
df_ground_truth = pd.read_csv('ground_truth_evidently.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [4]:
# Pick a single record (index 10) to inspect before running anything in bulk.
q = ground_truth[10]
print(q)

{'question': 'column type defaults Evidently', 'summary_answer': 'The article lists default column types applied during automated mapping when no explicit `DataDefinition` is provided, enhancing user understanding.', 'difficulty': 'beginner', 'intent': 'text', 'filename': 'docs/library/data_definition.mdx'}


In [6]:
# Smoke test: run the agent on the single question selected in `q`
# and print its formatted output.

result = await agent.run(q['question'])
print(result.output.format_article())

# Evidently Column Type Defaults

## Column Type Defaults in Evidently

In Evidently, column types can be assigned defaults automatically based on the data input, allowing users to run evaluations with minimal configuration. Each column type can have a default inferred from the data provided or be manually specified. The library supports a variety of data types including numerical, categorical, textual, and more, promoting flexible integration with different datasets and evaluation needs. Evaluations in Evidently utilize these column types to assess model performance and data quality effectively.

### References

- Why Evidently? (https://github.com/evidentlyai/docs/blob/main/faq/why_evidently.mdx)

- Evidently Cloud (https://github.com/evidentlyai/docs/blob/main/docs/setup/cloud.mdx)



## All References

- Why Evidently? (faq/why_evidently.mdx)

- Evidently Cloud (docs/setup/cloud.mdx)



### Preparing Sample Data

In [7]:
# Work with a random sample of 50 questions instead of the entire dataset.

import random
# Fixed seed so the sample is reproducible; note this reseeds the global RNG.
random.seed(1)

# random.sample draws without replacement.
ground_truth_sample = random.sample(ground_truth, 50)

In [8]:
# Persist the sampled questions as a pickle file ('sample.bin', relative to
# the current working directory).

import pickle

with open('sample.bin', 'wb') as f_out:
    pickle.dump(ground_truth_sample, f_out)

### Error Handling

The plan is to evaluate the agent against all ground truth data.

But what if it breaks while evaluating? It would be a pity if it broke at 80% with a network error (a timeout or something similar) and we had to re-run the whole thing.

So let's put things into a try/except block:

In [9]:
import traceback

async def run_agent(q):
    """Run the agent on one ground-truth record without letting a failure abort the batch.

    Args:
        q: Ground-truth record; only ``q['question']`` is read and passed to
            ``agent.run``.

    Returns:
        ``(q, result)`` when the agent call succeeds, or ``(None, None)`` when it
        raises, so callers can detect and skip failed questions.
    """
    try:
        result = await agent.run(q['question'])
        return (q, result)
    # Catch Exception rather than using a bare ``except``: a bare clause also
    # traps asyncio.CancelledError and KeyboardInterrupt, which would make the
    # whole batch impossible to cancel or interrupt.
    except Exception:
        print(f'error processing {q}')
        traceback.print_exc()
        return (None, None)

### Parallel Processing Setup

To process multiple queries efficiently, we'll run them asynchronously, with a cap on how many run at the same time.

In [10]:
import asyncio
from tqdm.auto import tqdm

async def map_progress(seq, f, max_concurrency=6):
    """Apply the async callable ``f`` to every element of ``seq`` with a progress bar.

    At most ``max_concurrency`` calls of ``f`` run at the same time. The returned
    list holds the results in the order the calls finish, which may differ from
    the order of ``seq``.
    """
    limiter = asyncio.Semaphore(max_concurrency)

    async def guarded(item):
        # hold one semaphore slot for the whole duration of the call
        async with limiter:
            return await f(item)

    # as_completed yields awaitables in completion order
    pending = asyncio.as_completed([guarded(item) for item in seq])

    # await each one as tqdm advances, so the bar moves once per finished call
    return [await finished for finished in tqdm(pending, total=len(seq))]

### Initial evaluation of sample dataset

In [12]:
# Evaluate the agent on the sample; max_concurrency is left at map_progress's default.
all_results = await map_progress(ground_truth_sample, run_agent)

  0%|          | 0/50 [00:00<?, ?it/s]

error processing {'question': 'importance of column names in LLM evaluation', 'summary_answer': 'The article emphasizes how defining aliases for column names impacts the results returned by the LLM evaluations in Evidently.', 'difficulty': 'intermediate', 'intent': 'text', 'filename': 'metrics/customize_llm_judge.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'comparison of old and new LLM responses', 'summary_answer': 'The article illustrates processes for comparing LLM outputs before and after changes to assess for significant variations.', 'difficulty': 'intermediate', 'intent': 'text', 'filename': 'examples/LLM_regression_testing.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'Evidently Python library examples', 'summary_answer': 'The article provides insights into using the Evidently Python library for running local evaluations and uploading datasets, along with descriptions of its functionalities.', 'difficulty': 'intermediate', 'intent': 'code', 'filename': 'docs/platform/overview.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'how to use evidently for model evaluation', 'summary_answer': 'It details how Evidently provides over 100 built-in evaluations, allowing users to run assessments without preparing metrics from scratch, making the evaluation process straightforward.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'faq/why_evidently.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'customize drift tests in Evidently', 'summary_answer': 'A brief mention is made of customizing drift tests, allowing users to adjust methods and thresholds for tailored data evaluations.', 'difficulty': 'advanced', 'intent': 'text', 'filename': 'quickstart_ml.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'automatic column mapping Evidently', 'summary_answer': 'The article details how to use an empty `DataDefinition()` for automatic mapping of columns by type and name when creating a `Dataset` object.', 'difficulty': 'intermediate', 'intent': 'text', 'filename': 'docs/library/data_definition.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'HuggingFace text detection capabilities', 'summary_answer': 'It describes how to use HuggingFace for text detection, particularly focusing on detecting GPT-2 generated text and relevant parameters.', 'difficulty': 'advanced', 'intent': 'code', 'filename': 'metrics/customize_hf_descriptor.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'Evidently Dataset setup example', 'summary_answer': 'It provides a step-by-step setup for creating a `Dataset` object using a `DataDefinition`, including code snippets to guide implementation.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'docs/library/data_definition.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'using LLM for text evaluation', 'summary_answer': 'You can use built-in LLM-based descriptors to evaluate text and return scores or labels based on external language model outputs.', 'difficulty': 'intermediate', 'intent': 'code', 'filename': 'docs/library/descriptors.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'how to set up Evidently for data drift', 'summary_answer': 'It details the steps to set up your environment and install the Evidently library needed for data drift evaluation.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'quickstart_ml.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'exporting results from descriptors', 'summary_answer': 'Results from evaluations with descriptors can be exported or summarized, allowing for further analysis or reporting.', 'difficulty': 'intermediate', 'intent': 'code', 'filename': 'docs/library/descriptors.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'data drift prerequisites', 'summary_answer': 'Before using the DataDriftPreset, users need to know how to prepare data and create reports, which are briefly mentioned in the article as prerequisites.', 'difficulty': 'beginner', 'intent': 'text', 'filename': 'metrics/preset_data_drift.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'text data evaluation descriptors', 'summary_answer': 'Descriptors are a universal interface for evaluating text data, allowing you to compute scores or labels for each row in your dataset.', 'difficulty': 'beginner', 'intent': 'text', 'filename': 'docs/library/descriptors.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'list of actions tracked in evidently telemetry', 'summary_answer': 'The article lists various actions that are tracked, such as startup, project dashboard views, and report listings to understand feature usage.', 'difficulty': 'intermediate', 'intent': 'text', 'filename': 'faq/telemetry.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'detecting PII in text', 'summary_answer': 'Lists the PIILLMEval() function which detects Personally Identifiable Information in text, useful for compliance and privacy.', 'difficulty': 'intermediate', 'intent': 'code', 'filename': 'metrics/all_descriptors.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'Python code for text descriptors', 'summary_answer': 'The article provides Python code snippets to import necessary modules and create descriptors for evaluating text datasets.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'docs/library/descriptors.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'classification quality preset example', 'summary_answer': 'The article includes code examples showing how to implement the ClassificationPreset for evaluating performance metrics on datasets, illustrating its usage with sample code snippets.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'metrics/preset_classification.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'ColumnCount metric', 'summary_answer': 'ColumnCount() counts the number of columns in a dataset, providing important data quality information.', 'difficulty': 'intermediate', 'intent': 'code', 'filename': 'metrics/all_metrics.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'create custom data drift method', 'summary_answer': 'Instructions on how to implement your own custom drift detection method using the StatTest class are provided in the article.', 'difficulty': 'advanced', 'intent': 'code', 'filename': 'metrics/customize_data_drift.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'Evidently Cloud setup for LLM testing', 'summary_answer': 'Instructions are provided for setting up Evidently Cloud and running evaluations using Python and API keys.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'examples/LLM_regression_testing.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

error processing {'question': 'using custom templates with LLM', 'summary_answer': 'The article explains how to create and apply custom prompt templates to evaluate specific criteria for text data using LLM.', 'difficulty': 'intermediate', 'intent': 'code', 'filename': 'metrics/customize_llm_judge.mdx'}


Traceback (most recent call last):
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 487, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2585, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/workspaces/ai-bootcamp/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
    raise self._make_status_error_from_response(err.response) from None
openai.RateLimitError: Error code: 429 - {'error': {'message': 'Ra

### Processing Results for Analysis

When we run it on many queries, we can spot some problems. For example, for some queries the agent is making too many search queries.

In [13]:
# create a helper function to simplify the message structure

import json

def simplify_messages(messages):
    """Flatten agent messages into a list of small dicts, one per retained part.

    Args:
        messages: Iterable of message objects. Each must expose ``.parts``, and
            every part must expose ``.part_kind``.

    Returns:
        A flat list of dicts. Every dict has a ``'kind'`` key. ``'user-prompt'``
        and ``'text'`` parts also carry ``'content'``; ``'tool-call'`` parts
        carry ``'tool_name'`` and ``'args'`` (a decoded object). Parts of kind
        ``'tool-return'`` and tool calls named ``'final_result'`` are dropped.
        Any other kind is kept as ``{'kind': kind}`` only.
    """
    messages_simplified = []

    for m in messages:
        for original_part in m.parts:
            kind = original_part.part_kind

            # tool outputs are not kept in the simplified log
            if kind == 'tool-return':
                continue

            part = {'kind': kind}

            if kind in ('user-prompt', 'text'):
                part['content'] = original_part.content
            elif kind == 'tool-call':
                # the structured final answer is not a real tool invocation
                if original_part.tool_name == 'final_result':
                    continue

                part['tool_name'] = original_part.tool_name

                args = original_part.args
                if isinstance(args, str):
                    # JSON-encoded arguments; a blank string means "no arguments"
                    args = json.loads(args) if args.strip() else {}
                elif args is None:
                    args = {}
                # anything else (e.g. an already-decoded dict) is used as-is;
                # json.loads would raise TypeError on it
                part['args'] = args

            messages_simplified.append(part)

    return messages_simplified

In [14]:
# Now let's count the number of tool calls to understand agent behavior

def count_tool_calls(messages):
    """Return how many entries in ``messages`` have ``'kind'`` equal to ``'tool-call'``."""
    return sum(1 for entry in messages if entry['kind'] == 'tool-call')

In [15]:
# process all the records

def process_result(q, result):
    """Build one log row (a dict) from a ground-truth record and the agent's run result.

    The row holds the question text, the formatted answer, the simplified
    message list, the number of tool calls in that list, and the untouched
    ``q`` and ``result`` objects for later inspection.
    """
    question = q['question']
    answer = result.output.format_article()
    simplified = simplify_messages(result.new_messages())

    return {
        'question': question,
        'answer': answer,
        'messages': simplified,
        'num_tool_calls': count_tool_calls(simplified),
        'original_question': q,
        'original_result': result,
    }


# Build one log row per successful agent run.
rows = []

for q, result in all_results:
    # run_agent returns (None, None) for questions that raised; skip those.
    if result is None:
        continue

    row = process_result(q, result)
    rows.append(row)

In [16]:
# One DataFrame row per processed result, for easier inspection.
df_logs = pd.DataFrame(rows)

### Identifying Performance Issues

During our analysis, we discovered a problem: when the agent can't find something, it keeps searching and searching.

We need to stop it and have it say explicitly: "I can't find the information you're asking for". To address this, we'll instruct the agent to limit itself to 6 search queries. If it still can't find anything, it should simply say so.

### Final Evaluation Run

In [26]:
# Re-run the evaluation one request at a time (max_concurrency=1) —
# presumably to avoid the 429 rate-limit errors seen in the first run.
all_results = await map_progress(ground_truth_sample, run_agent, max_concurrency=1)

  0%|          | 0/50 [00:00<?, ?it/s]

In [27]:
# Rebuild the log rows from the final run, one row per successful agent run.
rows = []

for q, result in all_results:
    # run_agent returns (None, None) for questions that raised; skip those.
    if result is None:
        continue

    row = process_result(q, result)
    rows.append(row)


In [28]:
# Persist the processed rows as a pickle file. Each row also holds the original
# agent result object (row['original_result']), so loading this file presumably
# requires the same classes to be importable — confirm before sharing it.
with open('sample_eval_rows.bin', 'wb') as f_out:
    pickle.dump(rows, f_out)