In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys

# Put the parent directory first on the module search path.
# NOTE(review): presumably this is what lets the project modules imported in
# later cells (e.g. `docs`, `search_agent`, `evals`) resolve — confirm against
# the repository layout.
sys.path.insert(0, '..')

In [4]:
import json
from minsearch import Index
import docs

from tqdm.auto import tqdm

In [5]:
import pandas as pd

# Read the ground-truth CSV and turn it into a list of dicts, one per row
# (`orient='records'`), so each entry can be looked up by column name.
df_ground_truth = pd.read_csv('../evals/ground_truth_evidently.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [7]:
# Spot-check one record to see which fields a ground-truth entry carries.
ground_truth[10]

{'question': 'Evidently mapping roles and types',
 'summary_answer': 'The article discusses how to accurately specify roles and types for each column in your dataset to avoid errors in evaluations.',
 'difficulty': 'advanced',
 'intent': 'text',
 'filename': 'docs/library/data_definition.mdx'}

In [6]:
# Read the raw documentation via the project's `docs` module and parse it.
# `parsed_data` is the input later handed to `docs.chunk_documents`.
# NOTE(review): where the data is read from and the structure of the parsed
# records are not visible here — confirm in `docs`.
github_data = docs.read_github_data()
parsed_data = docs.parse_data(github_data)


In [8]:
import tiktoken

# Tokenizer matching the "gpt-4o-mini" model; `calculate_num_tokens` uses it
# to measure how many tokens a set of search results occupies.
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

def calculate_num_tokens(search_results):
    """Return the token count of ``search_results`` once serialized to JSON.

    The results are dumped with ``json.dumps`` and the resulting string is
    encoded with the module-level ``encoding`` tokenizer; the number of
    produced tokens is returned.
    """
    serialized = json.dumps(search_results)
    return len(encoding.encode(serialized))

In [9]:
def hit_rate(relevance_total):
    """Fraction of queries for which at least one result is relevant.

    Args:
        relevance_total: One sequence of relevance flags per query, in
            ranked order (``True`` where the result is the expected document).

    Returns:
        A float in ``[0, 1]``. Returns ``0.0`` when ``relevance_total`` is
        empty instead of raising ``ZeroDivisionError``.
    """
    if not relevance_total:
        # No queries evaluated: define the rate as 0 rather than dividing by 0.
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))
    return hits / len(relevance_total)


def mrr(relevance_total):
    """Mean reciprocal rank over all queries.

    For each query, the score is ``1 / rank`` of the first relevant result
    (ranks start at 1) or ``0`` when no result is relevant; the scores are
    averaged over all queries.

    Args:
        relevance_total: One sequence of relevance flags per query, in
            ranked order.

    Returns:
        A float in ``[0, 1]``. Returns ``0.0`` when ``relevance_total`` is
        empty instead of raising ``ZeroDivisionError``.
    """
    if not relevance_total:
        # No queries evaluated: define the score as 0 rather than dividing by 0.
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for position, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / position
                # Only the first relevant result contributes for a query.
                break

    return total_score / len(relevance_total)

In [11]:
def evaluate(
        ground_truth,
        search_function,
        question_column='question',
        id_column='filename'
):
    """Score a search function against a set of ground-truth questions.

    For every ground-truth record, the question text is passed to
    ``search_function``. A returned result counts as relevant when its
    ``id_column`` value equals the record's ``id_column`` value.

    Args:
        ground_truth: Iterable of dict-like records containing at least
            ``question_column`` and ``id_column``.
        search_function: Callable taking the question and returning a
            sequence of dict-like results, each containing ``id_column``.
            The results are passed to ``calculate_num_tokens``, which
            serializes them with ``json.dumps``.
        question_column: Key holding the question text in a record.
        id_column: Key identifying the expected document, both in the
            ground-truth records and in the search results.

    Returns:
        A dict with ``hit_rate``, ``mrr`` and ``num_tokens`` (the average
        token count of the results per question). All three are ``0.0`` when
        ``ground_truth`` yields no records, instead of raising
        ``ZeroDivisionError``.
    """
    relevance_total = []
    tokens = []

    for q in ground_truth:
        expected_id = q[id_column]
        results = search_function(q[question_column])
        tokens.append(calculate_num_tokens(results))
        relevance_total.append([d[id_column] == expected_id for d in results])

    if not relevance_total:
        # Nothing was evaluated; avoid dividing by zero below.
        return {'hit_rate': 0.0, 'mrr': 0.0, 'num_tokens': 0.0}

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
        'num_tokens': sum(tokens) / len(tokens)
    }

In [None]:
# Example single configuration (chunk size, chunk step, number of results).
# NOTE(review): `evaluate_params` takes parameters with the same names, which
# shadow these module-level values, and the grid-search loop in a later cell
# rebinds them — confirm whether they are still needed.
size = 2000
step = 1000
top_k = 5

def evaluate_params(size, step, top_k):
    """Evaluate retrieval for one (size, step, top_k) configuration.

    Chunks ``parsed_data`` with the given ``size`` and ``step``, fits a
    fresh text index on the chunks, and runs ``evaluate`` over the
    module-level ``ground_truth`` with a search that returns ``top_k``
    results per query.
    """
    chunked_docs = docs.chunk_documents(parsed_data, size=size, step=step)

    search_index = Index(
        text_fields=["content", "filename", "title", "description"],
    )
    search_index.fit(chunked_docs)

    def run_query(query: str):
        # Closure over the freshly built index and the requested top_k.
        return search_index.search(query=query, num_results=top_k)

    return evaluate(ground_truth, run_query)

In [13]:
# Grid of chunking / retrieval settings to compare.
sizes = [1000, 2000, 3000, 5000]
steps = [1000, 2000, 3000]
top_ks = [5, 10, 15]

results = []

for size in sizes:
    for step in steps:
        # Only evaluate combinations whose step does not exceed the chunk size.
        if step <= size:
            for top_k in top_ks:
                result = evaluate_params(size, step, top_k)
                record = dict(
                    size=size,
                    step=step,
                    top_k=top_k,
                    hit_rate=result['hit_rate'],
                    num_tokens=result['num_tokens'],
                )
                print(record)
                results.append(record)
            

{'size': 1000, 'step': 1000, 'top_k': 5, 'hit_rate': 0.4279918864097363, 'num_tokens': 1351.870182555781}
{'size': 1000, 'step': 1000, 'top_k': 10, 'hit_rate': 0.5294117647058824, 'num_tokens': 2747.5943204868154}
{'size': 1000, 'step': 1000, 'top_k': 15, 'hit_rate': 0.6105476673427992, 'num_tokens': 4089.4624746450304}
{'size': 2000, 'step': 1000, 'top_k': 5, 'hit_rate': 0.43610547667342797, 'num_tokens': 2488.4543610547667}
{'size': 2000, 'step': 1000, 'top_k': 10, 'hit_rate': 0.537525354969574, 'num_tokens': 5075.137931034483}
{'size': 2000, 'step': 1000, 'top_k': 15, 'hit_rate': 0.6308316430020284, 'num_tokens': 7576.789046653144}
{'size': 2000, 'step': 2000, 'top_k': 5, 'hit_rate': 0.4787018255578093, 'num_tokens': 2361.9269776876267}
{'size': 2000, 'step': 2000, 'top_k': 10, 'hit_rate': 0.6064908722109533, 'num_tokens': 4684.156186612576}
{'size': 2000, 'step': 2000, 'top_k': 15, 'hit_rate': 0.6754563894523327, 'num_tokens': 7019.182555780933}
{'size': 3000, 'step': 1000, 'top_k'

In [15]:
import pandas as pd

# Exponents of the score: alpha weights the hit rate, beta weights the
# token count (expressed in thousands).
alpha = 2
beta = 0.5

# Build the results table, attach the score, and rank best-first.
df = (
    pd.DataFrame(results, columns=['size', 'step', 'top_k', 'hit_rate', 'num_tokens'])
    .assign(
        score=lambda frame: (frame['hit_rate'] ** alpha) / ((frame['num_tokens'] / 1000) ** beta)
    )
    .sort_values(by='score', ascending=False)
)

In [16]:
# `df` is sorted by score in descending order, so this shows the five
# best-scoring configurations.
df.head()

Unnamed: 0,size,step,top_k,hit_rate,num_tokens,score
2,1000,1000,15,0.610548,4089.462475,0.184334
8,2000,2000,15,0.675456,7019.182556,0.172207
7,2000,2000,10,0.606491,4684.156187,0.169955
1,1000,1000,10,0.529412,2747.59432,0.169087
17,3000,3000,15,0.703854,9399.275862,0.161591


In [14]:
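# The score below represents "retrieval quality adjusted for the cost of processing tokens":
# hit rate is rewarded with exponent alpha, and the average token count (in thousands) is penalized with exponent beta.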
"""

            hit_rate ** alpha
score = ------------------------
        (num_tokens/1000) ** beta
"""

'\n\n            hit_rate ** alpha\nscore = ------------------------\n        (num_tokens/1000) ** beta\n'

In [17]:
import search_agent

In [19]:
# Agent configuration using the top-scoring setting from the grid search
# recorded above (size=1000, step=1000, top_k=15).
# NOTE(review): assumes `chunk_size` / `chunk_step` correspond to the `size` /
# `step` arguments of `docs.chunk_documents` — confirm in `search_agent`.
config = search_agent.AgentConfig(
    chunk_size=1000,
    chunk_step=1000,
    top_k=15,
)

In [21]:
# Build the agent from the configuration; the bare `agent` expression echoes
# its repr as the cell output.
agent = search_agent.create_agent(config)
agent

Agent(model=OpenAIChatModel(), name='search', end_strategy='early', model_settings=None, output_type=<class 'search_agent.SearchResultArticle'>, instrument=None)

In [22]:
from evals.eval_orchestrator import run_full_evaluation

In [23]:
# Run the agent over the 25-question sample, followed by the judge evaluation.
# NOTE(review): in the recorded run, three questions failed with HTTP 429
# (tokens-per-minute rate limit) at max concurrency 10, so the metrics cover
# only 22 of the 25 questions. Consider lower concurrency or retries — confirm
# what `run_full_evaluation` exposes for this.
await run_full_evaluation(agent, csv_path='../evals/ground_truth_sample_25_2025-10-30-21-03.csv')


Start time: 2025-10-30 22:05:35
Configuration:
  Ground truth: ../evals/ground_truth_sample_25_2025-10-30-21-03.csv
  Agent model: gpt-4o-mini
  Judge model: gpt-5-nano
  Max concurrency: 10

Loaded 25 ground truth questions
Running agent evaluation...


  0%|          | 0/25 [00:00<?, ?it/s]

forcing output
forcing output
forcing output
forcing output
Error processing {'question': 'custom text evaluator in Python', 'summary_answer': 'The article outlines how to create a custom text evaluator using the Evidently framework by implementing functions that evaluate data in specified columns.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'metrics/customize_descriptor.mdx'}: status_code: 429, model_name: gpt-4o-mini, body: {'message': 'Rate limit reached for gpt-4o-mini in organization org-Dtqe0FuXWhvEjasqV8q84gMy on tokens per min (TPM): Limit 200000, Used 200000, Requested 265. Please try again in 79ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}


Traceback (most recent call last):
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 493, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2603, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
  

Error processing {'question': 'local workspace report saving', 'summary_answer': 'The article outlines how to save reports to both your local workspace and Evidently Cloud, including specific code examples for each.', 'difficulty': 'beginner', 'intent': 'code', 'filename': 'docs/library/output_formats.mdx'}: status_code: 429, model_name: gpt-4o-mini, body: {'message': 'Rate limit reached for gpt-4o-mini in organization org-Dtqe0FuXWhvEjasqV8q84gMy on tokens per min (TPM): Limit 200000, Used 194065, Requested 7851. Please try again in 574ms. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}


Traceback (most recent call last):
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 493, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2603, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
  

forcing output
Error processing {'question': 'examples of data validation metrics', 'summary_answer': 'The article provides comprehensive examples of validation metrics that ensure data meets quality and integrity standards.', 'difficulty': 'beginner', 'intent': 'text', 'filename': 'metrics/all_metrics.mdx'}: status_code: 429, model_name: gpt-4o-mini, body: {'message': 'Rate limit reached for gpt-4o-mini in organization org-Dtqe0FuXWhvEjasqV8q84gMy on tokens per min (TPM): Limit 200000, Used 198784, Requested 15764. Please try again in 4.364s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'tokens', 'param': None, 'code': 'rate_limit_exceeded'}


Traceback (most recent call last):
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/pydantic_ai/models/openai.py", line 493, in _completions_create
    return await self.client.chat.completions.create(
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/resources/chat/completions/completions.py", line 2603, in create
    return await self._post(
           ^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1794, in post
    return await self.request(cast_to, opts, stream=stream, stream_cls=stream_cls)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/yenchunchen/Desktop/Project/ai-bootcamp/week3/code/.venv/lib/python3.12/site-packages/openai/_base_client.py", line 1594, in request
  

Total cost: $0.0563
Results saved to reports/eval-run-2025-10-30-22-06.bin

✓ Agent evaluation completed
  Evaluated: 22 questions
  Results saved: reports/eval-run-2025-10-30-22-06.bin
  Agent Run Costs:
    Input tokens cost:  $  0.0479
    Output tokens cost: $  0.0083
    Total cost:         $  0.0563

Loading evaluation results from reports/eval-run-2025-10-30-22-06.bin...
Loaded 22 evaluation results
Loading reference documents...
Creating judge agent...
Running judge evaluation...


  0%|          | 0/22 [00:00<?, ?it/s]

Total cost: $0.0276
Judge results saved to: reports/eval-judge-2025-10-30-22-06.bin

✓ Judge evaluation completed
  Evaluated: 22 results
  Judge results saved: reports/eval-judge-2025-10-30-22-06.bin
  Judge Evaluation Costs:
    Input tokens cost:  $  0.0050
    Output tokens cost: $  0.0226
    Total cost:         $  0.0276


Execution Time:
  Duration: 102.2 seconds

Dataset:
  Questions evaluated: 22

Evaluation Metrics:
  ✗ CheckName.instructions_follow  50.0%
  ✓ CheckName.instructions_avoid 100.0%
  ✓ CheckName.answer_relevant 100.0%
  ✓ CheckName.answer_clear    100.0%
  ✓ CheckName.answer_match     95.0%
  ✓ CheckName.answer_citations 100.0%
  ✓ CheckName.completeness     85.0%
  ⚠ CheckName.tool_call_search  63.6%

Overall Score: 86.7%

Agent Run Costs:
  Input tokens cost:  $  0.0479
  Output tokens cost: $  0.0083
  Total cost:         $  0.0563
Judge Evaluation Costs:
  Input tokens cost:  $  0.0050
  Output tokens cost: $  0.0226
  Total cost:         $  0.0276
TOTAL Cos

{'run_results_path': 'reports/eval-run-2025-10-30-22-06.bin',
 'judge_results_path': 'reports/eval-judge-2025-10-30-22-06.bin',
 'run_cost': CostInfo(input_cost=0.0479394, output_cost=0.0083412, total_cost=0.0562806),
 'judge_cost': CostInfo(input_cost=0.004963950000000001, output_cost=0.0225952, total_cost=0.02755915),
 'total_cost': CostInfo(input_cost=0.05290335, output_cost=0.0309364, total_cost=0.08383975),
 'df_run':                                        question  \
 0                                  what is NDCG   
 1         different types of Tests in Evidently   
 2       Evidently packages installation for LLM   
 3         multi-class classification evaluation   
 4             add pie charts to dashboard panel   
 5                    evidently library features   
 6      evaluate text emotions using huggingface   
 7                        output formats options   
 8      predicted vs actual values visualization   
 9                      using OpenAI API for LLM   
 1