In [None]:
# Put the parent directory first on the import path so that `import docs`
# below can resolve (presumably the local `docs` module lives there — confirm).
import sys

sys.path.insert(0, '..')

import docs

In [7]:
import docs

# Read the raw documentation data and parse it into `documents`; the cells
# below treat each entry as a dict-like record with keys such as 'title'
# and 'content'.
raw_documents = docs.read_github_data()
documents = docs.parse_data(raw_documents)
# Display how many documents were parsed.
len(documents)


95

In [8]:
# Choose the documents to generate questions for and budget one question per
# full 1000 characters of (stripped) content.
selected_documents = []
num_questions_total = 0

# NOTE(review): the first five documents are skipped; the reason is not
# visible in this notebook.
for doc in documents[5:]:
    if 'title' not in doc:
        continue

    title = doc['title']
    title_lowered = title.lower()
    # Drop pages whose title marks them as unpublished, legacy or leftovers.
    if any(marker in title_lowered for marker in ('unpublished', 'legacy', 'leftovers')):
        continue

    content = doc.get('content', '').strip()
    if len(content) <= 1000:
        # Too short to yield even one question.
        continue

    num_questions = len(content) // 1000
    num_questions_total += num_questions

    print(title)
    print(len(content), num_questions)
    print('------------')

    selected_documents.append(doc)

print(num_questions_total)


Data definition
11046 11
------------
Descriptors
12588 12
------------
Overview
3231 3
------------
Metric generators
2949 2
------------
Output formats
1584 1
------------
Introduction
22069 22
------------
Report
4989 4
------------
Add tags and metadata
2340 2
------------
Tests
9154 9
------------
Alerts
1282 1
------------
Add dashboard panels (API)
13004 13
------------
Add dashboard panels (UI)
4258 4
------------
Overview
2735 2
------------
Overview
2216 2
------------
Work with datasets
2114 2
------------
Run evals via API
2162 2
------------
Explore view
1899 1
------------
No code evals
4377 4
------------
Overview
2138 2
------------
Batch monitoring
2384 2
------------
Overview
3768 3
------------
Introduction
2408 2
------------
Manage Projects
4614 4
------------
Overview
1392 1
------------
Overview
1507 1
------------
Set up tracing
10120 10
------------
Evidently Cloud
1218 1
------------
Self-hosting
5515 5
------------
Evidently and GitHub actions
1375 1
--------

In [10]:
import json

In [33]:
# System prompt for question generation. It describes the search-query style,
# the target difficulty/intent distribution, and the fields expected for each
# query. process_document passes it to llm_structured as the system message,
# so any wording change here changes what the model is asked to do.
instructions = """
You are given a technical article. Your task is to imagine what a person might type into a search engine 
before finding and reading this article.

Generate realistic, human-like search queries — not formal questions. 
They should sound like what people actually type into Google or Stack Overflow 
when trying to solve a problem, learn a concept, or find code examples.

Guidelines:
- Avoid full-sentence questions with punctuation like "What is..." or "How do I...".
- Use short, natural search phrases instead, such as:
  - "evidently data definition example"
  - "map target and prediction columns evidently"
  - "difference between timestamp and datetime evidently"
- Make queries varied and spontaneous, not repetitive or over-polished.
- Assume users of different knowledge levels:
  - beginner: broad or basic understanding
  - intermediate: knows basic terms but seeks clarification or examples
  - advanced: familiar with the tool, looking for details, edge cases, or integration options

Distribution rules:
- 60% of the queries should target beginner-level users
- 30% should target intermediate-level users
- 10% should target advanced-level users
- 75% of queries should have an intent of "code" (looking for examples or implementation)
- 25% should have an intent of "text" (looking for conceptual or theoretical explanations)

For each generated query, include:
- question: the natural, human-style search phrase
- summary_answer: a short 1–2 sentence summary of how the article addresses it
- difficulty: one of ["beginner", "intermediate", "advanced"]
- intent: one of ["text", "code"]

Also include a description summarizing what kind of article the questions are about.
""".strip()

In [41]:
from dotenv import load_dotenv
# Load variables from a .env file before the client is constructed
# (presumably this is where the OpenAI API key comes from — confirm).
load_dotenv()
from openai import OpenAI

# Module-level client shared by every llm_structured call.
openai_client = OpenAI()

def llm_structured(instructions, user_prompt, output_format, model="gpt-4o-mini"):
    """Ask the model for a response parsed into ``output_format``.

    Args:
        instructions: text sent as the system message.
        user_prompt: text sent as the user message.
        output_format: the type handed to the client as ``text_format``.
        model: model name forwarded to the client.

    Returns:
        A ``(parsed_output, usage)`` tuple taken from the response's
        ``output_parsed`` and ``usage`` attributes.
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}

    parsed_response = openai_client.responses.parse(
        model=model,
        input=[system_message, user_message],
        text_format=output_format,
    )

    return parsed_response.output_parsed, parsed_response.usage


In [42]:
from pydantic import BaseModel, Field
from typing import List, Literal

# Schema for a single generated search query.
# NOTE(review): this class is nested inside GeneratedQuestions, which is passed
# to the model as the structured-output format. The class docstring and the
# Field descriptions presumably end up in the schema the model sees, so treat
# them as prompt text and confirm before rewording them.
class Question(BaseModel):
    """
    Represents a realistic search-engine-style query a user might type before finding the article.
    Each question captures the likely search phrase, a short summary answer,
    the user's assumed skill level, and their intent (conceptual or code-focused).
    """
    # The search phrase itself.
    question: str = Field(
        ...,
        description="A natural, short search query — not a full-sentence question — phrased like something typed into Google."
    )
    # How the source article answers the query.
    summary_answer: str = Field(
        ...,
        description="A concise 1–2 sentence summary of how the article addresses the query."
    )
    # Restricted to the three levels named in the system prompt.
    difficulty: Literal["beginner", "intermediate", "advanced"] = Field(
        ...,
        description="The assumed knowledge level of the user making the query."
    )
    # Restricted to "text" (conceptual) or "code" (implementation).
    intent: Literal["text", "code"] = Field(
        ...,
        description="Specifies if the user's intent is to get a theoretical explanation ('text') or an implementation example ('code')."
    )


# Top-level structured output requested from the model for one document;
# process_document passes this class as `output_format` and reads `.questions`.
# NOTE(review): the docstring and Field descriptions presumably become part of
# the schema sent to the model — confirm before rewording them.
class GeneratedQuestions(BaseModel):
    """
    A structured collection of human-like search queries derived from a given article.
    Includes a brief description of the article topic and a list of generated queries.
    Difficulty distribution: 60% beginner, 30% intermediate, 10% advanced.
    Intent distribution: 75% code-focused, 25% concept-focused.
    """
    # Short summary of the article the queries were generated for.
    description: str = Field(
        ...,
        description="A summary of the article or topic these search-style questions were generated for."
    )
    # The generated queries, each validated against the Question schema.
    questions: List[Question] = Field(
        ...,
        description="A list of realistic search queries with short summaries, difficulty levels, and user intent."
    )



In [49]:
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor

def map_progress(pool, seq, f):
    """Apply ``f`` to every element of ``seq`` on ``pool`` with a tqdm progress bar.

    Args:
        pool: executor whose ``submit(fn, arg)`` returns a future
            (e.g. a ``ThreadPoolExecutor``).
        seq: the items to process. Any iterable is accepted, including
            generators and other iterables without ``len()``; it is consumed
            once, up front.
        f: one-argument callable applied to each item.

    Returns:
        A list of results in submission order (the order of ``seq``).

    Raises:
        Whatever ``f`` raises: the first failing task, in submission order,
        re-raises from ``future.result()``.
    """
    # Materialize first so the bar total is known even when seq has no len().
    items = list(seq)

    with tqdm(total=len(items)) as progress:
        futures = []

        for item in items:
            future = pool.submit(f, item)
            # Advance the bar as each task completes, whatever the order.
            future.add_done_callback(lambda _done: progress.update())
            futures.append(future)

        # Collect in submission order; result() blocks until that task is done.
        return [future.result() for future in futures]

In [50]:
import json

def process_document(doc):
    """Generate search-style questions for one document via the LLM.

    The number of questions requested is one per full 1000 characters of the
    document's stripped content (the same measure the selection cell uses),
    with a minimum of one so the model is never asked for zero questions.

    Args:
        doc: document record; must contain a 'content' string and be
            JSON-serializable, since the whole record is embedded in the prompt.

    Returns:
        A dict with the source 'doc', the generated 'questions' (the
        ``questions`` list of the parsed GeneratedQuestions response) and the
        'usage' object returned by llm_structured.

    Raises:
        KeyError: if ``doc`` has no 'content' key.
    """
    content = doc['content'].strip()
    num_questions = max(1, len(content) // 1000)
    # The prompt must name what to generate; "generate N for this document"
    # left the model to guess the noun.
    user_prompt = f"""generate {num_questions} questions for this document:
    <document>{json.dumps(doc)}</document>
    """
    response, usage = llm_structured(
        instructions=instructions,
        user_prompt=user_prompt,
        output_format=GeneratedQuestions
    )
    return {
        'doc': doc,
        'questions': response.questions,
        'usage': usage
    }

# Try the pipeline on the first selected document before running the batch.
doc = selected_documents[0]
result = process_document(doc)


In [52]:
# Process every selected document on a pool of 6 worker threads; all_results
# holds one process_document result per document, in selected_documents order.
with ThreadPoolExecutor(max_workers=6) as pool:
    all_results = map_progress(pool, selected_documents, process_document)


100%|██████████| 68/68 [01:11<00:00,  1.06s/it]


In [57]:
# Spot-check the first result: source doc, generated questions and token usage.
all_results[0]

{'doc': {'title': 'Data definition',
  'description': 'How to map the input data.',
  'content': 'To run evaluations, you must create a `Dataset` object with a `DataDefinition`, which maps:\n\n- **Column types** (e.g., categorical, numerical, text).\n- **Column roles** (e.g., id, prediction, target).\n\nThis allows Evidently to process the data correctly. Some evaluations need specific columns and will fail if they\'re missing. You can define the mapping using the Python API or by assigning columns visually when uploading data to the Evidently platform.\n\n## Basic flow\n\n**Step 1. Imports.** Import the following modules:\n\n```python\nfrom evidently import Dataset\nfrom evidently import DataDefinition\n```\n\n**Step 2. Prepare your data.** Use a pandas.DataFrame.\n\n<Info>\n  Your data can have [flexible structure](/docs/library/overview#dataset) with any mix of categorical, numerical or text columns. Check the [Reference table](/metrics/all_metrics) for data requirements in specific

In [None]:
# for doc in selected_documents:
#     content = doc['content']
#     num_questions = len(content) // 1000

#     user_prompt = f"""
#     generate {num_questions} questions for this document:

#     {json.dumps(doc)}    
#     """.strip()
    
#     output, usage = llm_structured(
#         instructions=instructions,
#         user_prompt=user_prompt,
#         output_format=GeneratedQuestions
#     )

#     break
    

In [53]:
# for q in output.questions:
#     print(q)

In [54]:
# # usage = output[1]
# usage

In [55]:
# from toyaikit.pricing import PricingConfig

# pricing = PricingConfig()
# pricing.calculate_cost('gpt-4o-mini', usage.input_tokens, usage.output_tokens)

In [58]:
from toyaikit.pricing import PricingConfig

# Add up token usage over every result and price the total as gpt-4o-mini.
pricing = PricingConfig()
usages = [res['usage'] for res in all_results]

total_input = sum(usage.input_tokens for usage in usages)
total_output = sum(usage.output_tokens for usage in usages)

pricing.calculate_cost('gpt-4o-mini', total_input, total_output)


CostInfo(input_cost=0.0261153, output_cost=0.0147876, total_cost=0.0409029)

In [61]:
# Look at the first result again to check the keys available on 'doc'.
all_results[0]

{'doc': {'title': 'Data definition',
  'description': 'How to map the input data.',
  'content': 'To run evaluations, you must create a `Dataset` object with a `DataDefinition`, which maps:\n\n- **Column types** (e.g., categorical, numerical, text).\n- **Column roles** (e.g., id, prediction, target).\n\nThis allows Evidently to process the data correctly. Some evaluations need specific columns and will fail if they\'re missing. You can define the mapping using the Python API or by assigning columns visually when uploading data to the Evidently platform.\n\n## Basic flow\n\n**Step 1. Imports.** Import the following modules:\n\n```python\nfrom evidently import Dataset\nfrom evidently import DataDefinition\n```\n\n**Step 2. Prepare your data.** Use a pandas.DataFrame.\n\n<Info>\n  Your data can have [flexible structure](/docs/library/overview#dataset) with any mix of categorical, numerical or text columns. Check the [Reference table](/metrics/all_metrics) for data requirements in specific

In [59]:
# Flatten the per-document results into one list of plain dicts, tagging each
# question with the filename of the document it was generated from.
all_questions = [
    {**question.model_dump(), 'filename': entry['doc']['filename']}
    for entry in all_results
    for question in entry['questions']
]


In [60]:
# Display the flattened question dicts (long output).
all_questions

[{'question': 'data definition mapping example',
  'summary_answer': 'The article provides code snippets for defining how to map input data using the `DataDefinition` class, including examples for text, numerical, and categorical columns.',
  'difficulty': 'beginner',
  'intent': 'code',
  'filename': 'docs/library/data_definition.mdx'},
 {'question': 'create Dataset object Evidently',
  'summary_answer': 'To create a `Dataset` object in Evidently, you can use the `Dataset.from_pandas()` method with a data definition to specify the roles and types of your data columns.',
  'difficulty': 'beginner',
  'intent': 'code',
  'filename': 'docs/library/data_definition.mdx'},
 {'question': 'Evidently Dataset column roles',
  'summary_answer': 'The article outlines various column roles in Evidently, including target and prediction columns, essential for accurate data evaluations.',
  'difficulty': 'beginner',
  'intent': 'text',
  'filename': 'docs/library/data_definition.mdx'},
 {'question': '

In [62]:
# Total number of generated questions across all documents.
len(all_questions)

493

In [63]:
import pandas as pd
# One row per question; columns are the Question fields plus 'filename'.
df_questions = pd.DataFrame(all_questions)

In [65]:
# Write the question table to CSV without the DataFrame index column.
df_questions.to_csv('ground_truth_evidently.csv', index=False)
