# Step 1: Load and Filter Documents

In [8]:
import sys
sys.path.insert(0, '..')


from wikiagent.tools import WikipediaTools

# Shared Wikipedia client; the filtering cell below reuses it to fetch pages.
wiki_tools = WikipediaTools()

# Seed topics; each one is passed to the Wikipedia search as-is.
test_topics = [
    "capybara",
    "Amazon rainforest",
    "climate change",
    "artificial intelligence",
    "nuclear energy"
]

raw_documents = []

print("Loading documents from Wikipedia...\n")
for topic in test_topics:
    # `or []` keeps the loop alive if a failed search hands back None instead
    # of an empty list.
    results = wiki_tools.search(topic) or []
    for result in results:
        title = result.get('title')
        if not title:
            # A hit without a title cannot be fetched later, so drop it here
            # rather than raising KeyError in the middle of the load.
            continue
        raw_documents.append({
            'title': title,
            'snippet': result.get('snippet', ''),
            'topic': topic
        })

print(f"Total raw documents loaded: {len(raw_documents)}\n")

if not raw_documents:
    # Every later step (filtering, question generation, evaluation) silently
    # produces empty results when nothing was loaded, so make the failure loud.
    # NOTE(review): an HTTP 403 from the Wikipedia API is commonly caused by a
    # missing descriptive User-Agent header — confirm what WikipediaTools sends.
    print("WARNING: no documents were loaded; check the search errors above "
          "before running the remaining cells.\n")


Loading documents from Wikipedia...

Error searching Wikipedia: 403 Client Error: Forbidden for url: https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=capybara
Error searching Wikipedia: 403 Client Error: Forbidden for url: https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=Amazon%2Brainforest
Error searching Wikipedia: 403 Client Error: Forbidden for url: https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=climate%2Bchange
Error searching Wikipedia: 403 Client Error: Forbidden for url: https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=artificial%2Bintelligence
Error searching Wikipedia: 403 Client Error: Forbidden for url: https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch=nuclear%2Benergy
Total raw documents loaded: 0



In [9]:
# Step 1 (continued): filter documents following your criteria
MIN_CONTENT_CHARS = 1000       # documents at or below this length are dropped
CHARS_PER_QUESTION = 1000      # one estimated question per this many characters
EXCLUDED_TITLE_TERMS = ('unpublished', 'legacy', 'leftovers')

selected_documents = []
num_questions_total = 0

print("Filtering documents...\n")

for doc in raw_documents:
    # Skip if there is no usable title
    title = doc.get('title')
    if not title:
        continue

    # Skip unpublished, legacy, leftovers (similar to your filtering)
    lowered_title = title.lower()
    if any(term in lowered_title for term in EXCLUDED_TITLE_TERMS):
        continue

    # Fetch full content from Wikipedia.
    # NOTE(review): `or ""` assumes get_page may return None/empty on a failed
    # request; without it len(None) would abort the whole loop — confirm.
    content = wiki_tools.get_page(title) or ""
    content_length = len(content)

    # Only keep substantial documents (over 1000 characters, like your criteria)
    if content_length <= MIN_CONTENT_CHARS:
        continue

    # Calculate approximate number of questions (1 per 1000 characters)
    num_questions = content_length // CHARS_PER_QUESTION

    print(title)
    print(f"  Content length: {content_length} characters")
    print(f"  Estimated questions: {num_questions}")
    print('  ' + '-' * 40)

    num_questions_total += num_questions

    selected_documents.append({
        'title': title,
        'content': content,
        'topic': doc['topic'],
        'content_length': content_length
    })

print(f"\nTotal documents after filtering: {len(selected_documents)}")
print(f"Total estimated questions: {num_questions_total}")

Filtering documents...


Total documents after filtering: 0
Total estimated questions: 0


# Step 2: Generate Synthetic Questions Using LLM

Now we'll use the LLM to generate realistic search-style questions for each filtered document.

In [None]:
from pydantic import BaseModel, Field
from typing import List, Literal
from openai import OpenAI
import json

# Initialize OpenAI client (module-level; llm_structured below uses this instance).
# NOTE(review): no credentials are passed here, so the SDK presumably reads the
# API key from the environment (e.g. OPENAI_API_KEY) — confirm before running.
openai_client = OpenAI()

# Define structured output models
class Question(BaseModel):
    """Represents a realistic search-engine-style query."""
    # NOTE: this model is passed as the structured-output format, so the field
    # descriptions below are presumably forwarded to the model as schema text —
    # keep them clean UTF-8 (the dashes were previously mojibake).

    # The search phrase itself.
    question: str = Field(
        ...,
        description="A natural, short search query — not a full-sentence question."
    )
    # Reference answer derived from the article; used later as ground truth.
    summary_answer: str = Field(
        ...,
        description="A concise 1–2 sentence summary of how the article addresses the query."
    )
    # Restricted to the three levels named in the generator instructions.
    difficulty: Literal["beginner", "intermediate", "advanced"] = Field(
        ...,
        description="The assumed knowledge level of the user."
    )
    # Restricted to the two intents named in the generator instructions.
    intent: Literal["text", "code"] = Field(
        ...,
        description="Whether the user wants a conceptual explanation ('text') or code example ('code')."
    )

class GeneratedQuestions(BaseModel):
    """Collection of human-like search queries derived from an article."""
    # Top-level structured-output container: one description plus the list of
    # generated Question entries. Both fields are required (no default given).

    description: str = Field(description="A summary of the article/topic these questions were generated for.")
    questions: List[Question] = Field(description="List of realistic search queries with metadata.")

# Instructions for question generation (sent as the system message).
# The dashes in this prompt were previously mojibake; they are restored to
# proper UTF-8 so the model receives clean text.
# NOTE(review): the prompt talks about a "technical article" and asks for 75%
# "code" intent, while the documents loaded above are Wikipedia articles —
# confirm this distribution is what you want for this dataset.
generator_instructions = """
You are given a technical article. Your task is to imagine what a person might type into a search engine 
before finding and reading this article.

Generate realistic, human-like search queries — not formal questions. 
They should sound like what people actually type into Google or Stack Overflow 
when trying to solve a problem, learn a concept, or find code examples.

Guidelines:
- Avoid full-sentence questions with punctuation like "What is..." or "How do I...".
- Use short, natural search phrases instead.
- Make queries varied and spontaneous, not repetitive or over-polished.
- Assume users of different knowledge levels.

Distribution rules:
- 60% of the queries should target beginner-level users
- 30% should target intermediate-level users
- 10% should target advanced-level users
- 75% of queries should have an intent of "code" (looking for examples or implementation)
- 25% should have an intent of "text" (looking for conceptual or theoretical explanations)

For each generated query, include:
- question: the natural, human-style search phrase
- summary_answer: a short 1–2 sentence summary of how the article addresses it
- difficulty: one of ["beginner", "intermediate", "advanced"]
- intent: one of ["text", "code"]

Also include a description summarizing what kind of article the questions are about.
"""

def llm_structured(instructions, user_prompt, output_format, model="gpt-4o-mini"):
    """Call OpenAI with structured output parsing.

    Args:
        instructions: Text sent as the system message.
        user_prompt: Text sent as the user message.
        output_format: Type passed as ``response_format`` to the parse helper
            (the notebook passes a pydantic model class).
        model: Model name forwarded to the API.

    Returns:
        The ``parsed`` attribute of the first choice's message.

    Raises:
        ValueError: If the first choice carries no parsed payload (for example
            when the model refused), so callers get a clear message instead of
            an AttributeError on ``None`` further down the pipeline.
    """
    messages = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": user_prompt}
    ]

    response = openai_client.beta.chat.completions.parse(
        model=model,
        messages=messages,
        response_format=output_format
    )

    message = response.choices[0].message
    parsed = message.parsed
    if parsed is None:
        # A refusal (or an unparseable reply) leaves `parsed` empty; surface
        # whatever explanation the API provided.
        refusal = getattr(message, "refusal", None)
        raise ValueError(
            f"Model returned no structured output (refusal: {refusal!r})"
        )

    return parsed

def process_document(doc, preview_chars=2000, chars_per_question=1000):
    """Generate questions for a single document.

    Args:
        doc: Mapping with 'title', 'topic' and 'content' keys.
        preview_chars: How many leading characters of the content are shown to
            the model.
        chars_per_question: One question is requested per this many characters
            of the full content (never fewer than one question).

    Returns:
        A dict with the original 'doc', the generated 'questions' and the
        model-written 'description'.

    Note:
        The question count is derived from the full content length while the
        model only sees the preview, so long articles request many questions
        from a short excerpt. Raise `preview_chars` or `chars_per_question`
        if that is not what you want.
    """
    content = doc['content']
    num_questions = max(1, len(content) // chars_per_question)  # At least 1 question

    # Only mark the preview as truncated when something was actually cut off.
    preview = content[:preview_chars]
    if len(content) > preview_chars:
        preview += "..."

    user_prompt = f"""Generate {num_questions} realistic search queries for this article:

Title: {doc['title']}
Topic: {doc['topic']}

Content preview:
{preview}
"""

    response = llm_structured(
        instructions=generator_instructions,
        user_prompt=user_prompt,
        output_format=GeneratedQuestions
    )

    return {
        'doc': doc,
        'questions': response.questions,
        'description': response.description
    }

# Process each selected document
total_documents = len(selected_documents)
print(f"Generating questions for {total_documents} documents...\n")

all_results = []
for i, doc in enumerate(selected_documents, 1):
    print(f"Processing document {i}/{total_documents}: {doc['title']}")
    try:
        result = process_document(doc)
    except Exception as e:
        # Best-effort: one failed document must not abort the whole batch.
        print(f"  ✗ Error: {e}\n")
    else:
        all_results.append(result)
        print(f"  ✓ Generated {len(result['questions'])} questions\n")

# Flatten all questions: one dict per question, carrying the title and topic
# of the document it was generated from.
all_questions = [
    {
        **generated.model_dump(),
        'title': res['doc']['title'],
        'topic': res['doc']['topic'],
    }
    for res in all_results
    for generated in res['questions']
]

print(f"Total questions generated: {len(all_questions)}")

# Step 3: Run Your Agent on Generated Questions

In [None]:
import asyncio
from wikiagent.wikipagent import SearchAndFetchAgent

# Initialize your agent
agent = SearchAndFetchAgent(top_k=3)

# Extract just the questions for the agent to answer
questions_to_ask = [q['question'] for q in all_questions]

print(f"Running agent on {len(questions_to_ask)} questions...\n")

responses = []

for i, question in enumerate(questions_to_ask, 1):
    print(f"[{i}/{len(questions_to_ask)}] {question}")
    try:
        response = await agent.answer(question)
        answer = response.get("summary", "No answer returned.")
        responses.append({
            "question": question,
            "answer": answer
        })
        print(f"  âœ“ Answer received\n")
    except Exception as e:
        print(f"  âœ— Error: {e}\n")
        responses.append({
            "question": question,
            "answer": f"Error: {str(e)}"
        })

print(f"Collected {len(responses)} responses")

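# Step 3 (continued): Re-run the Agent and Preview Its Answers

Run the agent on each generated question again, printing the first 200 characters of every answer. The collected responses are combined with the generated questions for manual evaluation in Step 4.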

In [None]:
import asyncio
from wikiagent.wikipagent import SearchAndFetchAgent

# Initialize the agent
agent = SearchAndFetchAgent(top_k=3)

# Store responses
responses = []

# Run the agent on each question
print("Running agent on all 10 questions...\n")
for i, question in enumerate(questions, 1):
    print(f"Question {i}: {question}")
    try:
        response = await agent.answer(question)
        answer = response.get("summary", "No answer returned.")
        responses.append({
            "question": question,
            "answer": answer
        })
        print(f"Agent's answer: {str(answer)[:200]}...\n")
    except Exception as e:
        print(f"Error: {e}\n")
        responses.append({
            "question": question,
            "answer": f"Error: {str(e)}"
        })

print(f"\nCollected {len(responses)} responses")

# Step 4: Create Ground Truth Data for Analysis

Let's create a structure for you to manually rate each response as:
- **Correct**: Does the answer accurately address the question based on the documents?
- **Complete**: Does the answer cover the key information from the documents?

In [None]:
import pandas as pd

# Column layout of the evaluation sheet; passed explicitly so the frame keeps
# these columns even when there are no rows.
evaluation_columns = [
    "question",
    "summary_answer",
    "difficulty",
    "intent",
    "title",
    "topic",
    "agent_answer",
    "correct",
    "complete",
    "notes",
]

# Pair every generated question with the response at the same position.
# zip() stops at the shorter list, so questions without a response are left out.
evaluation_rows = [
    {
        "question": q_data['question'],
        "summary_answer": q_data.get('summary_answer', ''),
        "difficulty": q_data.get('difficulty', ''),
        "intent": q_data.get('intent', ''),
        "title": q_data.get('title', ''),
        "topic": q_data.get('topic', ''),
        "agent_answer": str(agent_response['answer'])[:500],  # capped at 500 chars
        "correct": "",   # To be filled manually
        "complete": "",  # To be filled manually
        "notes": "",
    }
    for q_data, agent_response in zip(all_questions, responses)
]

df_evaluation = pd.DataFrame(evaluation_rows, columns=evaluation_columns)

separator = "=" * 100
print("\nGround Truth Evaluation Data:")
print(separator)
print(df_evaluation[["question", "difficulty", "intent", "title"]].head(10).to_string())
print(separator)
print(f"\nTotal rows: {len(df_evaluation)}")

In [None]:
# Save to CSV for use in Google Sheets, Excel, or LibreOffice Calc
csv_path = "ground_truth_wikipedia_agent.csv"
df_evaluation.to_csv(csv_path, index=False)

print(f"\n✅ Ground truth evaluation template saved to: {csv_path}")
print("\nNext steps:")
print("1. Open this CSV file in Google Sheets, Excel, or LibreOffice Calc")
print("2. For each response, manually rate:")
print("   - correct (Y/N): Is the answer factually accurate?")
print("   - complete (Y/N): Does it cover all key points?")
print("3. Add notes for any observations or issues")
print("4. Analyze the results to see how well your agent is performing")

# Also display summary statistics: per-category counts and their share of all
# rows, for the two categorical columns produced by the question generator.
total_rows = len(df_evaluation)
print("\n📊 Summary Statistics:")
print(f"   Total questions: {total_rows}")
for column, categories in (
    ("difficulty", ["beginner", "intermediate", "advanced"]),
    ("intent", ["text", "code"]),
):
    print(f"   By {column}:")
    for category in categories:
        count = int((df_evaluation[column] == category).sum())
        # Guard against division by zero when no questions were generated.
        pct = (count / total_rows * 100) if total_rows > 0 else 0
        print(f"     - {category}: {count} ({pct:.1f}%)")