In [None]:
# Mock Setup - Hidden in rendered documentation
import sys
import tempfile
from pathlib import Path

# Add karenina to path.
# The location below is a developer-specific source checkout, so it is only
# added when it actually exists; otherwise the `karenina` import further down
# resolves through the normal module search path.
_KARENINA_SRC = Path("/Users/carli/Projects/karenina-monorepo/karenina/src")
if _KARENINA_SRC.is_dir():
    sys.path.insert(0, str(_KARENINA_SRC))

# Temporary directory for file operations
TEMP_DIR = Path(tempfile.mkdtemp(prefix="karenina_docs_"))

# Import karenina
from karenina import Benchmark

# Create a sample benchmark for demonstration
benchmark = Benchmark.create(
    name="Access and Filtering Demo",
    description="Sample benchmark for demonstrating data access patterns",
    version="1.0.0",
    creator="Documentation",
)

# Define sample questions with various metadata
questions_data = [
    {
        "question": "How many chromosomes are in the human genome?",
        "raw_answer": "46",
        "answer_template": """class Answer(BaseAnswer):
    count: int = Field(description="The number of chromosomes")
    def model_post_init(self, __context):
        self.correct = {"count": 46}
    def verify(self) -> bool:
        return self.count == 46""",
        "finished": True,
        "author": {"name": "Dr. Smith", "email": "smith@example.com"},
        "custom_metadata": {"category": "biology", "difficulty": "easy", "tags": ["genetics", "basics"]},
    },
    {
        "question": "What is the target of the drug venetoclax?",
        "raw_answer": "BCL2",
        "answer_template": '''class Answer(BaseAnswer):
    target: str = Field(description="Target protein")
    def model_post_init(self, __context):
        self.correct = {"target": "BCL2"}
    def verify(self) -> bool:
        return self.target.upper() == "BCL2"''',
        "finished": True,
        "author": {"name": "Dr. Jones", "email": "jones@example.com"},
        "custom_metadata": {"category": "pharmacology", "difficulty": "medium", "tags": ["cancer", "target"]},
    },
    {
        "question": "How many subunits does hemoglobin have?",
        "raw_answer": "4",
        "answer_template": """class Answer(BaseAnswer):
    subunits: int = Field(description="Number of subunits")
    def model_post_init(self, __context):
        self.correct = {"subunits": 4}
    def verify(self) -> bool:
        return self.subunits == 4""",
        "finished": True,
        "author": {"name": "Dr. Smith", "email": "smith@example.com"},
        "custom_metadata": {"category": "biology", "difficulty": "easy", "tags": ["proteins"]},
    },
    {
        "question": "What is machine learning?",
        "raw_answer": "A subset of artificial intelligence",
        "finished": False,
        "custom_metadata": {"category": "computer science", "difficulty": "medium", "tags": ["AI", "algorithms"]},
    },
    {
        "question": "Explain quantum mechanics in simple terms",
        "raw_answer": "The study of matter and energy at the smallest scales",
        "finished": False,
        "custom_metadata": {"category": "physics", "difficulty": "hard", "tags": ["quantum"]},
    },
    {
        "question": "Describe DNA replication and RNA synthesis processes",
        "raw_answer": "DNA replication copies genetic material; RNA synthesis transcribes it",
        "answer_template": """class Answer(BaseAnswer):
    processes: list = Field(description="List of processes described")
    def verify(self) -> bool:
        return "replication" in str(self.processes).lower() and "synthesis" in str(self.processes).lower()""",
        "finished": True,
        "author": {"name": "Dr. Smith", "email": "smith@example.com"},
        "custom_metadata": {"category": "biology", "difficulty": "hard", "tags": ["molecular biology"]},
    },
    {
        "question": "What is Python used for in data science?",
        "raw_answer": "Data analysis, visualization, and machine learning",
        "finished": False,
        "custom_metadata": {"category": "computer science", "difficulty": "medium", "tags": ["Python", "data science"]},
    },
    {
        "question": "Solve the calculus equation for the derivative",
        "raw_answer": "The derivative is computed using chain rule",
        "finished": False,
        "custom_metadata": {"category": "mathematics", "difficulty": "hard", "tags": ["calculus", "derivatives"]},
    },
]

# Add questions to benchmark
# Register every sample question; optional keys fall back to the same
# defaults as before when a record omits them.
for entry in questions_data:
    add_kwargs = {
        "question": entry["question"],
        "raw_answer": entry["raw_answer"],
        "answer_template": entry.get("answer_template"),
        "finished": entry.get("finished", False),
        "author": entry.get("author"),
        "custom_metadata": entry.get("custom_metadata", {}),
    }
    benchmark.add_question(**add_kwargs)


def temp_path(filename: str) -> Path:
    """Return the path of ``filename`` inside the module-level ``TEMP_DIR``.

    Only the path is built; nothing is created on disk.
    """
    return TEMP_DIR.joinpath(filename)


# Cleanup
import atexit
import shutil


def _cleanup() -> None:
    """Delete ``TEMP_DIR`` and its contents, ignoring any removal errors."""
    shutil.rmtree(path=TEMP_DIR, ignore_errors=True)


atexit.register(_cleanup)

print("✓ Mock setup complete")
print(f"✓ Temp directory: {TEMP_DIR}")
print(f"✓ Created benchmark with {len(benchmark)} questions")
print("✓ Sample data ready for demonstration")

# Accessing and Filtering Questions

This guide covers how to access, filter, and search through questions in your benchmark for analysis and management.

**Quick Navigation:**

- [Accessing Questions](#accessing-questions) - Basic access patterns and iteration
- [Filtering by Status](#filtering-by-status) - Finished vs unfinished, template status
- [Searching Questions by Content](#searching-questions-by-content) - Text search, regex, advanced search
- [Filtering by Metadata](#filtering-by-metadata) - Category, difficulty, multi-criteria filtering
- [Sorting Questions](#sorting-questions) - Sort by metadata, content length
- [Advanced Query Patterns](#advanced-query-patterns) - Complex filtering and statistics
- [Bulk Operations](#bulk-operations-on-filtered-questions) - Update metadata, generate templates

---

## Understanding Question Metadata

Each question in a Karenina benchmark has two types of metadata:

### System Metadata (Built-in Fields)

These are standard fields managed by Karenina:

- `id` - Unique question identifier
- `question` - The question text
- `raw_answer` - The expected answer
- `finished` - Boolean flag for template completion status
- `answer_template` - The Answer class code for verification
- `date_created` - Creation timestamp
- `date_modified` - Last modification timestamp
- `author` - Author information (optional dict)
- `sources` - Source documents (optional list)
- `question_rubric` - Question-specific rubric traits

In [None]:
# Access system metadata directly from question dictionary
question_ids = benchmark.get_question_ids()
question = benchmark.get_question(question_ids[0])
print(f"Question ID: {question['id']}")
print(f"Question text: {question['question']}")
print(f"Finished status: {question.get('finished', False)}")
print(f"Author: {question.get('author', {})}")
print(f"Date created: {question.get('date_created', 'N/A')}")

### Custom Metadata (User-defined Fields)

The `custom_metadata` field is a **dictionary** where you can store any arbitrary key-value pairs specific to your use case.

In [None]:
# Access custom metadata
question = benchmark.get_question(question_ids[0])
custom = question.get("custom_metadata", {})
print(f"Category: {custom.get('category')}")
print(f"Difficulty: {custom.get('difficulty')}")
print(f"Tags: {custom.get('tags', [])}")

# The custom_metadata structure is completely flexible
# You can add any fields you need for your use case

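**Important:** The keyword arguments of `filter_questions` (`finished`, `has_template`, `has_rubric`, `author`) work with system metadata. For custom metadata, use `filter_by_custom_metadata`, `filter_by_metadata`, or the `custom_filter` lambda of `filter_questions`, all described below.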

---

## Built-in Methods Overview

The Benchmark class provides several built-in methods for accessing and filtering questions:

### Access Methods
- `get_all_questions(ids_only)` - Get all questions (objects by default, IDs if `ids_only=True`)
- `get_question(question_id)` - Get a specific question by ID
- `get_question_ids()` - Get list of all question IDs

### System Metadata Filtering
- `filter_questions(finished, has_template, has_rubric, author, custom_filter)` - Filter by system fields or custom lambda
- `get_unfinished_questions(ids_only)` - Get unfinished questions
- `get_finished_questions(ids_only)` - Get finished questions
- `get_questions_by_author(author)` - Filter by author name
- `get_questions_with_rubric()` - Get questions with rubrics

### Custom Metadata Filtering
- `filter_by_custom_metadata(**criteria)` - Filter by custom fields with AND/OR logic
- `filter_by_metadata(field_path, value, match_mode)` - Generic field filtering with dot notation
- `count_by_field(field_path)` - Count questions by any field value

### Search Methods
- `search_questions(query, match_all, fields, case_sensitive, regex)` - Unified search supporting single/multi-term, regex, case-sensitive

### Template Methods
- `has_template(question_id)` - Check if question has a template
- `get_missing_templates(ids_only)` - Get questions without templates

---

## Accessing Questions

### Basic Access Patterns

In [None]:
# Get all questions as dictionaries
all_questions = benchmark.get_all_questions()
print(f"Total questions: {len(all_questions)}")

# Get question count (using len)
question_count = len(benchmark)
print(f"Question count via len(): {question_count}")

# Get list of question IDs
question_ids = benchmark.get_question_ids()
print(f"Question IDs: {question_ids}")

# Get a specific question by ID
question = benchmark.get_question(question_ids[0])
print(f"\nFirst question: {question['question']}")

In [None]:
# Iterate through questions
print("All questions in the benchmark:")
for question in benchmark.get_all_questions():
    status = "✓" if question.get("finished", False) else "○"
    print(f"  {status} {question['id'][:30]}...: {question['question'][:50]}...")

### Square Bracket Access

Karenina supports convenient square bracket notation for accessing questions:

In [None]:
# Access by index - returns SchemaOrgQuestion object
question_obj = benchmark[0]
print(f"First question via index: {question_obj.text}")

# For dictionary access, use get_question() instead
question_dict = benchmark.get_question(benchmark.get_question_ids()[0])
print(f"\nVia get_question(): {question_dict['question']}")

# Slice access - returns list of SchemaOrgQuestion objects
first_three = benchmark[0:3]
print(f"\nFirst 3 questions: {[q.id[:30] for q in first_three]}")

---

## Filtering by Status

### Finished vs Unfinished Questions

Questions are considered "finished" when they have both a template and verification results:

> **Note:** When adding questions through the backend API, questions are marked as "finished" by default. The frontend GUI behaves differently and marks questions as "unfinished" until templates are generated. This distinction is important when programmatically creating benchmarks versus using the web interface.

In [None]:
# Get unfinished questions (returns list of question objects by default)
unfinished_questions = benchmark.get_unfinished_questions()
print(f"Unfinished questions: {len(unfinished_questions)}")

# Iterate directly over the question objects
print("\nUnfinished questions:")
for question in unfinished_questions:
    print(f"  ○ {question['id'][:30]}...: {question['question'][:50]}...")
    print(f"    Answer: {question.get('raw_answer', 'N/A')[:50]}...")

In [None]:
# Get only question IDs if needed
unfinished_ids = benchmark.get_unfinished_questions(ids_only=True)
print(f"Unfinished question IDs: {unfinished_ids}")

# Get finished questions
finished_questions = benchmark.get_finished_questions()
print(f"\nFinished questions: {len(finished_questions)}")

In [None]:
# Check status for all questions
print("Status check for all questions:")
for question in benchmark.get_all_questions():
    is_finished = question.get("finished", False)
    has_template = benchmark.has_template(question["id"])
    status = "finished" if is_finished and has_template else "unfinished"
    print(f"  {question['id'][:30]}...: {status}")

### Template Status Filtering

Use the built-in `filter_questions` method for template-based filtering:

In [None]:
# Questions with generated templates
templated = benchmark.filter_questions(has_template=True)
print(f"Questions with templates: {len(templated)}")

# Questions needing templates
needs_templates = benchmark.filter_questions(has_template=False)
print(f"Questions needing templates: {len(needs_templates)}")

### Combined Status Filtering

The `filter_questions` method supports multiple criteria:

In [None]:
# Filter by finished status only
finished = benchmark.filter_questions(finished=True)
print(f"Finished questions: {len(finished)}")

# Filter by multiple criteria (finished, has template)
ready = benchmark.filter_questions(finished=True, has_template=True)
print(f"Finished with templates: {len(ready)}")

# Get all finished questions without templates
needs_work = benchmark.filter_questions(finished=True, has_template=False)
print(f"Finished but needs templates: {len(needs_work)}")

---

## Searching Questions by Content

The `search_questions()` method provides flexible text search with support for single/multi-term queries, regex, and case-sensitive matching.

### Simple Text Search

In [None]:
# Search in question text (default)
ml_questions = benchmark.search_questions("machine learning")
print(f"Questions matching 'machine learning': {len(ml_questions)}")
for q in ml_questions:
    print(f"  - {q['question']}")

### Multi-term Search

In [None]:
# AND logic: question must contain all terms
quantum_mechanics = benchmark.search_questions(["quantum", "mechanics"], match_all=True)
print(f"Questions with 'quantum' AND 'mechanics': {len(quantum_mechanics)}")
for q in quantum_mechanics:
    print(f"  - {q['question']}")

In [None]:
# OR logic: question contains any term
stem_terms = benchmark.search_questions(["DNA", "RNA", "protein"], match_all=False)
print(f"\nQuestions with DNA, RNA, OR protein: {len(stem_terms)}")
for q in stem_terms:
    print(f"  - {q['id'][:30]}...: {q['question'][:50]}...")

### Search in Multiple Fields

In [None]:
# Search in both question and answer
algorithm_content = benchmark.search_questions("data", fields=["question", "raw_answer"])
print(f"Questions with 'data' in question or answer: {len(algorithm_content)}")
for q in algorithm_content:
    print(f"  - {q['id'][:30]}...: {q['question'][:50]}...")

### Advanced Search Options

In [None]:
# Case-sensitive search
python_qs = benchmark.search_questions("Python", case_sensitive=True)
print(f"Questions with capital 'Python': {len(python_qs)}")
for q in python_qs:
    print(f"  - {q['question']}")

In [None]:
# Regex search
explanation_qs = benchmark.search_questions(r"\b(explain|describe|what is)\b", regex=True)
print(f"Questions asking for explanation: {len(explanation_qs)}")
for q in explanation_qs:
    print(f"  - {q['question'][:50]}...")

---

## Filtering by Metadata

### Filtering by System Metadata

Filter by built-in Karenina fields using `filter_questions()`:

In [None]:
# Filter by finished status
finished = benchmark.filter_questions(finished=True)
unfinished = benchmark.filter_questions(finished=False)
print(f"Finished: {len(finished)}, Unfinished: {len(unfinished)}")

In [None]:
# Filter by author (matches the author name given when the question was added)
smith_questions = benchmark.filter_questions(author="Dr. Smith")
print(f"\nQuestions by Dr. Smith: {len(smith_questions)}")
for q in smith_questions:
    print(f"  - {q['question'][:50]}...")

# Or use the convenience method
smith_questions_alt = benchmark.get_questions_by_author("Dr. Smith")
print(f"(via convenience method: {len(smith_questions_alt)} questions)")

### Filtering by Custom Metadata

Use built-in methods to filter by your custom metadata fields:

In [None]:
# Filter by multiple custom metadata fields (AND logic)
bio_easy = benchmark.filter_by_custom_metadata(category="biology", difficulty="easy")
print(f"Biology + Easy questions: {len(bio_easy)}")
for q in bio_easy:
    print(f"  - {q['id'][:30]}...: {q['question'][:50]}...")

In [None]:
# OR logic for custom metadata (match any criterion).
# Each keyword names a custom_metadata key, so match_all=False combines
# *different* fields; a key that no question defines contributes no matches.
math_or_hard = benchmark.filter_by_custom_metadata(match_all=False, category="mathematics", difficulty="hard")
print(f"Math OR Hard questions: {len(math_or_hard)}")

# A keyword cannot be repeated, so OR over several values of the *same* field
# needs a custom filter instead.
stem_subjects = benchmark.filter_questions(
    custom_filter=lambda q: q.get("custom_metadata", {}).get("category") in {"mathematics", "physics"}
)
print(f"Math OR Physics questions: {len(stem_subjects)}")
for q in stem_subjects:
    print(f"  - {q['id'][:30]}...: {q['question'][:50]}...")

In [None]:
# Using generic field path filtering with dot notation
bio_qs = benchmark.filter_by_metadata("custom_metadata.category", "biology")
print(f"Biology questions: {len(bio_qs)}")

# Filter by value in a list (for tags/arrays)
genetics_tagged = benchmark.filter_by_metadata("custom_metadata.tags", "genetics", match_mode="in")
print(f"\nQuestions with 'genetics' tag: {len(genetics_tagged)}")
for q in genetics_tagged:
    print(f"  - {q['id'][:30]}...: {q['question'][:50]}...")

In [None]:
# Substring matching
bio_qs = benchmark.filter_by_metadata("custom_metadata.category", "bio", match_mode="contains")
print(f"Category containing 'bio': {len(bio_qs)}")

# Regex matching on custom fields
hard_qs = benchmark.filter_by_metadata("custom_metadata.difficulty", r"(hard|advanced)", match_mode="regex")
print(f"Hard/Advanced questions: {len(hard_qs)}")
for q in hard_qs:
    print(f"  - {q['id'][:30]}...: {q['question'][:50]}...")

### Complex Custom Filtering with Lambda

For complex logic, use the `custom_filter` parameter:

In [None]:
# Complex logic on custom metadata
bio_hard = benchmark.filter_questions(
    custom_filter=lambda q: (
        q.get("custom_metadata", {}).get("category") == "biology"
        and q.get("custom_metadata", {}).get("difficulty") == "hard"
    )
)
print(f"Biology + Hard questions: {len(bio_hard)}")
for q in bio_hard:
    print(f"  - {q['id'][:30]}...: {q['question'][:50]}...")

In [None]:
# Combine system and custom metadata filtering:
# `finished` is a system field, `difficulty` lives in custom_metadata
easy_finished = benchmark.filter_questions(
    finished=True, custom_filter=lambda q: q.get("custom_metadata", {}).get("difficulty") == "easy"
)
print(f"\nFinished + Easy questions: {len(easy_finished)}")
for q in easy_finished:
    print(f"  - {q['id'][:30]}...: {q['question'][:50]}...")

### Statistics with Custom Metadata

Use `count_by_field()` for statistics on any field:

In [None]:
# Count by custom metadata field
category_counts = benchmark.count_by_field("custom_metadata.category")
print("Category distribution:")
for category, count in category_counts.items():
    print(f"  {category}: {count}")

In [None]:
# Count finished vs unfinished
status_counts = benchmark.count_by_field("finished")
print(f"\nStatus distribution: {status_counts}")

# Count difficulty distribution
difficulty_counts = benchmark.count_by_field("custom_metadata.difficulty")
print(f"\nDifficulty distribution: {difficulty_counts}")

In [None]:
# Count on filtered subset
bio_qs = benchmark.filter_by_custom_metadata(category="biology")
bio_difficulty_counts = benchmark.count_by_field("custom_metadata.difficulty", questions=bio_qs)
print("Biology questions by difficulty:")
for difficulty, count in bio_difficulty_counts.items():
    print(f"  {difficulty}: {count}")

---

## Sorting Questions

You can sort questions using Python's `sorted()` function with custom key functions:

In [None]:
# Get all questions first
questions = benchmark.get_all_questions()

# Sort by custom metadata with custom order
difficulty_order = {"easy": 1, "medium": 2, "hard": 3}
sorted_by_difficulty = sorted(
    questions, key=lambda q: difficulty_order.get(q.get("custom_metadata", {}).get("difficulty", "medium"), 2)
)
print("Questions sorted by difficulty:")
for q in sorted_by_difficulty:
    difficulty = q.get("custom_metadata", {}).get("difficulty", "unknown")
    print(f"  [{difficulty}] {q['question'][:40]}...")

In [None]:
# Sort by category alphabetically
sorted_by_category = sorted(questions, key=lambda q: q.get("custom_metadata", {}).get("category", ""))
print("\nQuestions sorted by category:")
for q in sorted_by_category:
    category = q.get("custom_metadata", {}).get("category", "unknown")
    print(f"  [{category}] {q['id'][:30]}...")

In [None]:
# Sort by question length
sorted_by_length = sorted(questions, key=lambda q: len(q.get("question", "")))
print("\nQuestions sorted by length (shortest first):")
for q in sorted_by_length:
    print(f"  [{len(q['question'])} chars] {q['question'][:40]}...")

---

## Advanced Query Patterns

### Combining Filters and Search

In [None]:
# First filter, then search within results
bio_questions = benchmark.filter_by_custom_metadata(category="biology")
# Compare each tag exactly (case-insensitively); a substring test on the
# stringified tag list would also match unrelated tags such as "epigenetics".
bio_with_genetics = [
    q
    for q in bio_questions
    if any(tag.lower() == "genetics" for tag in q.get("custom_metadata", {}).get("tags", []))
]
print(f"Biology questions with genetics tag: {len(bio_with_genetics)}")
for q in bio_with_genetics:
    print(f"  - {q['question']}")

In [None]:
# Or use lambda for the same thing.
# Tags are compared one by one (case-insensitively) so that only an exact
# "genetics" tag matches, not any tag that merely contains that text.
bio_genetics_lambda = benchmark.filter_questions(
    custom_filter=lambda q: (
        q.get("custom_metadata", {}).get("category") == "biology"
        and any(tag.lower() == "genetics" for tag in q.get("custom_metadata", {}).get("tags", []))
    )
)
print(f"\nVia lambda: {len(bio_genetics_lambda)} questions")

### Question Statistics

In [None]:
# Get distribution of any field
category_dist = benchmark.count_by_field("custom_metadata.category")
print("Category distribution:")
for cat, count in sorted(category_dist.items()):
    print(f"  {cat}: {count}")

difficulty_dist = benchmark.count_by_field("custom_metadata.difficulty")
print("\nDifficulty distribution:")
for diff, count in sorted(difficulty_dist.items()):
    print(f"  {diff}: {count}")

---

## Bulk Operations on Filtered Questions

### Update System Metadata

In [None]:
# Mark all finished questions as unfinished
# Use ids_only=True since mark_unfinished_batch expects IDs
finished_ids = benchmark.get_finished_questions(ids_only=True)
print(f"Finished question IDs: {finished_ids}")

# In practice, you would call:
# benchmark.mark_unfinished_batch(finished_ids)
print("\n(Note: mark_unfinished_batch would be called here in practice)")

In [None]:
# Update author for specific questions
bio_qs = benchmark.filter_by_custom_metadata(category="biology")
print(f"Biology questions to update author for: {len(bio_qs)}")

# In practice, you would iterate and update:
# for q in bio_qs:
#     benchmark.set_question_author(q["id"], {"name": "Bio Team", "email": "bio@example.com"})
print("(Author update would be performed here in practice)")

### Update Custom Metadata

In [None]:
# Add tags to all biology questions
bio_qs = benchmark.filter_by_custom_metadata(category="biology")
print(f"Biology questions to tag: {len(bio_qs)}")

for question in bio_qs:
    question_id = question["id"]
    # Get current custom metadata
    custom_meta = benchmark.get_question_metadata(question_id).get("custom_metadata", {})
    print(f"  {question_id[:30]}...: current tags = {custom_meta.get('tags', [])}")

    # In practice, you would add the tag:
    # if "tags" not in custom_meta:
    #     custom_meta["tags"] = []
    # if "reviewed" not in custom_meta["tags"]:
    #     custom_meta["tags"].append("reviewed")
    # benchmark.update_question_metadata(question_id, custom_metadata=custom_meta)

print("\n(Tag update would be performed here in practice)")

In [None]:
# Or use the convenience method for single properties
for q in bio_qs:
    # In practice:
    # benchmark.set_question_custom_property(q["id"], "reviewed", True)
    pass

print("Convenience method example: set_question_custom_property(question_id, 'reviewed', True)")

### Generate Templates for Filtered Questions

In [None]:
# Generate templates only for unfinished questions
# Use ids_only=True since generate_templates expects a list of IDs
unfinished_ids = benchmark.get_unfinished_questions(ids_only=True)
print(f"Unfinished question IDs needing templates: {unfinished_ids}")

# In practice, use the bulk generation method:
# results = benchmark.generate_templates(
#     question_ids=unfinished_ids,
#     model="gemini-2.0-flash",
#     model_provider="google_genai",
#     temperature=0
# )

# Check results
# successful = sum(1 for r in results.values() if r["success"])
# print(f"Generated {successful}/{len(unfinished_ids)} templates")

print("\n(Template generation would be performed here in practice)")
print("Requires LLM API credentials to execute.")

## Next Steps

Once you can effectively access and filter questions:

- [Set up templates](templates.md) for evaluation structure
- [Configure rubrics](rubrics.md) for assessment criteria
- [Run verification](verification.md) to evaluate responses