# Email Agent Evaluation: Agent-Generated Questions

This notebook implements the Week 3 approach for generating and evaluating email agent responses.

## Process Overview
1. **Generate Questions**: Use an LLM to generate realistic test questions from agent documentation
2. **Run Evaluation**: Execute agent on each question and collect responses  
3. **Evaluate Responses**: Score responses using 8-point rubric
4. **Analyze Results**: Review metrics and identify improvement areas

## Section 1: Setup and Configuration

In [None]:
import json
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path

# Add email_agent to path: put the current working directory first on
# sys.path so the local package is importable from the notebook.
sys.path.insert(0, str(Path.cwd()))

from dotenv import load_dotenv
import pandas as pd

# Load environment settings via python-dotenv
load_dotenv()

print("✅ Setup complete")

## Section 2: Generate Synthetic Email Data

Create realistic test emails based on your use case. These represent the data your agent will work with.

In [None]:
# Synthetic email dataset used as the evaluation fixture.
# Based on typical business email scenarios: invoices, project updates,
# HR communications, vendor contracts, meeting follow-ups, compliance.

# One reference instant, so every relative age is measured from the same "now".
_reference_time = datetime.now()


def _make_email(sender, subject, body, age, attachments=()):
    """Build one synthetic email record.

    Args:
        sender: Address stored under 'from'.
        subject: Subject line.
        body: Plain-text body.
        age: timedelta subtracted from the reference time to produce 'date'.
        attachments: File names attached to the message (may be empty).

    Returns:
        dict with keys 'from', 'to', 'subject', 'body', 'date' (ISO-8601
        string), 'has_attachments' and 'attachments'.
    """
    files = list(attachments)
    return {
        'from': sender,
        'to': 'me@example.com',
        'subject': subject,
        'body': body,
        'date': (_reference_time - age).isoformat(),
        # Derived from the list so the flag can never disagree with it.
        'has_attachments': bool(files),
        'attachments': files,
    }


synthetic_emails = [
    _make_email(
        'accounting@luxcmar.com',
        'Invoice #INV-2025-001 for January Services',
        'Please find attached invoice for services rendered in January. Total amount due: $5,250. Payment terms: Net 30 days.',
        timedelta(days=5),
        ['invoice_january.pdf'],
    ),
    _make_email(
        'accounting@luxcmar.com',
        'Invoice #INV-2025-002 for February Services',
        'Please find attached invoice for February services. Total: $7,500. Please remit payment within 30 days.',
        timedelta(days=3),
        ['invoice_february.pdf'],
    ),
    _make_email(
        'john.smith@company.com',
        'RE: Project Alpha Status Update',
        'The project is on schedule. We completed 60% of Phase 1. Team will present findings next week.',
        timedelta(days=2),
    ),
    _make_email(
        'john.smith@company.com',
        'Project Alpha - Technical Specifications',
        'Attached are the technical specs for Project Alpha. Please review and provide feedback by Friday.',
        timedelta(days=1),
        ['project_alpha_specs.pdf', 'requirements.docx'],
    ),
    _make_email(
        'hr@company.com',
        'Benefits Renewal - Action Required',
        'Your benefits renewal period is open. Please review available plans and enroll by March 31st.',
        timedelta(days=7),
        ['benefits_guide_2025.pdf'],
    ),
    _make_email(
        'hr@company.com',
        'RE: Benefits Renewal - Action Required',
        'Hi, just a reminder that enrollment closes on March 31st. Let me know if you have questions.',
        timedelta(days=5),
    ),
    _make_email(
        'sales@vendor.com',
        'Q1 2025 Contract - Review and Sign',
        'Please review and sign the attached Q1 contract. Total value: $25,000 for software licenses.',
        timedelta(days=10),
        ['contract_q1_2025.pdf'],
    ),
    _make_email(
        'alice.johnson@project.com',
        'Meeting Notes - Strategic Planning',
        "Please find meeting notes from today's strategic planning session. Action items assigned to team.",
        timedelta(days=1),
        ['meeting_notes_2025-01-24.docx'],
    ),
    _make_email(
        'alice.johnson@project.com',
        'RE: Meeting Notes - Strategic Planning',
        'Follow-up: Can you please provide your input on Section 3 by tomorrow?',
        timedelta(hours=2),
    ),
    _make_email(
        'compliance@company.com',
        'Annual Compliance Training - Complete by March 15',
        'All employees must complete annual compliance training. Click link to access course.',
        timedelta(days=15),
    ),
]

print(f"✅ Generated {len(synthetic_emails)} synthetic emails")
print("\nSample emails:")
for i, email in enumerate(synthetic_emails[:3], 1):
    print(f"  {i}. From: {email['from']} | Subject: {email['subject']}")

## Section 3: Create Test Questions Based on Synthetic Data

These questions are designed to test specific agent capabilities with the synthetic emails.

In [None]:
# Test questions written against the synthetic email fixture.
# Each entry records the question, a coarse category ('type'), the
# human-readable expectation used during manual review, and the agent
# capabilities it is meant to exercise ('tests').

test_questions = [
    {
        'id': 'q1',
        'question': 'Show me all invoices from accounting@luxcmar.com',
        'type': 'sender_search',
        'expected_result': 'Should find 2 invoices (INV-2025-001 and INV-2025-002)',
        'tests': ['conversation_history', 'sender_filtering']
    },
    {
        'id': 'q2',
        'question': 'What is the total amount of invoices?',
        'type': 'data_extraction',
        'expected_result': 'Should identify $5,250 + $7,500 = $12,750 total',
        'tests': ['attachment_parsing', 'content_understanding']
    },
    {
        'id': 'q3',
        'question': 'What is the deadline for the HR benefits enrollment?',
        'type': 'deadline_extraction',
        'expected_result': 'Should find March 31st deadline',
        'tests': ['search', 'content_understanding']
    },
    {
        'id': 'q4',
        'question': 'Show all emails from john.smith@company.com',
        'type': 'sender_search',
        'expected_result': 'Should find 2 emails about Project Alpha',
        'tests': ['conversation_history', 'threading']
    },
    {
        'id': 'q5',
        'question': 'What files did john.smith@company.com attach?',
        'type': 'attachment_list',
        'expected_result': 'Should list project_alpha_specs.pdf and requirements.docx',
        'tests': ['attachment_handling', 'threading']
    },
    {
        'id': 'q6',
        'question': 'Find all emails with attachments',
        'type': 'filter_by_attachment',
        'expected_result': 'Should find 6 emails with attachments',
        'tests': ['search', 'attachment_filtering']
    },
    {
        'id': 'q7',
        'question': 'What is the contract value with the vendor?',
        'type': 'data_extraction',
        'expected_result': 'Should find $25,000 contract value',
        'tests': ['attachment_parsing', 'search']
    },
    {
        'id': 'q8',
        'question': 'Show me the conversation thread with alice.johnson@project.com',
        'type': 'thread_conversation',
        'expected_result': 'Should show both emails in chronological order (original + follow-up)',
        'tests': ['threading', 'conversation_history']
    },
    {
        'id': 'q9',
        'question': 'What are the action items from the strategic planning meeting?',
        'type': 'action_extraction',
        'expected_result': 'Should parse meeting notes and identify action items',
        'tests': ['attachment_parsing', 'content_understanding']
    },
    {
        'id': 'q10',
        'question': 'Which emails arrived in the last 3 days?',
        'type': 'date_filter',
        # Fixture ages under 3 days: john.smith (2d, 1d) and alice.johnson
        # (1d, 2h). The February invoice is dated exactly 3 days back.
        'expected_result': 'Should find 4 emails: john.smith (2) and alice.johnson (2); invoice INV-2025-002 sits exactly on the 3-day boundary',
        'tests': ['search', 'date_filtering']
    },
    {
        'id': 'q11',
        'question': 'Group my emails by sender',
        'type': 'grouping',
        # The fixture has six distinct 'from' addresses: accounting, john.smith,
        # hr, sales, alice.johnson and compliance.
        'expected_result': 'Should group by unique senders (6 groups)',
        'tests': ['search', 'data_organization']
    },
    {
        'id': 'q12',
        'question': 'What is the project completion status mentioned by john.smith@company.com?',
        'type': 'content_extraction',
        'expected_result': 'Should find "60% of Phase 1 completed"',
        'tests': ['threading', 'content_understanding']
    }
]

print(f"✅ Created {len(test_questions)} test questions\n")
print("Test questions:")
for q in test_questions:
    print(f"  {q['id']}: {q['question']}")

## Section 4: Initialize Email Agent

In [None]:
from email_agent import EmailAgent, authenticate_gmail, ElasticsearchEmailStore

print("🔐 Initializing agent...\n")

# Best-effort setup: a failure is reported with troubleshooting hints rather
# than a traceback. NOTE: `agent` is left undefined when this happens, so the
# cells that use it will record an error for every question.
try:
    # Initialize Gmail
    gmail_service = authenticate_gmail()
    print("✅ Gmail authenticated")

    # Initialize Elasticsearch; host and port come from the environment,
    # falling back to a local instance on the default port.
    es_host = os.getenv("ES_HOST", "localhost")
    es_port = int(os.getenv("ES_PORT", "9200"))
    es_store = ElasticsearchEmailStore(host=es_host, port=es_port)
    print(f"✅ Elasticsearch connected ({es_host}:{es_port})")

    # Create agent
    agent = EmailAgent(
        gmail_service=gmail_service,
        es_store=es_store,
        model="gpt-4o-mini"
    )
    print("✅ Agent initialized\n")
    print("Agent is ready to answer questions!")

except Exception as e:
    print(f"⚠️  Agent initialization failed: {e}")
    print("\nMake sure:")
    print("  1. Elasticsearch is running (docker run -d --name elasticsearch ...)")
    print("  2. Gmail credentials are configured (.env file)")
    print("  3. OPENAI_API_KEY is set")

## Section 5: Ask Questions and Evaluate Responses

**Key Question: Do the results make sense?**

Interact with the agent and record your findings for each question.

In [None]:
# Run every test question through the agent, echo a preview of each answer,
# and collect one record per question in `evaluation_results`.
evaluation_results = []

print("🚀 Starting Evaluation\n")
print("=" * 80)

for i, test_q in enumerate(test_questions, 1):
    print(f"\n[Q{i}/{len(test_questions)}] {test_q['question']}")
    print(f"Type: {test_q['type']}")
    print(f"Expected: {test_q['expected_result']}")
    print("-" * 80)

    # Fields shared by the success and the failure record.
    record = {
        'q_id': test_q['id'],
        'question': test_q['question'],
        'expected': test_q['expected_result'],
    }

    try:
        # Get agent response
        response = agent.chat(test_q['question'])

        # Display only the first 500 characters to keep the output readable;
        # the full response is still stored in the record.
        print("\n🤖 Agent Response:")
        print(response[:500])
        if len(response) > 500:
            print("\n[... response truncated ...]")

        record.update(response=response, status='completed')

    except Exception as e:
        # One failing question must not abort the run; store the error text
        # in place of the response.
        print(f"\n❌ Error: {e}")
        record.update(response=f"ERROR: {e}", status='error')

    evaluation_results.append(record)

print("\n" + "=" * 80)
print(f"\n✅ Evaluation complete! Tested {len(test_questions)} questions")

## Section 6: Document Your Findings

Record which questions worked, which didn't, and anything else worth noting.

In [None]:
# Tabulate the collected records. Naming the columns explicitly keeps the
# frame (and the summary below) usable even when no results were recorded.
df_eval = pd.DataFrame(
    evaluation_results,
    columns=['q_id', 'question', 'expected', 'response', 'status'],
)

print("\n📊 EVALUATION SUMMARY\n")
print(f"Total Questions: {len(df_eval)}")
print(f"Completed: {int((df_eval['status'] == 'completed').sum())}")
print(f"Errors: {int((df_eval['status'] == 'error').sum())}")

# Show results table
print("\n" + "=" * 80)
print("QUESTIONS ASKED:")
print("=" * 80)

# Number the questions by position rather than by index label.
for q_num, (_, row) in enumerate(df_eval.iterrows(), 1):
    succeeded = row['status'] == 'completed'
    status_icon = "✅" if succeeded else "❌"
    print(f"\n{status_icon} [{q_num}] {row['question']}")
    print(f"   Expected: {row['expected']}")
    if succeeded:
        answer = row['response']
        # Long answers are cut to 150 characters and marked with an ellipsis.
        shown = f"{answer[:150]}..." if len(answer) > 150 else answer
        print(f"   Got: {shown}")
    else:
        print(f"   Error: {row['response']}")

## Section 7: Manual Evaluation - Do the Results Make Sense?

For each question, manually assess if the agent's response is correct and makes sense.

In [None]:
# Manual evaluation summary - fill this in after reviewing all responses.
# This cell only prints the review checklist and an example of how to
# record findings; it computes nothing.

print("\n📝 EVALUATION FINDINGS SUMMARY")
print("=" * 80)
print("\nReview each question's response and answer:")
print("  1. Does the response make sense?")
print("  2. Does it match the expected result?")
print("  3. Any issues or missing features?\n")

print("Example findings template:")
print("""
q1: Sender search from LuxC Mar
    Makes sense? ✅ YES
    Matches expected? ✅ YES - Found both invoices
    Issues? None

q2: Total invoice amount
    Makes sense? ⚠️  PARTIAL - Found invoices but didn't calculate total
    Matches expected? ❌ NO - Expected sum but got list
    Issues: Agent should perform calculations on extracted amounts
""")

print("\n" + "=" * 80)
print("YOUR FINDINGS (fill in after reviewing):")
print("=" * 80)

## Section 8: Save Evaluation Results

In [None]:
# Save evaluation results. All three files share one timestamp so the
# outputs of a single run can be matched up later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save as CSV
csv_file = f"evaluation_results_{timestamp}.csv"
df_eval.to_csv(csv_file, index=False)
print(f"✅ Saved results to {csv_file}")

# Save as JSON for detailed analysis. UTF-8 with ensure_ascii=False keeps
# non-ASCII text in agent responses readable instead of \u-escaped.
json_file = f"evaluation_results_{timestamp}.json"
with open(json_file, 'w', encoding='utf-8') as f:
    json.dump(evaluation_results, f, indent=2, ensure_ascii=False)
print(f"✅ Saved detailed results to {json_file}")

# Save test questions reference
questions_file = f"test_questions_{timestamp}.json"
with open(questions_file, 'w', encoding='utf-8') as f:
    json.dump(test_questions, f, indent=2, ensure_ascii=False)
print(f"✅ Saved test questions to {questions_file}")

## Section 1: Setup and Configuration

In [None]:
import json
import os
import sys
from datetime import datetime, timedelta
from pathlib import Path

# Add email_agent to path: put the current working directory first on
# sys.path so the local package is importable from the notebook.
sys.path.insert(0, str(Path.cwd()))

from dotenv import load_dotenv
import pandas as pd

# Load environment settings via python-dotenv
load_dotenv()

print("✅ Setup complete")

## Section 2: Generate Synthetic Email Data

Create realistic test emails based on your use case. These represent the data your agent will work with.

In [None]:
# Synthetic email dataset used as the evaluation fixture.
# Based on typical business email scenarios: invoices, project updates,
# HR communications, vendor contracts, meeting follow-ups, compliance.

# One reference instant, so every relative age is measured from the same "now".
_reference_time = datetime.now()


def _make_email(sender, subject, body, age, attachments=()):
    """Build one synthetic email record.

    Args:
        sender: Address stored under 'from'.
        subject: Subject line.
        body: Plain-text body.
        age: timedelta subtracted from the reference time to produce 'date'.
        attachments: File names attached to the message (may be empty).

    Returns:
        dict with keys 'from', 'to', 'subject', 'body', 'date' (ISO-8601
        string), 'has_attachments' and 'attachments'.
    """
    files = list(attachments)
    return {
        'from': sender,
        'to': 'me@example.com',
        'subject': subject,
        'body': body,
        'date': (_reference_time - age).isoformat(),
        # Derived from the list so the flag can never disagree with it.
        'has_attachments': bool(files),
        'attachments': files,
    }


synthetic_emails = [
    _make_email(
        'accounting@luxcmar.com',
        'Invoice #INV-2025-001 for January Services',
        'Please find attached invoice for services rendered in January. Total amount due: $5,250. Payment terms: Net 30 days.',
        timedelta(days=5),
        ['invoice_january.pdf'],
    ),
    _make_email(
        'accounting@luxcmar.com',
        'Invoice #INV-2025-002 for February Services',
        'Please find attached invoice for February services. Total: $7,500. Please remit payment within 30 days.',
        timedelta(days=3),
        ['invoice_february.pdf'],
    ),
    _make_email(
        'john.smith@company.com',
        'RE: Project Alpha Status Update',
        'The project is on schedule. We completed 60% of Phase 1. Team will present findings next week.',
        timedelta(days=2),
    ),
    _make_email(
        'john.smith@company.com',
        'Project Alpha - Technical Specifications',
        'Attached are the technical specs for Project Alpha. Please review and provide feedback by Friday.',
        timedelta(days=1),
        ['project_alpha_specs.pdf', 'requirements.docx'],
    ),
    _make_email(
        'hr@company.com',
        'Benefits Renewal - Action Required',
        'Your benefits renewal period is open. Please review available plans and enroll by March 31st.',
        timedelta(days=7),
        ['benefits_guide_2025.pdf'],
    ),
    _make_email(
        'hr@company.com',
        'RE: Benefits Renewal - Action Required',
        'Hi, just a reminder that enrollment closes on March 31st. Let me know if you have questions.',
        timedelta(days=5),
    ),
    _make_email(
        'sales@vendor.com',
        'Q1 2025 Contract - Review and Sign',
        'Please review and sign the attached Q1 contract. Total value: $25,000 for software licenses.',
        timedelta(days=10),
        ['contract_q1_2025.pdf'],
    ),
    _make_email(
        'alice.johnson@project.com',
        'Meeting Notes - Strategic Planning',
        "Please find meeting notes from today's strategic planning session. Action items assigned to team.",
        timedelta(days=1),
        ['meeting_notes_2025-01-24.docx'],
    ),
    _make_email(
        'alice.johnson@project.com',
        'RE: Meeting Notes - Strategic Planning',
        'Follow-up: Can you please provide your input on Section 3 by tomorrow?',
        timedelta(hours=2),
    ),
    _make_email(
        'compliance@company.com',
        'Annual Compliance Training - Complete by March 15',
        'All employees must complete annual compliance training. Click link to access course.',
        timedelta(days=15),
    ),
]

print(f"✅ Generated {len(synthetic_emails)} synthetic emails")
print("\nSample emails:")
for i, email in enumerate(synthetic_emails[:3], 1):
    print(f"  {i}. From: {email['from']} | Subject: {email['subject']}")

## Section 3: Create Test Questions Based on Synthetic Data

These questions are designed to test specific agent capabilities with the synthetic emails.

In [None]:
# Test questions written against the synthetic email fixture.
# Each entry records the question, a coarse category ('type'), the
# human-readable expectation used during manual review, and the agent
# capabilities it is meant to exercise ('tests').

test_questions = [
    {
        'id': 'q1',
        'question': 'Show me all invoices from accounting@luxcmar.com',
        'type': 'sender_search',
        'expected_result': 'Should find 2 invoices (INV-2025-001 and INV-2025-002)',
        'tests': ['conversation_history', 'sender_filtering']
    },
    {
        'id': 'q2',
        'question': 'What is the total amount of invoices?',
        'type': 'data_extraction',
        'expected_result': 'Should identify $5,250 + $7,500 = $12,750 total',
        'tests': ['attachment_parsing', 'content_understanding']
    },
    {
        'id': 'q3',
        'question': 'What is the deadline for the HR benefits enrollment?',
        'type': 'deadline_extraction',
        'expected_result': 'Should find March 31st deadline',
        'tests': ['search', 'content_understanding']
    },
    {
        'id': 'q4',
        'question': 'Show all emails from john.smith@company.com',
        'type': 'sender_search',
        'expected_result': 'Should find 2 emails about Project Alpha',
        'tests': ['conversation_history', 'threading']
    },
    {
        'id': 'q5',
        'question': 'What files did john.smith@company.com attach?',
        'type': 'attachment_list',
        'expected_result': 'Should list project_alpha_specs.pdf and requirements.docx',
        'tests': ['attachment_handling', 'threading']
    },
    {
        'id': 'q6',
        'question': 'Find all emails with attachments',
        'type': 'filter_by_attachment',
        'expected_result': 'Should find 6 emails with attachments',
        'tests': ['search', 'attachment_filtering']
    },
    {
        'id': 'q7',
        'question': 'What is the contract value with the vendor?',
        'type': 'data_extraction',
        'expected_result': 'Should find $25,000 contract value',
        'tests': ['attachment_parsing', 'search']
    },
    {
        'id': 'q8',
        'question': 'Show me the conversation thread with alice.johnson@project.com',
        'type': 'thread_conversation',
        'expected_result': 'Should show both emails in chronological order (original + follow-up)',
        'tests': ['threading', 'conversation_history']
    },
    {
        'id': 'q9',
        'question': 'What are the action items from the strategic planning meeting?',
        'type': 'action_extraction',
        'expected_result': 'Should parse meeting notes and identify action items',
        'tests': ['attachment_parsing', 'content_understanding']
    },
    {
        'id': 'q10',
        'question': 'Which emails arrived in the last 3 days?',
        'type': 'date_filter',
        # Fixture ages under 3 days: john.smith (2d, 1d) and alice.johnson
        # (1d, 2h). The February invoice is dated exactly 3 days back.
        'expected_result': 'Should find 4 emails: john.smith (2) and alice.johnson (2); invoice INV-2025-002 sits exactly on the 3-day boundary',
        'tests': ['search', 'date_filtering']
    },
    {
        'id': 'q11',
        'question': 'Group my emails by sender',
        'type': 'grouping',
        # The fixture has six distinct 'from' addresses: accounting, john.smith,
        # hr, sales, alice.johnson and compliance.
        'expected_result': 'Should group by unique senders (6 groups)',
        'tests': ['search', 'data_organization']
    },
    {
        'id': 'q12',
        'question': 'What is the project completion status mentioned by john.smith@company.com?',
        'type': 'content_extraction',
        'expected_result': 'Should find "60% of Phase 1 completed"',
        'tests': ['threading', 'content_understanding']
    }
]

print(f"✅ Created {len(test_questions)} test questions\n")
print("Test questions:")
for q in test_questions:
    print(f"  {q['id']}: {q['question']}")

## Section 4: Initialize Email Agent

In [None]:
from email_agent import EmailAgent, authenticate_gmail, ElasticsearchEmailStore

print("🔐 Initializing agent...\n")

# Best-effort setup: a failure is reported with troubleshooting hints rather
# than a traceback. NOTE: `agent` is left undefined when this happens, so the
# cells that use it will record an error for every question.
try:
    # Initialize Gmail
    gmail_service = authenticate_gmail()
    print("✅ Gmail authenticated")

    # Initialize Elasticsearch; host and port come from the environment,
    # falling back to a local instance on the default port.
    es_host = os.getenv("ES_HOST", "localhost")
    es_port = int(os.getenv("ES_PORT", "9200"))
    es_store = ElasticsearchEmailStore(host=es_host, port=es_port)
    print(f"✅ Elasticsearch connected ({es_host}:{es_port})")

    # Create agent
    agent = EmailAgent(
        gmail_service=gmail_service,
        es_store=es_store,
        model="gpt-4o-mini"
    )
    print("✅ Agent initialized\n")
    print("Agent is ready to answer questions!")

except Exception as e:
    print(f"⚠️  Agent initialization failed: {e}")
    print("\nMake sure:")
    print("  1. Elasticsearch is running (docker run -d --name elasticsearch ...)")
    print("  2. Gmail credentials are configured (.env file)")
    print("  3. OPENAI_API_KEY is set")

## Section 5: Ask Questions and Evaluate Responses

**Key Question: Do the results make sense?**

Interact with the agent and record your findings for each question.

In [None]:
# Run every test question through the agent, echo a preview of each answer,
# and collect one record per question in `evaluation_results`.
evaluation_results = []

print("🚀 Starting Evaluation\n")
print("=" * 80)

for i, test_q in enumerate(test_questions, 1):
    print(f"\n[Q{i}/{len(test_questions)}] {test_q['question']}")
    print(f"Type: {test_q['type']}")
    print(f"Expected: {test_q['expected_result']}")
    print("-" * 80)

    # Fields shared by the success and the failure record.
    record = {
        'q_id': test_q['id'],
        'question': test_q['question'],
        'expected': test_q['expected_result'],
    }

    try:
        # Get agent response
        response = agent.chat(test_q['question'])

        # Display only the first 500 characters to keep the output readable;
        # the full response is still stored in the record.
        print("\n🤖 Agent Response:")
        print(response[:500])
        if len(response) > 500:
            print("\n[... response truncated ...]")

        record.update(response=response, status='completed')

    except Exception as e:
        # One failing question must not abort the run; store the error text
        # in place of the response.
        print(f"\n❌ Error: {e}")
        record.update(response=f"ERROR: {e}", status='error')

    evaluation_results.append(record)

print("\n" + "=" * 80)
print(f"\n✅ Evaluation complete! Tested {len(test_questions)} questions")

## Section 6: Document Your Findings

Record which questions worked, which didn't, and anything else worth noting.

In [None]:
# Tabulate the collected records. Naming the columns explicitly keeps the
# frame (and the summary below) usable even when no results were recorded.
df_eval = pd.DataFrame(
    evaluation_results,
    columns=['q_id', 'question', 'expected', 'response', 'status'],
)

print("\n📊 EVALUATION SUMMARY\n")
print(f"Total Questions: {len(df_eval)}")
print(f"Completed: {int((df_eval['status'] == 'completed').sum())}")
print(f"Errors: {int((df_eval['status'] == 'error').sum())}")

# Show results table
print("\n" + "=" * 80)
print("QUESTIONS ASKED:")
print("=" * 80)

# Number the questions by position rather than by index label.
for q_num, (_, row) in enumerate(df_eval.iterrows(), 1):
    succeeded = row['status'] == 'completed'
    status_icon = "✅" if succeeded else "❌"
    print(f"\n{status_icon} [{q_num}] {row['question']}")
    print(f"   Expected: {row['expected']}")
    if succeeded:
        answer = row['response']
        # Long answers are cut to 150 characters and marked with an ellipsis.
        shown = f"{answer[:150]}..." if len(answer) > 150 else answer
        print(f"   Got: {shown}")
    else:
        print(f"   Error: {row['response']}")

## Section 7: Manual Evaluation - Do the Results Make Sense?

For each question, manually assess if the agent's response is correct and makes sense.

In [None]:
# Manual evaluation summary - fill this in after reviewing all responses.
# This cell only prints the review checklist and an example of how to
# record findings; it computes nothing.

print("\n📝 EVALUATION FINDINGS SUMMARY")
print("=" * 80)
print("\nReview each question's response and answer:")
print("  1. Does the response make sense?")
print("  2. Does it match the expected result?")
print("  3. Any issues or missing features?\n")

print("Example findings template:")
print("""
q1: Sender search from LuxC Mar
    Makes sense? ✅ YES
    Matches expected? ✅ YES - Found both invoices
    Issues? None

q2: Total invoice amount
    Makes sense? ⚠️  PARTIAL - Found invoices but didn't calculate total
    Matches expected? ❌ NO - Expected sum but got list
    Issues: Agent should perform calculations on extracted amounts
""")

print("\n" + "=" * 80)
print("YOUR FINDINGS (fill in after reviewing):")
print("=" * 80)

## Section 8: Save Evaluation Results

In [None]:
# Save evaluation results. All three files share one timestamp so the
# outputs of a single run can be matched up later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Save as CSV
csv_file = f"evaluation_results_{timestamp}.csv"
df_eval.to_csv(csv_file, index=False)
print(f"✅ Saved results to {csv_file}")

# Save as JSON for detailed analysis. UTF-8 with ensure_ascii=False keeps
# non-ASCII text in agent responses readable instead of \u-escaped.
json_file = f"evaluation_results_{timestamp}.json"
with open(json_file, 'w', encoding='utf-8') as f:
    json.dump(evaluation_results, f, indent=2, ensure_ascii=False)
print(f"✅ Saved detailed results to {json_file}")

# Save test questions reference
questions_file = f"test_questions_{timestamp}.json"
with open(questions_file, 'w', encoding='utf-8') as f:
    json.dump(test_questions, f, indent=2, ensure_ascii=False)
print(f"✅ Saved test questions to {questions_file}")

## Section 1: Setup and Configuration

In [None]:
import os
import sys
from pathlib import Path

# Add email_agent to path: put the current working directory first on
# sys.path so the local package is importable from the notebook.
sys.path.insert(0, str(Path.cwd()))

from dotenv import load_dotenv
import pandas as pd

# Load environment settings via python-dotenv
load_dotenv()

# Same success marker as the notebook's other status messages.
print("✅ Setup complete")

## Section 2: Generate Questions from Documentation

First, generate test questions using the LLM from your agent documentation.

In [None]:
from email_agent.generate_evaluation_data import main as generate_questions, Config

# Documentation files handed to the question generator.
doc_paths = ["README.md"]

# Generator settings. NOTE(review): later cells read "evaluation_dataset.csv",
# so output_file is presumably where the questions are written - confirm
# against generate_evaluation_data.
config = Config(
    model="gpt-4o-mini",
    max_workers=2,
    output_file="evaluation_dataset.csv"
)

print("🤖 Generating test questions from documentation...")
print("   This uses the LLM to create realistic user queries\n")

generate_questions(doc_paths, config)

In [None]:
# Load and review generated questions
dataset_path = Path("evaluation_dataset.csv")

if dataset_path.exists():
    df_questions = pd.read_csv(dataset_path)
    print(f"\n✅ Generated {len(df_questions)} questions\n")

    # The CSV is produced by an earlier step; show only the expected columns
    # that are actually present instead of failing on a missing one.
    preview_columns = [
        col for col in ('question', 'difficulty', 'intent')
        if col in df_questions.columns
    ]
    print("Sample questions:")
    print(df_questions[preview_columns].head(10).to_string())

    if 'difficulty' in df_questions.columns:
        print("\nDistribution:")
        print(df_questions['difficulty'].value_counts())
else:
    print("⚠️  CSV file not created yet")

## Section 3: Initialize Email Agent

Set up the agent with Gmail and Elasticsearch for evaluation.

In [None]:
from email_agent import EmailAgent, authenticate_gmail, ElasticsearchEmailStore

print("🔐 Initializing agent...")

# No error handling here: any failure below stops the cell with a traceback.

# Initialize Gmail
gmail_service = authenticate_gmail()

# Initialize Elasticsearch; host and port come from the environment,
# falling back to a local instance on the default port.
es_host = os.getenv("ES_HOST", "localhost")
es_port = int(os.getenv("ES_PORT", "9200"))
es_store = ElasticsearchEmailStore(host=es_host, port=es_port)

# Create agent
agent = EmailAgent(
    gmail_service=gmail_service,
    es_store=es_store,
    model="gpt-4o-mini"
)

print("✅ Agent initialized successfully")

## Section 4: Run Evaluation

Execute the agent on all generated questions and evaluate responses.

In [None]:
from email_agent import ManualEvaluator

# Initialize evaluator
evaluator = ManualEvaluator(output_dir="./evaluations")

# Load dataset from generated CSV
dataset = evaluator.load_dataset_from_csv("evaluation_dataset.csv")

print(f"📊 Dataset loaded: {len(dataset.questions)} questions\n")

# Run the agent over the dataset; `results` is consumed by the analysis
# cells further down.
results = evaluator.run_evaluation(agent, dataset)

## Section 5: Review Evaluation Report

In [None]:
# Generate and display report
evaluator.print_report()

# The files below are written under "evaluations/". Create the folder up
# front so saving cannot fail on a missing directory.
# NOTE(review): ManualEvaluator may already create its output_dir - confirm.
Path("evaluations").mkdir(parents=True, exist_ok=True)

# Save results; one shared timestamp ties the three files of a run together.
timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
evaluator.save_results(f"evaluations/results_{timestamp}.pkl")
evaluator.save_report(f"evaluations/report_{timestamp}.txt")
evaluator.save_results_csv(f"evaluations/results_{timestamp}.csv")

## Section 6: Analyze Results

Detailed analysis of evaluation metrics and per-question breakdown.

In [None]:
# Analyze per-question performance: one row per evaluated question with its
# overall score and a tally of the individual checks.
if results:
    data = []
    for result in results:
        checks = result.checks
        question = result.question
        # Mark truncation only when the question was actually shortened.
        label = (question[:50] + "...") if len(question) > 50 else question
        data.append({
            'question': label,
            'overall_score': result.overall_score,
            'checks_passed': sum(1 for c in checks if c.passed),
            'total_checks': len(checks),
            # A result without checks reports 0 rather than dividing by zero.
            'avg_score': sum(c.score for c in checks) / len(checks) if checks else 0,
        })

    df_results = pd.DataFrame(data)
    print("\n📋 Per-Question Analysis:\n")
    print(df_results.to_string())

    print("\n\n📊 Summary Statistics:")
    print(f"  Best score: {df_results['overall_score'].max():.1f}%")
    print(f"  Worst score: {df_results['overall_score'].min():.1f}%")
    print(f"  Average: {df_results['overall_score'].mean():.1f}%")
    print(f"  Median: {df_results['overall_score'].median():.1f}%")

In [None]:
# Create results dataframe for analysis: one row per question, one column
# per individual check (keyed by the check's name value) plus the overall score.
results_data = []
for result in results:
    row = {
        'question': result.question,
        'overall_score': result.overall_score,
    }
    for check in result.checks:
        row[check.name.value] = check.score
    results_data.append(row)

df_results = pd.DataFrame(results_data)

if df_results.empty:
    # Without rows there is no 'overall_score' column to summarize.
    print("⚠️  No evaluation results to analyze")
else:
    print("📊 Results Summary Statistics:")
    print(f"  Average Score: {df_results['overall_score'].mean():.1f}%")
    print(f"  Median Score: {df_results['overall_score'].median():.1f}%")
    print(f"  Min Score: {df_results['overall_score'].min():.1f}%")
    print(f"  Max Score: {df_results['overall_score'].max():.1f}%")

    # Every column other than the two fixed ones holds a per-check score.
    # NOTE(review): the "/10" label assumes checks are scored out of 10 - confirm.
    print("\n✅ Per-Check Average Scores:")
    check_columns = [
        col for col in df_results.columns
        if col not in ('question', 'overall_score')
    ]
    for col in check_columns:
        print(f"  {col:25s}: {df_results[col].mean():.1f}/10")

    # Display detailed results
    print("\n📝 Detailed Results:")
    print(df_results.to_string())