# IRIS System Evaluation Notebook

This notebook provides tools for evaluating the IRIS system against expected outputs and tracking agent decisions at each stage.

In [1]:
# This enables auto-reloading of modules
%load_ext autoreload
%autoreload 2

import json
import pandas as pd
import time
from IPython.display import display, HTML
from iris.src.chat_model.model import model

## Test Case Definition

First, let's define a structure for our test cases. Each test case should include:
- A unique ID
- A description of the test case
- The input conversation
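- Expected outputs at various stages: the expected router decision (`router`), clarifier decision (`clarifier`), context items the clarifier should request (`requested_context`), and databases to query (`databases`)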

In [2]:
# Sample test cases. Each one shares the same conversation scaffolding (a
# system prompt followed by a single user question), so they are assembled
# through a small factory instead of being spelled out in full each time.
def _build_test_case(case_id, description, question, expected):
    """Assemble one test-case dict in the layout used throughout this notebook.

    A fresh system-message dict is created on every call so that no two test
    cases share (and could accidentally co-mutate) the same message object.
    """
    return {
        "id": case_id,
        "description": description,
        "conversation": {
            "messages": [
                {"role": "system", "content": "You are a helpful accounting assistant."},
                {"role": "user", "content": question},
            ]
        },
        "expected": expected,
    }


test_cases = [
    _build_test_case(
        "TC001",
        "Simple IFRS 16 lease query",
        "How do I account for lease modifications under IFRS 16?",
        {
            "router": "research_from_database",
            "clarifier": "request_essential_context",
            "requested_context": ["type of modification"],
            "databases": ["external_iasb", "external_kpmg"],
        },
    ),
    _build_test_case(
        "TC002",
        "Revenue recognition with complete context",
        "What is the process for recognizing revenue for software licenses under IFRS 15?",
        {
            "router": "research_from_database",
            "clarifier": "request_essential_context",
            "requested_context": ["type of software license", "additional services"],
            "databases": ["external_iasb", "internal_wiki"],
        },
    ),
    _build_test_case(
        "TC003",
        "Internal controls query",
        "What are the key internal controls for accounts payable?",
        {
            "router": "research_from_database",
            "clarifier": "request_essential_context",
            "requested_context": ["industry", "regulations"],
            "databases": ["internal_icfr", "internal_par"],
        },
    ),
]

# Tabular view of the same cases; the last expression renders the overview.
test_df = pd.DataFrame(test_cases)
test_df[["id", "description"]]

Unnamed: 0,id,description
0,TC001,Simple IFRS 16 lease query
1,TC002,Revenue recognition with complete context
2,TC003,Internal controls query


## Running Tests and Collecting Results

Now, let's create a function to run each test case and collect the results:

In [3]:
def run_test_case(test_case):
    """Run a single test case and collect results from various stages.

    Calls ``model(test_case["conversation"])`` and consumes the chunks it
    yields. The first chunk is treated as metadata and supplies
    ``response_type``; every later chunk that has a ``"stage"`` key is
    recorded under ``stages``, and every truthy ``"response_chunk"`` value is
    appended to the final response text.

    Args:
        test_case: Dict with ``"id"``, ``"description"`` and ``"conversation"``
            keys.

    Returns:
        Dict with ``id``, ``description``, ``start_time``, ``end_time``,
        ``duration`` (seconds), ``stages``, ``final_response`` and
        ``response_type``. An ``error`` key is present only when the run
        failed; in that case ``final_response`` still holds whatever text was
        streamed before the failure.
    """

    print(f"Running test case {test_case['id']}: {test_case['description']}")

    results = {
        "id": test_case["id"],
        "description": test_case["description"],
        "start_time": time.time(),
        "stages": [],
        "final_response": "",
        "response_type": ""
    }

    # Collected outside the try block so partial output survives a mid-stream failure.
    response_parts = []
    no_chunk = object()

    # Run the model
    try:
        # iter() lets the model return any iterable, not only a generator.
        result_stream = iter(model(test_case["conversation"]))

        # Get the initial chunk with metadata. A default is used so an empty
        # stream is reported explicitly instead of as a message-less StopIteration.
        initial_chunk = next(result_stream, no_chunk)
        if initial_chunk is no_chunk:
            results["error"] = "Model returned no output (empty response stream)"
        else:
            results["response_type"] = initial_chunk.get("type", "None")

            # Capture the stage records and the response text
            for chunk in result_stream:
                if "stage" in chunk:
                    results["stages"].append({
                        "name": chunk.get("stage", "unknown"),
                        "details": chunk
                    })

                if chunk.get("response_chunk"):
                    response_parts.append(chunk["response_chunk"])

    except Exception as e:
        # Deliberately broad: a failing case must not abort the whole evaluation
        # run. Fall back to the exception type when the message is empty.
        results["error"] = str(e) or type(e).__name__

    results["final_response"] = "".join(response_parts)
    results["end_time"] = time.time()
    results["duration"] = results["end_time"] - results["start_time"]

    return results

# Run a test case to see the output structure
# sample_result = run_test_case(test_cases[0])
# print(json.dumps(sample_result, indent=2))

## Running Multiple Tests

Now let's run all our test cases and collect the results:

In [4]:
def run_all_tests(test_cases):
    """Execute every test case in order and gather the per-case results.

    Each case is passed to ``run_test_case``; after it finishes, its
    duration and response type are printed, followed by a separator line.

    Args:
        test_cases: Iterable of test-case dicts (each needs an ``"id"`` key).

    Returns:
        List of the result dicts, in the same order as the input cases.
    """
    separator = "-" * 80
    collected = []

    for case in test_cases:
        outcome = run_test_case(case)
        collected.append(outcome)

        # Per-case progress summary
        print(f"Completed test case {case['id']} in {outcome['duration']:.2f} seconds")
        print(f"Response type: {outcome['response_type']}")
        print(separator)

    return collected

# Uncomment to run all tests
# all_results = run_all_tests(test_cases)

## Analyzing Single Test Case with Detailed Logging

Let's create a function to run a single test case with detailed logging of each stage:

In [5]:
def analyze_test_case(test_case_id):
    """Run a single test case with detailed logging and analysis.

    Looks the case up by ID in the notebook-level ``test_cases`` list, prints
    its input conversation (message content truncated to 100 characters) and
    expected outcomes, runs it through ``run_test_case``, then prints the
    response type, execution time, recorded stages and final response.

    Args:
        test_case_id: ID of the test case to run (e.g. ``"TC001"``).

    Returns:
        The result dict from ``run_test_case``, or ``None`` when no test case
        has the given ID. A failed run still returns its result dict (with an
        ``"error"`` key), so the error and timing remain available to the
        caller and the result can be passed on to ``compare_results``.
    """

    # Find the test case
    test_case = next((tc for tc in test_cases if tc["id"] == test_case_id), None)
    if not test_case:
        print(f"Test case {test_case_id} not found")
        return None

    print(f"Analyzing test case {test_case['id']}: {test_case['description']}")
    print("Input conversation:")
    for i, msg in enumerate(test_case["conversation"]["messages"]):
        print(f"  Message {i+1}: {msg['role']} - {msg['content'][:100]}{'...' if len(msg['content']) > 100 else ''}")
    print("\nExpected outcomes:")
    for key, value in test_case["expected"].items():
        print(f"  {key}: {value}")

    print("\nRunning test...")
    result = run_test_case(test_case)

    print(f"\nResponse type: {result['response_type']}")
    print(f"Execution time: {result['duration']:.2f} seconds")

    if "error" in result:
        print(f"Error: {result['error']}")
        # Skip the stage/response dump, but hand the result back rather than
        # None so callers do not lose the error details.
        return result

    print("\nStages:")
    for stage in result.get("stages", []):
        print(f"  Stage: {stage['name']}")
        for k, v in stage["details"].items():
            # The stage name was already printed on the line above.
            if k != "stage":
                print(f"    {k}: {v}")

    print("\nFinal response:")
    print(result["final_response"])

    return result

# Example usage
# detailed_result = analyze_test_case("TC001")

## Adding Expected vs. Actual Analysis

Let's create a function to compare expected vs. actual results:

In [6]:
def _extract_actual_outcomes(result):
    """Pull the router, clarifier and query-plan outcomes out of a run result."""
    actual = {
        "router": None,
        "clarifier": None,
        "requested_context": [],
        "databases": []
    }

    # A missing result (None) is treated as a run that recorded no stages.
    for stage in (result or {}).get("stages", []):
        details = stage["details"]
        if stage["name"] == "router_decision":
            actual["router"] = details.get("decision")
        elif stage["name"] == "clarifier_decision":
            actual["clarifier"] = details.get("decision")
            if "context_questions" in details:
                actual["requested_context"] = details["context_questions"]
        elif stage["name"] == "query_plan":
            if "queries" in details:
                actual["databases"] = [q.get("database") for q in details["queries"]]

    return actual


def _fraction_matched(expected_items, actual_items):
    """Return the fraction of expected items found in the actual items.

    An expected item counts as found when it occurs, case-insensitively, as a
    substring of at least one actual string item; non-string actual items are
    ignored. A bare string on either side is treated as a single item rather
    than being iterated character by character.
    """
    if isinstance(expected_items, str):
        expected_items = [expected_items]
    if isinstance(actual_items, str):
        actual_items = [actual_items]
    expected_items = expected_items or []
    actual_items = actual_items or []

    if not expected_items:
        # Expecting nothing is satisfied only when nothing was produced.
        return 1.0 if not actual_items else 0.0

    candidates = [item.lower() for item in actual_items if isinstance(item, str)]
    found = [
        any(exp_item.lower() in candidate for candidate in candidates)
        for exp_item in expected_items
    ]
    return sum(found) / len(found)


def compare_results(test_case_id, result):
    """Compare expected vs. actual results for a test case.

    The actual values are read from the ``stages`` of ``result``:
    ``router_decision`` and ``clarifier_decision`` stages supply the
    ``decision`` values, a clarifier stage's ``context_questions`` supplies the
    requested context, and a ``query_plan`` stage's ``queries`` supply the
    databases. List-valued keys (``requested_context``, ``databases``) are
    scored by the fraction of expected items found in the actual items; all
    other keys are compared for equality.

    Args:
        test_case_id: ID of the test case whose ``expected`` dict is used.
        result: Result dict from ``run_test_case`` / ``analyze_test_case``.
            ``None`` is accepted and treated as a run with no stages.

    Returns:
        Dict mapping each expected key to ``{"expected", "actual", "match"}``
        (plus ``"match_percentage"`` for list-valued keys), or ``None`` when
        the test case ID is unknown.
    """

    # Find the test case
    test_case = next((tc for tc in test_cases if tc["id"] == test_case_id), None)
    if not test_case:
        print(f"Test case {test_case_id} not found")
        return None

    expected = test_case["expected"]
    actual = _extract_actual_outcomes(result)

    # Compare expected vs. actual
    comparison = {}
    for key, expected_value in expected.items():
        actual_value = actual.get(key)

        if key in ("requested_context", "databases"):
            # For lists, score the share of expected items that appear in actual
            match_percentage = _fraction_matched(expected_value, actual_value)
            comparison[key] = {
                "expected": expected_value,
                "actual": actual_value,
                "match": match_percentage == 1,
                "match_percentage": match_percentage
            }
        else:
            # For simple values, require equality; this also lets an explicit
            # "nothing expected" (None) match a stage that never ran.
            comparison[key] = {
                "expected": expected_value,
                "actual": actual_value,
                "match": expected_value == actual_value
            }

    # Calculate overall match percentage (share of keys that fully matched)
    matches = [c["match"] for c in comparison.values()]
    match_percentage = sum(matches) / len(matches) if matches else 0

    # Print results
    print(f"\nTest case {test_case_id} comparison:")
    print(f"Overall match: {match_percentage:.0%}")

    for key, comp in comparison.items():
        print(f"\n{key}:")
        print(f"  Expected: {comp['expected']}")
        print(f"  Actual: {comp['actual']}")
        if "match_percentage" in comp:
            print(f"  Match: {comp['match_percentage']:.0%}")
        else:
            print(f"  Match: {'✓' if comp['match'] else '✗'}")

    return comparison

# Example usage
# result = analyze_test_case("TC001")
# comparison = compare_results("TC001", result)

## Run Tests and Analyze

Now you can run any test case and analyze its results:

In [7]:
# Uncomment to run a specific test case
# result = analyze_test_case("TC001")
# comparison = compare_results("TC001", result)

## Adding Your Own Test Cases

You can add your own test cases to the list by following this pattern:

In [8]:
def add_test_case(test_id, description, conversation, expected_outcome):
    """Register a test case in the notebook-level ``test_cases`` list.

    When a case with the same ID is already registered, a warning is printed
    and the old case is removed before the new one is appended, so the
    replacement ends up at the end of the list. The module-level ``test_df``
    DataFrame is rebuilt from the updated list afterwards.

    Args:
        test_id: Unique ID for the case.
        description: Short human-readable description.
        conversation: Conversation dict passed to the model.
        expected_outcome: Dict of expected values, stored under ``"expected"``.

    Returns:
        The newly created test-case dict.
    """
    global test_df

    entry = dict(
        id=test_id,
        description=description,
        conversation=conversation,
        expected=expected_outcome,
    )

    # Replace rather than duplicate when the ID is already taken
    known_ids = [case["id"] for case in test_cases]
    if test_id in known_ids:
        print(f"Warning: Test case {test_id} already exists. It will be overwritten.")
        # Slice assignment keeps the same list object for other references
        test_cases[:] = [case for case in test_cases if case["id"] != test_id]

    test_cases.append(entry)
    print(f"Added test case {test_id}: {description}")

    # Keep the tabular view in sync with the list
    test_df = pd.DataFrame(test_cases)

    return entry

# Example usage
# new_test = add_test_case(
#     "TC004",
#     "IFRS vs GAAP comparison",
#     {
#         "messages": [
#             {"role": "system", "content": "You are a helpful accounting assistant."},
#             {"role": "user", "content": "What are the key differences between IFRS and US GAAP for revenue recognition?"}
#         ]
#     },
#     {
#         "router": "research_from_database",
#         "clarifier": "request_essential_context",
#         "requested_context": ["specific industry", "time period"],
#         "databases": ["external_iasb", "external_ey", "external_kpmg"]
#     }
# )