In [None]:
import json
import pandas as pd
import textwrap
from ipywidgets import widgets, HBox, VBox
from IPython.display import display, HTML
from tabulate import tabulate
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re

def jprint(obj):
    """Pretty-print a JSON-serializable object to stdout with 2-space indentation."""
    rendered = json.dumps(obj, indent=2)
    print(rendered)


# Evaluation Results Inspector

This notebook provides tools to inspect and analyze evaluation results from conversational multi-hop QA tasks.


## Data Loading Functions


In [None]:
def load_results(file_path):
    """Load evaluation results from a JSONL file.

    Args:
        file_path: Path to a file containing one JSON document per line.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    results = []
    # JSON text is UTF-8 by specification; do not depend on the platform default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no record.
            if not line:
                continue
            results.append(json.loads(line))
    return results


def results_to_dataframe(results):
    """Convert results to a pandas DataFrame with derived columns.

    Each record is read for the keys ``prompt``, ``answer``, ``info`` and
    ``completion``; ``info`` is read for ``answers``, ``n_hops`` and ``docs``.

    Derived columns:
        question: First line of the second prompt message's content
            (presumably the user turn -- confirm against the data), or ""
            when the prompt has fewer than two messages.
        predicted_answer: Copy of ``answer`` (the ``answer`` column itself is dropped).
        reference_answers: ``str()`` of ``info["answers"]``.
        n_hops: ``info["n_hops"]``.
        exact_match: 1/0 from ``calculate_exact_match``; only computed when the
            records do not already provide an ``exact_match`` column.
        n_turns, n_tool_calls, used_supporting_docs: Conversation statistics
            computed from ``completion``.

    Args:
        results: List of result records (dicts), e.g. from ``load_results``.

    Returns:
        A DataFrame with the original columns (minus ``answer``) plus the
        derived columns listed above.
    """
    df = pd.DataFrame(results)

    # Extract key information
    df["question"] = df["prompt"].apply(lambda x: x[1]["content"].split("\n")[0] if len(x) > 1 else "")
    df["predicted_answer"] = df["answer"]
    df["reference_answers"] = df["info"].map(lambda x: str(x["answers"]))
    df["n_hops"] = df["info"].map(lambda x: x["n_hops"])

    # The summary, plotting, browser and error-analysis cells all read
    # df["exact_match"]; derive it here unless the results already contain it.
    if "exact_match" not in df.columns:
        df["exact_match"] = [
            # Non-string answers (e.g. NaN for a missing value) count as "no answer".
            calculate_exact_match(answer if isinstance(answer, str) else "", info["answers"])
            for answer, info in zip(df["answer"], df["info"])
        ]

    # Conversation analysis
    df["n_turns"] = df["completion"].apply(count_turns)
    df["n_tool_calls"] = df["completion"].apply(count_tool_calls)
    df["used_supporting_docs"] = df.apply(
        lambda row: check_supporting_docs_usage(row["completion"], row["info"]["docs"]), axis=1
    )

    return df.drop(columns=["answer"])


def extract_final_answer(completion):
    """Extract the final answer from the completion.

    Scans the messages from last to first and returns the text inside the
    first ``<answer>...</answer>`` pair found in the most recent assistant
    message that contains one.

    Args:
        completion: List of message dicts (may be empty or None).

    Returns:
        The stripped answer text, or "" when no assistant message contains
        an ``<answer>`` tag.
    """
    if not completion:
        return ""

    for message in reversed(completion):
        if message.get("role") != "assistant":
            continue
        content = message.get("content")
        # Assistant messages that only carry tool calls can have content=None
        # (or a non-string payload); there is no text to search in those.
        if not isinstance(content, str):
            continue
        # DOTALL lets the answer span multiple lines.
        answer_match = re.search(r"<answer>(.*?)</answer>", content, re.DOTALL)
        if answer_match:
            return answer_match.group(1).strip()

    return ""


def calculate_exact_match(predicted, ground_truth_list):
    """Return 1 if the prediction equals any reference answer, else 0.

    The comparison is case-insensitive and ignores leading/trailing
    whitespace. An empty or missing prediction always scores 0.
    """
    if not predicted:
        return 0

    normalized = predicted.lower().strip()
    matched = any(normalized == reference.lower().strip() for reference in ground_truth_list)
    return 1 if matched else 0


def count_turns(completion):
    """Count the messages in the completion whose role is "assistant"."""
    return sum(1 for message in completion if message.get("role") == "assistant")


def count_tool_calls(completion):
    """Sum the lengths of every non-empty "tool_calls" list across all messages."""
    return sum(
        len(message["tool_calls"])
        for message in completion
        if message.get("tool_calls")
    )


def check_supporting_docs_usage(completion, docs):
    """Return the fraction of supporting documents that appear in tool responses.

    A document counts as retrieved when a message with role "tool" contains
    the text "Document ID: <digits>" matching the document's id. Returns 0
    when no document is flagged as supporting.
    """
    supporting_ids = [str(doc["id"]) for doc in docs if doc.get("is_supporting", False)]

    # Collect every numeric document id mentioned in any tool response.
    retrieved_ids = set()
    tool_messages = (m for m in completion if m.get("role") == "tool" and "content" in m)
    for tool_message in tool_messages:
        retrieved_ids.update(re.findall(r"Document ID: (\d+)", tool_message["content"]))

    n_supporting = len(supporting_ids)
    if n_supporting <= 0:
        return 0

    n_retrieved = len(retrieved_ids.intersection(supporting_ids))
    return n_retrieved / n_supporting


## Load and Process Data


In [None]:
# Load the results file and build the DataFrame that every later cell reads as `df`.
# The path is relative, so it resolves against the kernel's current working directory.
results_file = "../outputs/sample.jsonl"  # Change this to your actual results file
results = load_results(results_file)  # one parsed JSON record per line of the file
df = results_to_dataframe(results)  # adds the derived question/hops/conversation columns

print(f"Loaded {len(df)} evaluation results")
df.head()  # last expression of the cell, so the notebook renders a preview table


In [None]:
# Show which columns are available on the processed DataFrame
df.columns

## Summary Statistics


In [None]:
def print_summary_stats(df):
    """Print an overview of the evaluation run.

    Reports overall exact match and reward, a per-hop breakdown, average
    conversation statistics, and the distribution of assistant turns.
    Reads the columns exact_match, reward, n_hops, n_turns, n_tool_calls
    and used_supporting_docs; returns nothing.
    """
    total = len(df)
    exact = df['exact_match']

    print("=== EVALUATION SUMMARY ===")
    print(f"Total examples: {total}")
    print(f"Exact Match: {exact.mean():.3f} ({exact.sum()}/{total})")
    print(f"Average reward: {df['reward'].mean():.3f}")

    print("\n=== BY NUMBER OF HOPS ===")
    # Named aggregation yields the flat column labels directly.
    per_hop = (
        df.groupby('n_hops')
        .agg(
            count=('exact_match', 'count'),
            exact_match=('exact_match', 'mean'),
            avg_reward=('reward', 'mean'),
        )
        .round(3)
    )
    print(per_hop)

    print("\n=== CONVERSATION STATS ===")
    print(f"Average turns: {df['n_turns'].mean():.1f}")
    print(f"Average tool calls: {df['n_tool_calls'].mean():.1f}")
    print(f"Average supporting docs used: {df['used_supporting_docs'].mean():.3f}")

    print("\n=== TURN DISTRIBUTION ===")
    turn_counts = df['n_turns'].value_counts().sort_index()
    for n_turns, n_examples in turn_counts.items():
        share = n_examples / total * 100
        print(f"{n_turns} turns: {n_examples} examples ({share:.1f}%)")

# Print the summary for the loaded results
print_summary_stats(df)


## Visualization


In [None]:
# Overview figure: a 2x2 grid of diagnostic plots over the processed results
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# Top-left: mean exact match for each hop count
hop_performance = df.groupby('n_hops')['exact_match'].mean()
ax = axes[0, 0]
ax.bar(hop_performance.index, hop_performance.values)
ax.set_title('Exact Match by Number of Hops')
ax.set_xlabel('Number of Hops')
ax.set_ylabel('Exact Match Rate')

# Top-right: histogram of tool calls per example
ax = axes[0, 1]
ax.hist(df['n_tool_calls'], bins=20, alpha=0.7)
ax.set_title('Distribution of Tool Calls')
ax.set_xlabel('Number of Tool Calls')
ax.set_ylabel('Frequency')

# Bottom-left: fraction of supporting docs retrieved against exact match
ax = axes[1, 0]
ax.scatter(df['used_supporting_docs'], df['exact_match'], alpha=0.6)
ax.set_title('Supporting Docs Usage vs Exact Match')
ax.set_xlabel('Fraction of Supporting Docs Used')
ax.set_ylabel('Exact Match')

# Bottom-right: histogram of rewards
ax = axes[1, 1]
ax.hist(df['reward'], bins=20, alpha=0.7)
ax.set_title('Reward Distribution')
ax.set_xlabel('Reward')
ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()


## Interactive Browser


In [None]:
def format_conversation(completion):
    """Format conversation for display.

    Assistant messages contribute one "Tool Call" line per tool call followed
    by an "Assistant" line when they have text content. Tool messages
    contribute a "Tool Response" line truncated to 500 characters. Messages
    with any other role are omitted.

    Tool calls may be given either as dicts or as JSON-encoded strings; both
    are read for ``function.name`` and ``function.arguments``.

    Args:
        completion: List of message dicts.

    Returns:
        The formatted lines joined by blank lines.
    """
    formatted = []

    for message in completion:
        role = message.get('role', 'unknown')
        # content may be present but None (e.g. tool-call-only assistant turns).
        content = message.get('content') or ''

        if role == 'assistant':
            for tc in message.get('tool_calls') or []:
                if isinstance(tc, str):
                    try:
                        tc = json.loads(tc)
                    except json.JSONDecodeError:
                        # Show the raw payload rather than aborting the whole render.
                        formatted.append(f"🔧 **Tool Call**: {tc}")
                        continue
                if not isinstance(tc, dict):
                    # Unrecognized tool-call representation; nothing to read fields from.
                    continue
                function = tc.get('function') or {}
                func_name = function.get('name', 'unknown')
                args = function.get('arguments', '{}')
                formatted.append(f"🔧 **Tool Call**: {func_name}({args})")

            if content:
                formatted.append(f"🤖 **Assistant**: {content}")

        elif role == 'tool':
            if not isinstance(content, str):
                content = str(content)
            # Truncate long tool responses
            truncated = content[:500] + "..." if len(content) > 500 else content
            formatted.append(f"⚙️ **Tool Response**: {truncated}")

    return "\n\n".join(formatted)

def format_documents(docs):
    """Render one line per document: its id, title, and supporting status."""

    def describe(doc):
        # Documents without an explicit flag are treated as non-supporting.
        if doc.get('is_supporting', False):
            status = "✅ Supporting"
        else:
            status = "❌ Non-supporting"
        heading = doc.get('title', 'No title')
        return f"Doc {doc['id']}: {heading} ({status})"

    return "\n".join(describe(doc) for doc in docs)

def format_example(row):
    """Render one DataFrame row as a grid table.

    The table lists the question, answers, scores and conversation counts,
    then the available documents and the formatted conversation.
    """
    verdict = "✅ Yes" if row['exact_match'] else "❌ No"

    # Headline fields: question, answers and per-example metrics.
    overview = [
        ["Question", textwrap.fill(row['question'], width=80)],
        ["Ground Truth", str(row['info']['answers'])],
        ["Predicted", row['predicted_answer']],
        ["Exact Match", verdict],
        ["Reward", f"{row['reward']:.3f}"],
        ["Hops", str(row['n_hops'])],
        ["Turns", str(row['n_turns'])],
        ["Tool Calls", str(row['n_tool_calls'])],
        ["Supporting Docs Used", f"{row['used_supporting_docs']:.1%}"],
    ]

    # Longer sections, each preceded by an empty separator row.
    sections = [
        ["", ""],
        ["Available Documents", format_documents(row['info']['docs'])],
        ["", ""],
        ["Conversation", format_conversation(row['completion'])],
    ]

    return tabulate(overview + sections, tablefmt='grid')


In [None]:
def create_browser(df):
    """Create interactive browser for examples.

    Displays filter controls (correct only / incorrect only / hop count),
    navigation controls (previous / index / next / random) and an output
    area showing the selected example rendered by ``format_example``.
    The view is re-rendered whenever any control's value changes.

    When both "correct only" and "incorrect only" are ticked, the
    "correct only" filter takes precedence.

    Args:
        df: Processed results DataFrame; read for the ``exact_match`` and
            ``n_hops`` columns plus whatever ``format_example`` needs.
    """
    # Kept local: numpy is not part of the notebook's import cell.
    import numpy as np

    # Filters
    correct_only = widgets.Checkbox(description='Correct answers only', value=False)
    incorrect_only = widgets.Checkbox(description='Incorrect answers only', value=False)
    hop_filter = widgets.Dropdown(
        options=['All'] + sorted(df['n_hops'].unique().tolist()),
        value='All',
        description='Hops:'
    )

    # Navigation
    index_widget = widgets.IntText(value=0, description='Index:')
    prev_button = widgets.Button(description='Previous')
    next_button = widgets.Button(description='Next')
    random_button = widgets.Button(description='Random')

    # Output
    output = widgets.Output()

    def get_filtered_df():
        filtered_df = df.copy()

        if correct_only.value:
            filtered_df = filtered_df[filtered_df['exact_match'] == 1]
        elif incorrect_only.value:
            filtered_df = filtered_df[filtered_df['exact_match'] == 0]

        if hop_filter.value != 'All':
            filtered_df = filtered_df[filtered_df['n_hops'] == hop_filter.value]

        return filtered_df.reset_index(drop=True)

    def update_display(change=None):
        # `change` is the traitlets notification dict when called as an observer; unused.
        filtered_df = get_filtered_df()

        if len(filtered_df) > 0:
            idx = max(0, min(index_widget.value, len(filtered_df) - 1))
            if idx != index_widget.value:
                # Clamping changes the widget value, which re-invokes this
                # observer with the valid index; let that call do the render
                # so the example is not printed twice.
                index_widget.value = idx
                return

        with output:
            output.clear_output()

            if len(filtered_df) == 0:
                print("No examples match the current filters.")
                return

            print(f"Example {idx + 1} of {len(filtered_df)} (filtered)")
            print("=" * 80)
            print(format_example(filtered_df.iloc[idx]))

    def on_prev_clicked(b):
        index_widget.value = max(0, index_widget.value - 1)

    def on_next_clicked(b):
        # max(0, ...) keeps the index non-negative when the filter matches nothing.
        last_index = max(0, len(get_filtered_df()) - 1)
        index_widget.value = min(last_index, index_widget.value + 1)

    def on_random_clicked(b):
        n_examples = len(get_filtered_df())
        if n_examples > 0:
            index_widget.value = int(np.random.randint(0, n_examples))

    # Wire up events
    prev_button.on_click(on_prev_clicked)
    next_button.on_click(on_next_clicked)
    random_button.on_click(on_random_clicked)

    # Update display when any control changes. Observers are attached directly:
    # interactive_output() would call its function with keyword arguments named
    # after the control keys and capture any resulting error in an Output
    # widget of its own, which this function never displays.
    for control in (index_widget, correct_only, incorrect_only, hop_filter):
        control.observe(update_display, names='value')

    # Layout
    filters = HBox([correct_only, incorrect_only, hop_filter])
    navigation = HBox([prev_button, index_widget, next_button, random_button])

    display(VBox([filters, navigation, output]))

    # Initial display
    update_display()

# Build and display the interactive example browser for the loaded results
create_browser(df)


## Error Analysis


In [None]:
def analyze_errors(df):
    """Print an error report and return the incorrectly answered rows.

    The report covers the error rate per hop count, supporting-document
    usage and tool-call counts for correct versus incorrect examples, and
    the number of examples whose predicted answer is the empty string.

    Returns:
        The subset of ``df`` where ``exact_match`` equals 0.
    """
    correct = df[df['exact_match'] == 1]
    incorrect = df[df['exact_match'] == 0]

    print(f"=== ERROR ANALYSIS ({len(incorrect)} incorrect examples) ===")

    # Error rate per hop count is the complement of the mean exact match.
    print("\nError rate by hops:")
    error_rate_by_hops = df.groupby('n_hops').agg({
        'exact_match': lambda scores: 1 - scores.mean()
    }).round(3)
    print(error_rate_by_hops)

    # How much of the supporting evidence each group retrieved.
    print("\nSupporting document usage comparison:")
    print(f"Correct answers: {correct['used_supporting_docs'].mean():.3f}")
    print(f"Incorrect answers: {incorrect['used_supporting_docs'].mean():.3f}")

    # How many tool calls each group made on average.
    print("\nTool call patterns:")
    print(f"Correct answers avg tool calls: {correct['n_tool_calls'].mean():.1f}")
    print(f"Incorrect answers avg tool calls: {incorrect['n_tool_calls'].mean():.1f}")

    # Rows where the prediction is the empty string.
    n_unanswered = len(df[df['predicted_answer'] == ''])
    unanswered_pct = n_unanswered / len(df) * 100
    print(f"\nExamples with no predicted answer: {n_unanswered} ({unanswered_pct:.1f}%)")

    return incorrect

# Print the error report and keep the incorrect rows for further inspection
error_df = analyze_errors(df)


## Custom Analysis


In [None]:
# Scratch space for ad-hoc analysis: replace or extend the examples below
# (e.g. specific question types or patterns in tool usage).

# Example: exact match restricted to questions that need exactly 2 hops
two_hop_questions = df.loc[df['n_hops'] == 2]
two_hop_exact_match = two_hop_questions['exact_match'].mean()
print(f"2-hop questions performance: {two_hop_exact_match:.3f}")

# Example: does making more tool calls go along with correct answers?
correlation = df['n_tool_calls'].corr(df['exact_match'])
print(f"Correlation between tool calls and exact match: {correlation:.3f}")

# Example: the first five failed examples, in dataset order
print("\nMost common failure cases (first 5):")
failed_examples = df.loc[df['exact_match'] == 0].head(5)
for idx, row in failed_examples.iterrows():
    question_preview = row['question'][:100]
    print(f"- Question: {question_preview}...")
    print(f"  Predicted: '{row['predicted_answer']}'")
    print(f"  Expected: {row['info']['answers']}")
    print()
