# Test LLM-Based Orchestrator

This notebook tests the new LLM-based orchestrator that:
- Detects semantic duplicates
- Merges similar suggestions
- Identifies contradictions
- Processes sections in parallel
- Filters low-value suggestions

## Setup

In [None]:
import sys
import os
import asyncio
from pathlib import Path

# Locate the project root: the closest directory, starting at the current
# working directory and walking upwards, that contains the `app` package
# imported further down. Searching (rather than always taking the parent)
# keeps this cell idempotent: the chdir below moves the kernel into the
# project root, so a plain "parent of cwd" would climb one level higher on
# every re-run and break the `app.*` imports.
_start_dir = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "app").is_dir()
    ),
    _start_dir.parent,  # no `app` directory found: fall back to the parent
)

# Make `app.*` importable without stacking duplicate entries on re-runs.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

# Work from the project root. NOTE(review): presumably this is so the .env
# file is resolved relative to the working directory - confirm against the
# app's settings loader.
os.chdir(project_root)

from app.agents.orchestrator_agent import OrchestratorAgent
from app.models.schemas import (
    SectionSuggestions,
    SuggestionGroup,
    SuggestionItem,
    SuggestionType,
    SeverityLevel,
)

# Confirm that the setup cell ran to the end and show the directory the
# kernel is working in.
setup_report = ("✓ Imports successful", f"Working directory: {os.getcwd()}")
print(*setup_report, sep="\n")

## Test 1: Semantic Duplicate Detection

Test whether the orchestrator recognizes these two differently worded suggestions as duplicates:
- "Variable x is not defined"
- "Missing definition for variable x"

In [None]:
# Fixture for the duplicate test: the same problem on line 15 reported twice
# with different wording, once under CLARITY and once under RIGOR.
undefined_variable_item = SuggestionItem(
    text="Variable x is not defined before first use",
    line=15,
    severity=SeverityLevel.ERROR,
    severity_score=0.75,
    explanation="Using undefined variables leads to confusion and potential errors.",
    suggested_fix="Define variable x before line 15, e.g., 'Let x denote...'",
)
missing_definition_item = SuggestionItem(
    text="Missing definition for variable x",
    line=15,
    severity=SeverityLevel.ERROR,
    severity_score=0.80,
    explanation="Mathematical rigor requires all variables to be properly defined.",
    suggested_fix="Add formal definition: 'Let x ∈ R denote...'",
)

test_section_duplicates = SectionSuggestions(
    section="Introduction",
    line=1,
    section_type="introduction",
    suggestions=[
        SuggestionGroup(
            type=SuggestionType.CLARITY,
            count=1,
            items=[undefined_variable_item],
        ),
        SuggestionGroup(
            type=SuggestionType.RIGOR,
            count=1,
            items=[missing_definition_item],
        ),
    ],
)

# Show the input exactly as it will be handed to the orchestrator.
print("Before orchestration:")
declared_total = sum(group.count for group in test_section_duplicates.suggestions)
print(f"  Total suggestions: {declared_total}")
for group in test_section_duplicates.suggestions:
    for item in group.items:
        print(f"  - [{group.type}] {item.text} (score: {item.severity_score})")

In [None]:
# Test orchestration
orchestrator = OrchestratorAgent()

async def test_duplicates():
    result = await orchestrator.validate_and_prioritize([test_section_duplicates])
    return result[0]

result_duplicates = await test_duplicates()

print("\nAfter orchestration:")
print(f"  Total suggestions: {sum(g.count for g in result_duplicates.suggestions)}")
for group in result_duplicates.suggestions:
    for item in group.items:
        print(f"\n  [{group.type}] {item.text}")
        print(f"    Score: {item.severity_score:.2f}")
        print(f"    Explanation: {item.explanation}")
        print(f"    Fix: {item.suggested_fix}")

# Check if duplicates were merged
total_before = sum(g.count for g in test_section_duplicates.suggestions)
total_after = sum(g.count for g in result_duplicates.suggestions)

print(f"\n{'✓' if total_after < total_before else '✗'} Duplicates merged: {total_before} → {total_after}")

## Test 2: Contradiction Detection

Test whether the orchestrator detects and resolves contradictory suggestions:
- Clarity: "This explanation is too verbose, shorten it"
- Rigor: "This explanation lacks detail, expand it"

In [None]:
# Fixture for the contradiction test: two suggestions on line 25 that pull in
# opposite directions (shorten the text vs. add more detail).
shorten_item = SuggestionItem(
    text="The methodology description is overly verbose and should be shortened",
    line=25,
    severity=SeverityLevel.WARNING,
    severity_score=0.55,
    explanation="Long-winded explanations reduce readability and may confuse readers.",
    suggested_fix="Condense the methodology section by removing redundant phrases and focusing on key steps.",
)
expand_item = SuggestionItem(
    text="The methodology description lacks sufficient detail for reproducibility",
    line=25,
    severity=SeverityLevel.ERROR,
    severity_score=0.72,
    explanation="Insufficient methodological detail makes it impossible to replicate the experiment.",
    suggested_fix="Add specific details: dataset size, hyperparameters, training procedure, evaluation metrics.",
)

test_section_contradiction = SectionSuggestions(
    section="Methodology",
    line=20,
    section_type="methodology",
    suggestions=[
        SuggestionGroup(type=SuggestionType.CLARITY, count=1, items=[shorten_item]),
        SuggestionGroup(type=SuggestionType.RIGOR, count=1, items=[expand_item]),
    ],
)

# List the conflicting input before it goes to the orchestrator.
print("Before orchestration (contradiction):")
for group in test_section_contradiction.suggestions:
    for item in group.items:
        print(f"  - [{group.type}] {item.text} (score: {item.severity_score})")

In [None]:
async def test_contradiction():
    result = await orchestrator.validate_and_prioritize([test_section_contradiction])
    return result[0]

result_contradiction = await test_contradiction()

print("\nAfter orchestration (should resolve contradiction):")
for group in result_contradiction.suggestions:
    for item in group.items:
        print(f"\n  [{group.type}] {item.text}")
        print(f"    Score: {item.severity_score:.2f}")
        print(f"    Explanation: {item.explanation}")
        print(f"    Fix: {item.suggested_fix}")

total_before = sum(g.count for g in test_section_contradiction.suggestions)
total_after = sum(g.count for g in result_contradiction.suggestions)

print(f"\nSuggestions: {total_before} → {total_after}")
print("Expected: Orchestrator should either merge into balanced suggestion or keep rigor (higher score)")

## Test 3: Parallel Processing Performance

Test that multiple sections are processed concurrently (the orchestrator is expected to use `asyncio.gather`) by timing a batch of five sections.

In [None]:
import time

def _make_parallel_section(index):
    """Build synthetic section number *index*: one CLARITY group with two items.

    The section starts at line ``index * 10``; its two items sit on the
    following two lines.
    """
    first_line = index * 10
    issue_a = SuggestionItem(
        text=f"Issue A in section {index}",
        line=first_line + 1,
        severity=SeverityLevel.WARNING,
        severity_score=0.60,
        explanation="Test explanation A",
        suggested_fix="Test fix A",
    )
    issue_b = SuggestionItem(
        text=f"Issue B in section {index}",
        line=first_line + 2,
        severity=SeverityLevel.INFO,
        severity_score=0.30,
        explanation="Test explanation B",
        suggested_fix="Test fix B",
    )
    return SectionSuggestions(
        section=f"Section {index}",
        line=first_line,
        section_type="methodology",
        suggestions=[
            SuggestionGroup(
                type=SuggestionType.CLARITY,
                count=2,
                items=[issue_a, issue_b],
            )
        ],
    )


# Five independent sections (numbered 1-5) for the timing run.
test_sections_parallel = [_make_parallel_section(number) for number in range(1, 6)]

print(f"Testing parallel processing with {len(test_sections_parallel)} sections...")

start_time = time.time()
result_parallel = await orchestrator.validate_and_prioritize(test_sections_parallel)
elapsed_time = time.time() - start_time

print(f"\n✓ Processed {len(result_parallel)} sections in {elapsed_time:.2f}s")
print(f"  Average per section: {elapsed_time / len(result_parallel):.2f}s")
print("\nNote: With parallel processing (asyncio.gather), total time should be ~1-2s,")
print("      not 5-10s (which would be sequential processing)")

## Test 4: Quality Control - Low Value Filtering

Test whether the orchestrator filters out very low-value suggestions (severity score below 0.2).

In [None]:
# Fixture for the filtering test: one high-severity finding next to two
# cosmetic remarks with scores of 0.10 and 0.15.
significance_item = SuggestionItem(
    text="Missing statistical significance testing for main results",
    line=55,
    severity=SeverityLevel.ERROR,
    severity_score=0.85,
    explanation="Without p-values, readers cannot assess validity of claimed improvements.",
    suggested_fix="Add t-test or ANOVA results with p-values for all claimed improvements.",
)
header_font_item = SuggestionItem(
    text="Consider using a different font for table headers",
    line=60,
    severity=SeverityLevel.INFO,
    severity_score=0.10,
    explanation="Bold font might improve visual hierarchy.",
    suggested_fix="Make table headers bold.",
)
repeated_word_item = SuggestionItem(
    text="The word 'significant' appears twice in one sentence",
    line=58,
    severity=SeverityLevel.INFO,
    severity_score=0.15,
    explanation="Repetition of words can slightly reduce readability.",
    suggested_fix="Replace one instance with 'substantial' or 'notable'.",
)

test_section_quality = SectionSuggestions(
    section="Results",
    line=50,
    section_type="results",
    suggestions=[
        SuggestionGroup(
            type=SuggestionType.CLARITY,
            count=3,
            items=[significance_item, header_font_item, repeated_word_item],
        )
    ],
)

# List the mixed-quality input before it goes to the orchestrator.
print("Before orchestration (mixed quality):")
for group in test_section_quality.suggestions:
    for item in group.items:
        print(f"  - {item.text} (score: {item.severity_score})")

async def test_quality():
    result = await orchestrator.validate_and_prioritize([test_section_quality])
    return result[0]

result_quality = await test_quality()

print("\nAfter orchestration (should filter low-value):")
for group in result_quality.suggestions:
    for item in group.items:
        print(f"  - {item.text} (score: {item.severity_score:.2f})")

total_before = sum(g.count for g in test_section_quality.suggestions)
total_after = sum(g.count for g in result_quality.suggestions)

print(f"\nSuggestions: {total_before} → {total_after}")
print("Expected: Low-value suggestions (score < 0.2) should be filtered out")

## Test 5: Real-World Example - Climate Paper

Test with realistic suggestions modeled on the Results section of the climate prediction paper.

In [None]:
# Simulate realistic suggestions for climate paper's Results section:
# two CLARITY and two RIGOR findings, three of them on line 68.
test_section_realistic = SectionSuggestions(
    section="Experimental Validation",
    line=53,
    section_type="results",
    suggestions=[
        SuggestionGroup(
            type=SuggestionType.CLARITY,
            count=2,
            items=[
                SuggestionItem(
                    text="The claim 'our method outperforms all baselines' is vague",
                    line=68,
                    severity=SeverityLevel.WARNING,
                    severity_score=0.65,
                    explanation="Without specific metrics, readers cannot assess the magnitude of improvement.",
                    suggested_fix="Specify metrics: 'Our method achieves 15% lower MSE than the best baseline'."
                ),
                SuggestionItem(
                    text="Table 1 is referenced but not included in the document",
                    line=68,
                    severity=SeverityLevel.ERROR,
                    severity_score=0.90,
                    explanation="Missing tables break the flow and make results impossible to verify.",
                    suggested_fix="Add Table 1 with baseline comparisons or remove the reference."
                )
            ]
        ),
        SuggestionGroup(
            type=SuggestionType.RIGOR,
            count=2,
            items=[
                SuggestionItem(
                    text="No statistical significance testing for claimed improvements",
                    line=68,
                    severity=SeverityLevel.ERROR,
                    severity_score=0.82,
                    explanation="Without p-values or confidence intervals, improvements may not be statistically significant.",
                    suggested_fix="Add statistical tests (t-test, ANOVA) with p-values and confidence intervals."
                ),
                SuggestionItem(
                    text="Missing details about baseline implementations",
                    line=59,
                    severity=SeverityLevel.WARNING,
                    severity_score=0.58,
                    explanation="Fair comparison requires identical implementation details for all methods.",
                    suggested_fix="Specify: hyperparameters, training procedures, and hardware used for baselines."
                )
            ]
        )
    ]
)

print("Realistic example - Before orchestration:")
print(f"Total: {sum(g.count for g in test_section_realistic.suggestions)} suggestions\n")
for group in test_section_realistic.suggestions:
    print(f"[{group.type}] ({group.count} suggestions):")
    for item in group.items:
        # Only append "..." when the text was actually cut; the original
        # appended it unconditionally, which made short texts look truncated.
        preview = item.text if len(item.text) <= 60 else f"{item.text[:60]}..."
        print(f"  • Line {item.line}: {preview}")
        print(f"    Score: {item.severity_score:.2f}, Severity: {item.severity}")

async def test_realistic():
    result = await orchestrator.validate_and_prioritize([test_section_realistic])
    return result[0]

result_realistic = await test_realistic()

print("\n" + "="*80)
print("After orchestration:")
print(f"Total: {sum(g.count for g in result_realistic.suggestions)} suggestions\n")
for group in result_realistic.suggestions:
    print(f"[{group.type}] ({group.count} suggestions):")
    for item in group.items:
        print(f"\n  • Line {item.line}: {item.text}")
        print(f"    Score: {item.severity_score:.2f}, Severity: {item.severity}")
        print(f"    Explanation: {item.explanation[:100]}...")
        print(f"    Fix: {item.suggested_fix[:100]}...")

print("\n" + "="*80)
print("Expected behavior:")
print("  • Line 68 duplicates (clarity + rigor about vague claims) should be merged")
print("  • Missing Table 1 (score 0.90) should be prioritized first")
print("  • Statistical testing (score 0.82) should be second")
print("  • Lower-score suggestions should follow")

## Summary

The LLM-based orchestrator should:
1. ✓ Merge semantic duplicates (even with different wording)
2. ✓ Detect and resolve contradictions intelligently
3. ✓ Process sections in parallel (fast total time)
4. ✓ Filter out very low-value suggestions
5. ✓ Prioritize by impact × actionability, not just score