In [1]:
import json
import re
from typing import List, Dict, Tuple, Set
from collections import defaultdict
import math

In [2]:

# ============================================================================
# 1. GOLD STANDARD ANNOTATIONS
# ============================================================================

GOLD_ANNOTATIONS = [
    {
        "id": 1,
        "text": "Sarah bought a new car. She drove it to work. The vehicle performed excellently.",
        "entities": [
            {"entity_id": 1, "mentions": ["Sarah", "She"]},
            {"entity_id": 2, "mentions": ["a new car", "it", "The vehicle"]}
        ]
    },
    {
        "id": 2,
        "text": "John met Mary at the coffee shop. He ordered tea while she got a cappuccino. Both enjoyed their drinks.",
        "entities": [
            {"entity_id": 1, "mentions": ["John", "He"]},
            {"entity_id": 2, "mentions": ["Mary", "she"]},
            {"entity_id": 3, "mentions": ["the coffee shop"]},
            {"entity_id": 4, "mentions": ["tea"]},
            {"entity_id": 5, "mentions": ["a cappuccino"]},
            {"entity_id": 6, "mentions": ["Both", "their drinks"]}
        ]
    },
    {
        "id": 3,
        "text": "Apple announced its new iPhone. The device features an advanced camera. Tim Cook presented it at the conference.",
        "entities": [
            {"entity_id": 1, "mentions": ["Apple"]},
            {"entity_id": 2, "mentions": ["its new iPhone", "The device", "it"]},
            {"entity_id": 3, "mentions": ["Tim Cook"]}
        ]
    },
    {
        "id": 4,
        "text": "The lawyer called the client after he finished the report. It was crucial for the case.",
        "entities": [
            {"entity_id": 1, "mentions": ["The lawyer", "he"]},
            {"entity_id": 2, "mentions": ["the client"]},
            {"entity_id": 3, "mentions": ["the report", "It"]}
        ]
    },
    {
        "id": 5,
        "text": "Emma and her brother visited the museum. They spent hours looking at paintings. The artworks fascinated them.",
        "entities": [
            {"entity_id": 1, "mentions": ["Emma"]},
            {"entity_id": 2, "mentions": ["her brother"]},
            {"entity_id": 3, "mentions": ["the museum"]},
            {"entity_id": 4, "mentions": ["They", "them"]},
            {"entity_id": 5, "mentions": ["paintings", "The artworks"]}
        ]
    },
    {
        "id": 6,
        "text": "The company released a new product. It was designed for mobile users. Customers loved the innovation.",
        "entities": [
            {"entity_id": 1, "mentions": ["The company"]},
            {"entity_id": 2, "mentions": ["a new product", "It"]},
            {"entity_id": 3, "mentions": ["mobile users"]},
            {"entity_id": 4, "mentions": ["Customers"]},
            {"entity_id": 5, "mentions": ["the innovation"]}
        ]
    },
    {
        "id": 7,
        "text": "Peter gave his book to Michael. He read it quickly. The story impressed him.",
        "entities": [
            {"entity_id": 1, "mentions": ["Peter", "He"]},
            {"entity_id": 2, "mentions": ["his book", "it", "The story"]},
            {"entity_id": 3, "mentions": ["Michael", "him"]}
        ]
    },
    {
        "id": 8,
        "text": "The hospital admitted a patient yesterday. She required immediate treatment. The doctors examined her carefully.",
        "entities": [
            {"entity_id": 1, "mentions": ["The hospital"]},
            {"entity_id": 2, "mentions": ["a patient", "She", "her"]},
            {"entity_id": 3, "mentions": ["The doctors"]}
        ]
    },
    {
        "id": 9,
        "text": "Google acquired a startup. Its founders were thrilled. They received stock options and cash.",
        "entities": [
            {"entity_id": 1, "mentions": ["Google"]},
            {"entity_id": 2, "mentions": ["a startup"]},
            {"entity_id": 3, "mentions": ["Its founders", "They"]},
            {"entity_id": 4, "mentions": ["stock options and cash"]}
        ]
    },
    {
        "id": 10,
        "text": "The professor explained the theory. Students listened attentively. It was complex but fascinating.",
        "entities": [
            {"entity_id": 1, "mentions": ["The professor"]},
            {"entity_id": 2, "mentions": ["the theory", "It"]},
            {"entity_id": 3, "mentions": ["Students"]}
        ]
    }
]

In [3]:

class CoreferenceResolver:
    """
    Rule-based coreference resolution system.
    For demonstration, uses heuristics combined with mention extraction.
    """
    
    def __init__(self):
        self.pronouns = {
            'he': 'male', 'him': 'male', 'his': 'male',
            'she': 'female', 'her': 'female', 'hers': 'female',
            'it': 'neuter', 'its': 'neuter',
            'they': 'plural', 'them': 'plural', 'their': 'plural',
            'both': 'plural'
        }
    
    def extract_mentions(self, text: str) -> List[Tuple[str, int, int]]:
        """Extract mention candidates (nouns, pronouns, proper nouns)."""
        mentions = []
        
        # Extract pronouns
        for pronoun in self.pronouns.keys():
            pattern = r'\b' + pronoun + r'\b'
            for match in re.finditer(pattern, text, re.IGNORECASE):
                mentions.append((pronoun.lower(), match.start(), match.end()))
        
        # Extract noun phrases (simplified)
        noun_patterns = [
            r'\b[A-Z][a-z]+\b',  # Proper nouns
            r'\bthe\s+(?:[\w]+\s+)*[\w]+\b',  # Definite descriptions
            r'\ba\s+(?:[\w]+\s+)*[\w]+\b',  # Indefinite descriptions
        ]
        
        for pattern in noun_patterns:
            for match in re.finditer(pattern, text):
                mentions.append((match.group(), match.start(), match.end()))
        
        return mentions
    
    def resolve(self, text: str) -> List[Dict]:
        """Resolve coreferences using simple heuristics."""
        mentions = self.extract_mentions(text)
        chains = []
        processed = set()
        
        for i, (mention, start, end) in enumerate(mentions):
            if i in processed:
                continue
            
            chain = [mention]
            processed.add(i)
            
            # Find matching pronouns or similar mentions
            for j, (other_mention, other_start, other_end) in enumerate(mentions[i+1:], start=i+1):
                if j in processed:
                    continue
                
                # Simple matching heuristic
                if self._should_link(mention, other_mention):
                    chain.append(other_mention)
                    processed.add(j)
            
            if len(chain) > 0:
                chains.append({
                    "entity_id": len(chains) + 1,
                    "mentions": chain
                })
        
        return chains
    
    def _should_link(self, mention1: str, mention2: str) -> bool:
        """Determine if two mentions should be linked."""
        m1_lower = mention1.lower()
        m2_lower = mention2.lower()
        
        # Same mention
        if m1_lower == m2_lower:
            return True
        
        # Pronoun resolution heuristic
        if m2_lower in self.pronouns:
            return False  # Simplified: skip complex pronoun resolution
        
        return False


In [4]:

# ============================================================================
# 3. EVALUATION METRICS: MUC
# ============================================================================

class MUCMetric:
    """
    Muelas Unified Clustering (MUC) Metric
    Link-based evaluation: counts links formed/missing
    """
    
    @staticmethod
    def get_links(chains: List[Dict]) -> Set[Tuple[int, int]]:
        """
        Extract all links from coreference chains.
        A link is a pair of mentions in the same chain.
        """
        links = set()
        for chain in chains:
            mentions = chain.get("mentions", [])
            if len(mentions) < 2:
                continue
            for i in range(len(mentions) - 1):
                link = tuple(sorted([i, i + 1]))
                links.add(link)
        return links
    
    @staticmethod
    def compute(gold_chains: List[Dict], system_chains: List[Dict]) -> Dict:
        """Compute MUC precision, recall, and F1."""
        gold_links = MUCMetric.get_links(gold_chains)
        system_links = MUCMetric.get_links(system_chains)
        
        if len(system_links) == 0:
            precision = 0.0
        else:
            precision = len(gold_links & system_links) / len(system_links)
        
        if len(gold_links) == 0:
            recall = 1.0 if len(system_links) == 0 else 0.0
        else:
            recall = len(gold_links & system_links) / len(gold_links)
        
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        
        return {
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1": round(f1, 4)
        }


In [5]:

# ============================================================================
# 4. EVALUATION METRICS: CEAF
# ============================================================================

class CEAFMetric:
    """
    CEAF-e (Entity-based CEAF)
    Optimal alignment between system and gold entities using mention overlap
    """
    
    @staticmethod
    def mention_overlap(gold_mentions: List[str], system_mentions: List[str]) -> float:
        """Compute similarity as number of overlapping mentions."""
        gold_set = set(m.lower() for m in gold_mentions)
        system_set = set(m.lower() for m in system_mentions)
        overlap = len(gold_set & system_set)
        return overlap
    
    @staticmethod
    def optimal_alignment(gold_chains: List[Dict], system_chains: List[Dict]) -> Tuple[float, List]:
        """
        Find optimal alignment using greedy matching.
        Returns (total_similarity, alignment_list)
        """
        if not gold_chains or not system_chains:
            return 0.0, []
        
        gold_entities = [(i, chain["mentions"]) for i, chain in enumerate(gold_chains)]
        system_entities = [(i, chain["mentions"]) for i, chain in enumerate(system_chains)]
        
        matched_gold = set()
        matched_system = set()
        alignment = []
        total_similarity = 0.0
        
        # Greedy matching: highest overlap first
        similarities = []
        for g_idx, (g_id, g_mentions) in enumerate(gold_entities):
            for s_idx, (s_id, s_mentions) in enumerate(system_entities):
                sim = CEAFMetric.mention_overlap(g_mentions, s_mentions)
                similarities.append((sim, g_idx, s_idx))
        
        similarities.sort(reverse=True)
        
        for sim, g_idx, s_idx in similarities:
            if g_idx not in matched_gold and s_idx not in matched_system and sim > 0:
                matched_gold.add(g_idx)
                matched_system.add(s_idx)
                alignment.append((g_idx, s_idx, sim))
                total_similarity += sim
        
        return total_similarity, alignment
    
    @staticmethod
    def compute(gold_chains: List[Dict], system_chains: List[Dict]) -> Dict:
        """Compute CEAF-e precision, recall, and F1."""
        total_sim, _ = CEAFMetric.optimal_alignment(gold_chains, system_chains)
        
        # Precision: total similarity / sum of system entity sizes
        system_size = sum(len(chain["mentions"]) for chain in system_chains)
        precision = total_sim / system_size if system_size > 0 else 0.0
        
        # Recall: total similarity / sum of gold entity sizes
        gold_size = sum(len(chain["mentions"]) for chain in gold_chains)
        recall = total_sim / gold_size if gold_size > 0 else 0.0
        
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        
        return {
            "precision": round(precision, 4),
            "recall": round(recall, 4),
            "f1": round(f1, 4)
        }

In [6]:

# ============================================================================
# 5. EVALUATION RUNNER
# ============================================================================

def evaluate_system(gold_annotations: List[Dict]) -> Dict:
    """Evaluate the coreference resolver on gold annotations."""
    resolver = CoreferenceResolver()
    
    all_muc_scores = {"precision": [], "recall": [], "f1": []}
    all_ceaf_scores = {"precision": [], "recall": [], "f1": []}
    
    results_per_document = []
    
    for doc in gold_annotations:
        text = doc["text"]
        gold_chains = doc["entities"]
        system_chains = resolver.resolve(text)
        
        muc_result = MUCMetric.compute(gold_chains, system_chains)
        ceaf_result = CEAFMetric.compute(gold_chains, system_chains)
        
        for key in all_muc_scores:
            all_muc_scores[key].append(muc_result[key])
        for key in all_ceaf_scores:
            all_ceaf_scores[key].append(ceaf_result[key])
        
        results_per_document.append({
            "doc_id": doc["id"],
            "text": text,
            "gold": gold_chains,
            "system": system_chains,
            "muc": muc_result,
            "ceaf": ceaf_result
        })
    
    # Compute macro averages
    avg_muc = {k: round(sum(v) / len(v), 4) for k, v in all_muc_scores.items()}
    avg_ceaf = {k: round(sum(v) / len(v), 4) for k, v in all_ceaf_scores.items()}
    
    return {
        "per_document": results_per_document,
        "muc_average": avg_muc,
        "ceaf_average": avg_ceaf
    }

In [7]:

# ============================================================================
# 6. VISUALIZATION
# ============================================================================

def generate_html_visualization(text: str, chains: List[Dict]) -> str:
    """Generate HTML visualization with color-coded entities."""
    colors = [
        "#FF6B6B", "#4ECDC4", "#45B7D1", "#FFA07A", "#98D8C8",
        "#F7DC6F", "#BB8FCE", "#85C1E2", "#F8B88B", "#ABEBC6"
    ]
    
    # Create mention to entity mapping
    mention_to_entity = {}
    for chain in chains:
        entity_id = chain["entity_id"]
        for mention in chain["mentions"]:
            mention_to_entity[mention.lower()] = entity_id
    
    # Tokenize and colorize
    words = text.split()
    colored_html = '<div style="font-size: 16px; line-height: 1.8; font-family: Arial;">'
    
    for word in words:
        word_clean = word.rstrip('.,;:!?')
        punctuation = word[len(word_clean):]
        
        if word_clean.lower() in mention_to_entity:
            entity_id = mention_to_entity[word_clean.lower()]
            color = colors[(entity_id - 1) % len(colors)]
            colored_html += f'<span style="background-color: {color}; padding: 2px 4px; margin: 0 2px;">{word_clean}</span>{punctuation} '
        else:
            colored_html += word + ' '
    
    colored_html += '</div>'
    return colored_html


In [8]:
# ============================================================================
# 7. MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    print("="*80)
    print("COREFERENCE RESOLUTION SYSTEM - EVALUATION RESULTS")
    print("="*80)
    
    # Run evaluation
    results = evaluate_system(GOLD_ANNOTATIONS)
    
    # Print overall metrics
    print("\n" + "="*80)
    print("OVERALL METRICS (Macro-Averaged)")
    print("="*80)
    print(f"\n{'Metric':<10} | {'Precision':<12} | {'Recall':<12} | {'F1':<12}")
    print("-"*50)
    print(f"{'MUC':<10} | {results['muc_average']['precision']:<12} | {results['muc_average']['recall']:<12} | {results['muc_average']['f1']:<12}")
    print(f"{'CEAF':<10} | {results['ceaf_average']['precision']:<12} | {results['ceaf_average']['recall']:<12} | {results['ceaf_average']['f1']:<12}")
    
    # Sample detailed results (first 3 documents)
    print("\n" + "="*80)
    print("SAMPLE DETAILED RESULTS (First 3 Documents)")
    print("="*80)
    
    for result in results["per_document"][:3]:
        print(f"\nDocument {result['doc_id']}: {result['text'][:60]}...")
        print(f"  Gold Entities: {result['gold']}")
        print(f"  System Entities: {result['system']}")
        print(f"  MUC: P={result['muc']['precision']}, R={result['muc']['recall']}, F1={result['muc']['f1']}")
        print(f"  CEAF: P={result['ceaf']['precision']}, R={result['ceaf']['recall']}, F1={result['ceaf']['f1']}")
    
    # Save results to JSON
    with open("coref_evaluation_results.json", "w") as f:
        json.dump(results, f, indent=2)
    
    print("\n" + "="*80)
    print("Results saved to: coref_evaluation_results.json")
    print("="*80)

COREFERENCE RESOLUTION SYSTEM - EVALUATION RESULTS

OVERALL METRICS (Macro-Averaged)

Metric     | Precision    | Recall       | F1          
--------------------------------------------------
MUC        | 0.9          | 0.75         | 0.8         
CEAF       | 0.3823       | 0.4413       | 0.4087      

SAMPLE DETAILED RESULTS (First 3 Documents)

Document 1: Sarah bought a new car. She drove it to work. The vehicle pe...
  Gold Entities: [{'entity_id': 1, 'mentions': ['Sarah', 'She']}, {'entity_id': 2, 'mentions': ['a new car', 'it', 'The vehicle']}]
  System Entities: [{'entity_id': 1, 'mentions': ['she', 'She']}, {'entity_id': 2, 'mentions': ['it']}, {'entity_id': 3, 'mentions': ['Sarah']}, {'entity_id': 4, 'mentions': ['The']}, {'entity_id': 5, 'mentions': ['a new car']}]
  MUC: P=1.0, R=0.5, F1=0.6667
  CEAF: P=0.3333, R=0.4, F1=0.3636

Document 2: John met Mary at the coffee shop. He ordered tea while she g...
  Gold Entities: [{'entity_id': 1, 'mentions': ['John', 'He']}, {'ent

In [9]:
"""
Extended Analysis and Visualization Module
Complements the main coreference resolution system
"""

import json
from typing import List, Dict
import re

# ============================================================================
# COLORED TERMINAL OUTPUT VISUALIZATION
# ============================================================================

class TerminalVisualizer:
    """Generate colored terminal output for coreference chains."""
    
    COLORS = {
        1: '\033[94m',   # Blue
        2: '\033[92m',   # Green
        3: '\033[91m',   # Red
        4: '\033[93m',   # Yellow
        5: '\033[95m',   # Magenta
        6: '\033[96m',   # Cyan
        7: '\033[97m',   # White
        8: '\033[44m',   # Blue background
        9: '\033[42m',   # Green background
        10: '\033[41m',  # Red background
    }
    RESET = '\033[0m'
    
    @staticmethod
    def visualize(text: str, chains: List[Dict]) -> str:
        """Generate colored terminal visualization."""
        # Create mention to entity mapping
        mention_to_entity = {}
        for chain in chains:
            entity_id = chain["entity_id"]
            for mention in chain["mentions"]:
                mention_to_entity[mention.lower()] = entity_id
        
        # Process words
        words = text.split()
        output_lines = []
        current_line = ""
        
        for word in words:
            word_clean = word.rstrip('.,;:!?')
            punctuation = word[len(word_clean):]
            
            # Check if word matches a mention
            matched = False
            for mention_key, entity_id in mention_to_entity.items():
                if word_clean.lower() == mention_key:
                    color = TerminalVisualizer.COLORS.get(entity_id, '\033[94m')
                    current_line += f"{color}[{word_clean}]_{entity_id}{TerminalVisualizer.RESET}{punctuation} "
                    matched = True
                    break
            
            if not matched:
                current_line += word + " "
            
            # Line wrapping for readability
            if len(current_line) > 80:
                output_lines.append(current_line)
                current_line = ""
        
        if current_line:
            output_lines.append(current_line)
        
        return "\n".join(output_lines)

# ============================================================================
# DETAILED ERROR ANALYSIS
# ============================================================================

class ErrorAnalyzer:
    """Detailed analysis of coreference resolution errors."""
    
    @staticmethod
    def classify_error(gold_mention: str, system_chain: List[str], 
                      gold_chain: List[str]) -> str:
        """Classify the type of error made."""
        gold_set = set(m.lower() for m in gold_chain)
        system_set = set(m.lower() for m in system_chain)
        
        if gold_set == system_set:
            return "CORRECT"
        elif gold_set.issubset(system_set):
            return "FALSE_POSITIVE"  # System linked extra mentions
        elif gold_set.issuperset(system_set):
            return "FALSE_NEGATIVE"  # System missed mentions
        else:
            return "PARTIAL_MATCH"
    
    @staticmethod
    def analyze_mention_type(mention: str) -> str:
        """Classify mention type."""
        mention_lower = mention.lower()
        
        if mention_lower in ['he', 'she', 'it', 'they', 'him', 'her', 'them', 'both']:
            return "PRONOUN"
        elif mention[0].isupper() and len(mention.split()) == 1:
            return "PROPER_NOUN"
        elif mention.startswith('the '):
            return "DEFINITE_DESCRIPTION"
        elif mention.startswith('a '):
            return "INDEFINITE_DESCRIPTION"
        else:
            return "OTHER"
    
    @staticmethod
    def analyze_result(gold_chains: List[Dict], system_chains: List[Dict], 
                      text: str) -> Dict:
        """Comprehensive error analysis for a single document."""
        
        analysis = {
            "total_gold_entities": len(gold_chains),
            "total_system_entities": len(system_chains),
            "gold_mention_types": {},
            "error_types": {},
            "false_positives": [],
            "false_negatives": [],
            "partial_matches": []
        }
        
        # Count mention types in gold
        for chain in gold_chains:
            for mention in chain["mentions"]:
                m_type = ErrorAnalyzer.analyze_mention_type(mention)
                analysis["gold_mention_types"][m_type] = \
                    analysis["gold_mention_types"].get(m_type, 0) + 1
        
        # Create gold mention to chain mapping
        gold_mention_to_chain = {}
        for chain in gold_chains:
            for mention in chain["mentions"]:
                gold_mention_to_chain[mention.lower()] = chain["mentions"]
        
        # Analyze system output
        for sys_chain in system_chains:
            sys_mentions_lower = [m.lower() for m in sys_chain["mentions"]]
            
            # Find corresponding gold chain
            gold_chain_match = None
            for gold_mention in sys_mentions_lower:
                if gold_mention in gold_mention_to_chain:
                    gold_chain_match = gold_mention_to_chain[gold_mention]
                    break
            
            if gold_chain_match is None:
                # False positive: entire system chain is spurious
                analysis["false_positives"].append(sys_chain["mentions"])
            else:
                # Partial match or false positive/negative
                error_type = ErrorAnalyzer.classify_error(
                    sys_chain["mentions"][0],
                    sys_chain["mentions"],
                    gold_chain_match
                )
                analysis["error_types"][error_type] = \
                    analysis["error_types"].get(error_type, 0) + 1
                
                if error_type == "FALSE_POSITIVE":
                    analysis["false_positives"].append({
                        "system": sys_chain["mentions"],
                        "gold": gold_chain_match
                    })
                elif error_type == "PARTIAL_MATCH":
                    analysis["partial_matches"].append({
                        "system": sys_chain["mentions"],
                        "gold": gold_chain_match
                    })
        
        # Find false negatives (gold chains not covered)
        covered_gold = set()
        for sys_chain in system_chains:
            for mention in sys_chain["mentions"]:
                if mention.lower() in gold_mention_to_chain:
                    covered_gold.add(tuple(sorted([m.lower() for m in 
                                          gold_mention_to_chain[mention.lower()]])))
        
        for gold_chain in gold_chains:
            gold_key = tuple(sorted([m.lower() for m in gold_chain["mentions"]]))
            if gold_key not in covered_gold:
                analysis["false_negatives"].append(gold_chain["mentions"])
        
        return analysis

# ============================================================================
# AMBIGUITY CASE STUDY
# ============================================================================

AMBIGUITY_EXAMPLES = [
    {
        "case": "Pronoun Ambiguity",
        "text": "The lawyer called the client after he finished the report.",
        "pronouns": ["he"],
        "antecedents": ["The lawyer", "the client"],
        "linguistic_analysis": """
The pronoun 'he' has two syntactically plausible antecedents:
1. "The lawyer" (subject of main clause) - Preferred by Binding Theory
2. "the client" (object of main clause) - Possible but less likely

LINGUISTIC CUES FOR RESOLUTION:
- Syntactic c-command: The lawyer c-commands the client
- Recency: The client is closer, but subject has priority
- Semantic plausibility: Who can "finish a report"? Both are plausible.
- World knowledge: Typically lawyers write reports for clients.

CHALLENGES FOR ML SYSTEMS:
- Requires syntactic tree construction
- Needs semantic understanding of predicate-argument structure
- Depends on pragmatic/common-sense reasoning
""",
        "system_error": "Rule-based system lacks syntactic analysis and semantic reasoning"
    },
    {
        "case": "Nested Entity Ambiguity",
        "text": "Apple's CEO announced their new product.",
        "entities": ["Apple", "Apple's CEO"],
        "pronouns": ["their"],
        "linguistic_analysis": """
The possessive pronoun 'their' is ambiguous:
1. Their = Apple (organization, plural interpretation)
2. Their = Apple's CEO (person, but using plural form)

LINGUISTIC CUES FOR RESOLUTION:
- Entity type mismatch: "their" expects plural, CEO is singular
- Possessive structure: "Apple's CEO" indicates person subsumed under org
- Semantic selection: Products belong to organizations primarily
- Predicate structure: "announced" fits both interpretations

CHALLENGES FOR ML SYSTEMS:
- Requires entity type classification
- Needs handling of possessive attachment
- Understanding of implicit entity relationships
- Generic plural pronoun usage
""",
        "system_error": "System fails to handle possessive constructions and entity types"
    },
    {
        "case": "Bridging Reference",
        "text": "The company released a new phone. The device featured an advanced camera.",
        "bridging_type": "whole-part",
        "linguistic_analysis": """
"The device" is a bridging reference to "a new phone" - they are not identical
mentions but related through semantic knowledge (device part of phone).

LINGUISTIC CUES FOR RESOLUTION:
- Semantic relation: phone ⊃ device (part-of relationship)
- Discourse context: Expected to discuss phone properties
- Lexical chains: "phone" and "device" are related
- Article use: "The device" presupposes familiarity from context

CHALLENGES FOR ML SYSTEMS:
- Requires semantic/knowledge graph understanding
- Bridging references are subtle and context-dependent
- Not taught explicitly in many coreference datasets
- Depends on world knowledge (what parts do phones have?)
""",
        "system_error": "Rule-based approach treats only identical mentions as coreferent"
    }
]

# ============================================================================
# REPORT GENERATION
# ============================================================================

def generate_ambiguity_analysis() -> str:
    """Generate detailed ambiguity analysis for report."""
    output = []
    output.append("\n" + "="*80)
    output.append("DETAILED AMBIGUITY AND ERROR ANALYSIS")
    output.append("="*80)
    
    for example in AMBIGUITY_EXAMPLES:
        output.append(f"\nCASE: {example['case']}")
        output.append("-" * 60)
        output.append(f"Text: {example['text']}")
        output.append(f"\nLinguistic Analysis:\n{example['linguistic_analysis']}")
        output.append(f"System Error: {example['system_error']}")
    
    return "\n".join(output)

def generate_results_table(results: Dict) -> str:
    """Generate formatted results table."""
    output = []
    output.append("\n" + "="*80)
    output.append("EVALUATION RESULTS SUMMARY")
    output.append("="*80)
    
    muc = results['muc_average']
    ceaf = results['ceaf_average']
    
    output.append(f"\n{'Metric':<15} | {'Precision':<12} | {'Recall':<12} | {'F1':<12}")
    output.append("-" * 55)
    output.append(f"{'MUC':<15} | {muc['precision']:<12.4f} | {muc['recall']:<12.4f} | {muc['f1']:<12.4f}")
    output.append(f"{'CEAF-e':<15} | {ceaf['precision']:<12.4f} | {ceaf['recall']:<12.4f} | {ceaf['f1']:<12.4f}")
    
    return "\n".join(output)

def generate_mention_error_statistics(all_results: List[Dict]) -> str:
    """Analyze error patterns across documents."""
    output = []
    
    mention_type_performance = {
        "PRONOUN": {"correct": 0, "total": 0},
        "PROPER_NOUN": {"correct": 0, "total": 0},
        "DEFINITE_DESCRIPTION": {"correct": 0, "total": 0},
        "INDEFINITE_DESCRIPTION": {"correct": 0, "total": 0},
    }
    
    for doc_result in all_results:
        gold = doc_result["gold"]
        system = doc_result["system"]
        
        for gold_chain in gold:
            for mention in gold_chain["mentions"]:
                m_type = ErrorAnalyzer.analyze_mention_type(mention)
                mention_type_performance[m_type]["total"] += 1
                
                # Check if system got it right
                for sys_chain in system:
                    if any(m.lower() == mention.lower() for m in sys_chain["mentions"]):
                        mention_type_performance[m_type]["correct"] += 1
                        break
    
    output.append("\n" + "="*80)
    output.append("MENTION TYPE PERFORMANCE ANALYSIS")
    output.append("="*80)
    output.append(f"\n{'Mention Type':<25} | {'Correct':<10} | {'Total':<10} | {'Accuracy':<10}")
    output.append("-" * 60)
    
    for m_type, stats in mention_type_performance.items():
        acc = stats["correct"] / stats["total"] if stats["total"] > 0 else 0
        output.append(f"{m_type:<25} | {stats['correct']:<10} | {stats['total']:<10} | {acc:<10.2%}")
    
    return "\n".join(output)

# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    print(generate_ambiguity_analysis())
    print("\n\nTo use this module with your results:")
    print("1. Load evaluation results from coref_evaluation_results.json")
    print("2. Call generate_results_table(results) for summary")
    print("3. Call generate_mention_error_statistics(results['per_document']) for error analysis")
    print("4. Use TerminalVisualizer.visualize(text, chains) for color-coded output")
    print("5. Use ErrorAnalyzer.analyze_result() for detailed per-document error analysis")


DETAILED AMBIGUITY AND ERROR ANALYSIS

CASE: Pronoun Ambiguity
------------------------------------------------------------
Text: The lawyer called the client after he finished the report.

Linguistic Analysis:

The pronoun 'he' has two syntactically plausible antecedents:
1. "The lawyer" (subject of main clause) - Preferred by Binding Theory
2. "the client" (object of main clause) - Possible but less likely

LINGUISTIC CUES FOR RESOLUTION:
- Syntactic c-command: The lawyer c-commands the client
- Recency: The client is closer, but subject has priority
- Semantic plausibility: Who can "finish a report"? Both are plausible.
- World knowledge: Typically lawyers write reports for clients.

CHALLENGES FOR ML SYSTEMS:
- Requires syntactic tree construction
- Needs semantic understanding of predicate-argument structure
- Depends on pragmatic/common-sense reasoning

System Error: Rule-based system lacks syntactic analysis and semantic reasoning

CASE: Nested Entity Ambiguity
----------------