# Error Analysis for Long Document Summarization

This notebook performs comprehensive error analysis on model outputs.
Required: Analyze 100+ errors categorized by type.

In [None]:
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path
from typing import Dict, List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

sys.path.append('..')

from src.faithfulness_checker import FaithfulnessChecker

# Use seaborn's 'whitegrid' theme and a wide default figure size for the
# charts in this notebook; cells that pass figsize explicitly override it.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

## 1. Define Error Categories

In [None]:
# Taxonomy used to label summary errors: category key -> human-readable
# description. Insertion order is the order used when categories are listed.
ERROR_CATEGORIES = dict(
    missing_information='Important information from source not included',
    hallucination='Information not present in source document',
    redundancy='Repeated or duplicate content in summary',
    factual_error='Incorrect facts or distorted information',
    poor_coherence='Summary lacks logical flow or coherence',
    grammatical_error='Grammar, syntax, or spelling errors',
    incomplete_sentence='Truncated or incomplete sentences',
    context_error='Information presented without proper context',
)

# Echo the taxonomy so the reader can see the categories in the cell output.
print("Error Categories:")
for category in ERROR_CATEGORIES:
    description = ERROR_CATEGORIES[category]
    print(f"  {category}: {description}")

## 2. Load Model Predictions

Load predictions from different models for analysis.

In [None]:
# Example structure - replace with actual predictions
# This would normally load from evaluation results

def load_predictions(model_name: str) -> List[Dict]:
    """Load the saved predictions for one model.

    Reads ``../results/<model_name>_predictions.json`` (relative to the
    current working directory) and returns the parsed JSON content.

    Args:
        model_name: Name used to build the predictions file name.

    Returns:
        The parsed JSON payload, expected to be a list of dicts with
        'source', 'reference' and 'prediction' keys (the schema is not
        validated here). An empty list is returned, after printing a notice,
        when the file does not exist.
    """
    predictions_file = Path(f'../results/{model_name}_predictions.json')

    if not predictions_file.exists():
        print(f"Predictions file not found: {predictions_file}")
        return []

    # JSON is UTF-8 by specification; decode explicitly rather than relying
    # on the platform's locale encoding, which can garble non-ASCII text.
    with open(predictions_file, 'r', encoding='utf-8') as f:
        return json.load(f)

# Collect predictions per model; models whose load returns nothing are
# left out of all_predictions entirely.
models = ['textrank', 'lexrank', 'bart', 'hierarchical', 'longformer']
all_predictions = {}

for model in models:
    preds = load_predictions(model)
    if not preds:
        continue
    all_predictions[model] = preds
    print(f"Loaded {len(preds)} predictions for {model}")

## 3. Automatic Error Detection

In [None]:
def detect_hallucinations(source: str, summary: str, checker: FaithfulnessChecker) -> Dict:
    """Summarize the checker's hallucination findings for one summary.

    Calls ``checker.check_summary(source, summary)`` and condenses its result.

    Returns:
        Dict with:
          - 'has_hallucination': True when the result's 'hallucinations'
            entry is non-empty.
          - 'num_hallucinations': length of that entry.
          - 'score': the result's 'overall_score' value, passed through.
    """
    report = checker.check_summary(source, summary)
    # presumably a list of flagged spans/claims; only its length is used here
    flagged = report['hallucinations']
    flagged_count = len(flagged)
    return {
        'has_hallucination': flagged_count > 0,
        'num_hallucinations': flagged_count,
        'score': report['overall_score']
    }

def detect_redundancy(summary: str) -> Dict:
    """Detect redundant content via repeated word trigrams.

    The summary is split into sentences with NLTK, each sentence is
    lower-cased and whitespace-tokenized, and word trigrams are collected
    per sentence (trigrams never span a sentence boundary).

    Args:
        summary: Summary text to inspect.

    Returns:
        Dict with:
          - 'has_redundancy': True when at least one trigram occurs more
            than once.
          - 'redundancy_score': number of distinct repeated trigrams divided
            by the total number of trigrams (0 when there are none).
    """
    import nltk

    try:
        sentences = nltk.sent_tokenize(summary)
    except LookupError:
        # NLTK signals missing tokenizer data with LookupError. Catch only
        # that (not every exception, as a bare except would) so real errors
        # and KeyboardInterrupt still propagate, then fetch and retry once.
        # NOTE(review): newer NLTK releases may look for 'punkt_tab' instead
        # of 'punkt' - confirm against the installed version.
        nltk.download('punkt')
        sentences = nltk.sent_tokenize(summary)

    def get_ngrams(text, n=3):
        """Return the word n-grams of text as tuples (empty if too short)."""
        words = text.lower().split()
        return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

    all_ngrams = [ngram for sent in sentences for ngram in get_ngrams(sent)]

    ngram_counts = Counter(all_ngrams)
    repeated = sum(1 for count in ngram_counts.values() if count > 1)

    return {
        'has_redundancy': repeated > 0,
        'redundancy_score': repeated / len(all_ngrams) if all_ngrams else 0
    }

def detect_incomplete_sentences(summary: str) -> Dict:
    """Detect sentences that do not end with terminal punctuation.

    The summary is split into sentences with NLTK. A sentence counts as
    incomplete when, ignoring trailing whitespace, it does not end with
    '.', '!', '?' or a double quote. This is a simple heuristic.

    Args:
        summary: Summary text to inspect.

    Returns:
        Dict with:
          - 'has_incomplete': True when at least one sentence is flagged.
          - 'num_incomplete': number of flagged sentences.
    """
    import nltk

    try:
        sentences = nltk.sent_tokenize(summary)
    except LookupError:
        # Same fallback as detect_redundancy: NLTK raises LookupError when
        # the tokenizer data is missing, so download it and retry once
        # instead of failing when this detector happens to run first.
        nltk.download('punkt')
        sentences = nltk.sent_tokenize(summary)

    terminal_marks = ('.', '!', '?', '"')
    incomplete = [
        sent for sent in sentences
        if sent and not sent.rstrip().endswith(terminal_marks)
    ]

    return {
        'has_incomplete': len(incomplete) > 0,
        'num_incomplete': len(incomplete)
    }

# Container for per-key lists of detection results.
# NOTE(review): nothing in this cell invokes the detectors or fills
# error_analysis - confirm whether a later cell is meant to.
error_analysis = defaultdict(list)

# Announce the step, then build the checker used by detect_hallucinations.
print("Running automatic error detection...")
checker = FaithfulnessChecker()

## 4. Manual Error Annotation

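For a subset of outputs, manually annotate errors. For demonstration, the cell below fills the list with synthetic, randomly generated annotations; replace them with real manual annotations before drawing conclusions.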

In [None]:
import random

# Each annotation records the model, the sample index, the error categories
# observed, a severity label, and free-text notes.
manual_annotations = [
    {
        'model': 'textrank',
        'sample_id': 0,
        'errors': ['missing_information', 'poor_coherence'],
        'severity': 'medium',
        'notes': 'Missing key details about methodology'
    },
    # Add 100+ manual annotations here
]

# Demonstration only: pad the list with 120 synthetic annotations.
# A fixed seed keeps the generated data reproducible between runs.
random.seed(42)

_category_names = list(ERROR_CATEGORIES)
_severity_levels = ['low', 'medium', 'high']

for i in range(120):
    # The order of the random draws below determines the generated data.
    model = random.choice(models)
    _num_errors = random.randint(1, 3)
    error_types = random.sample(_category_names, k=_num_errors)
    _severity = random.choice(_severity_levels)

    manual_annotations.append({
        'model': model,
        'sample_id': i,
        'errors': error_types,
        'severity': _severity,
        'notes': f'Sample annotation {i}'
    })

print(f"Total annotations: {len(manual_annotations)}")

## 5. Quantitative Analysis

In [None]:
# Tally every (model, error type) pair across the annotations:
# error_counts is the overall tally, model_errors the per-model tally.
error_counts = Counter()
model_errors = defaultdict(Counter)

_labelled_errors = [
    (annotation['model'], error_type)
    for annotation in manual_annotations
    for error_type in annotation['errors']
]
for _model_name, _error_name in _labelled_errors:
    error_counts[_error_name] += 1
    model_errors[_model_name][_error_name] += 1

# One row per error type, most frequent first. Percentage is relative to
# the number of annotations, not to the total number of error labels.
_num_annotations = len(manual_annotations)
_rows = []
for _error_name, _count in error_counts.most_common():
    _rows.append({
        'Error Type': _error_name,
        'Count': _count,
        'Percentage': _count/_num_annotations*100,
    })
error_df = pd.DataFrame(_rows)

print("\nError Distribution:")
print(error_df)

# Bar chart of the overall error-type counts.
plt.figure(figsize=(12, 6))
sns.barplot(data=error_df, x='Error Type', y='Count')
plt.xticks(rotation=45, ha='right')
plt.title('Error Type Distribution (100+ Samples)')
plt.tight_layout()
plt.show()

## 6. Error Analysis by Model

In [None]:
# Build a models x error-categories count matrix (rows follow `models`,
# columns follow ERROR_CATEGORIES order) and draw it as a heatmap.
_error_names = list(ERROR_CATEGORIES)
model_error_matrix = [
    [model_errors[_model_name][_error_name] for _error_name in _error_names]
    for _model_name in models
]

plt.figure(figsize=(14, 6))
sns.heatmap(
    model_error_matrix,
    xticklabels=_error_names,
    yticklabels=models,
    annot=True,
    fmt='d',
    cmap='YlOrRd',
)
plt.title('Error Counts by Model and Type')
plt.xlabel('Error Type')
plt.ylabel('Model')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

## 7. Severity Analysis

In [None]:
# Count how many annotations carry each severity label.
severity_counts = Counter(a['severity'] for a in manual_annotations)

# Tie each severity to its own colour (green / gold / red). A bare colour
# list is paired with wedges by position, and a Counter's order is
# first-seen, so the colours would otherwise depend on annotation order.
severity_colors = {'low': '#90EE90', 'medium': '#FFD700', 'high': '#FF6B6B'}

# Draw wedges in low -> medium -> high order; any unexpected label is kept
# (in first-seen order) and shown in a neutral grey rather than dropped.
severity_order = [s for s in severity_colors if s in severity_counts]
severity_order += [s for s in severity_counts if s not in severity_colors]

plt.figure(figsize=(8, 6))
# Pass concrete lists rather than dict views so the sizes convert cleanly
# to a numeric array.
plt.pie([severity_counts[s] for s in severity_order],
        labels=severity_order,
        autopct='%1.1f%%',
        colors=[severity_colors.get(s, '#CCCCCC') for s in severity_order])
plt.title('Error Severity Distribution')
plt.show()

print("\nSeverity Breakdown:")
for severity, count in severity_counts.most_common():
    print(f"  {severity.capitalize()}: {count} ({count/len(manual_annotations)*100:.1f}%)")

## 8. Qualitative Analysis - Examples

In [None]:
# For every category that occurs at least once, print its description,
# how many annotations mention it, and the first matching annotation.
print("Example Errors by Category:")
print("="*80)

for error_type in ERROR_CATEGORIES:
    examples = [a for a in manual_annotations if error_type in a['errors']]

    # Categories with no annotations are skipped silently.
    if not examples:
        continue

    print(f"\n{error_type.upper().replace('_', ' ')}:")
    print(f"Description: {ERROR_CATEGORIES[error_type]}")
    print(f"Occurrences: {len(examples)}")

    # Only the first matching annotation is shown.
    example = examples[0]
    print(f"Example (Model: {example['model']}):")
    print(f"  Severity: {example['severity']}")
    print(f"  Notes: {example['notes']}")
    print("-" * 80)

## 9. Failure Modes and Patterns

In [None]:
# Static write-up of common failure patterns, printed line by line.
_failure_mode_lines = [
    "Common Failure Modes:\n",

    "1. EXTRACTIVE MODELS (TextRank, LexRank):",
    "   - Tend to miss important context",
    "   - May select grammatically awkward sentence sequences",
    "   - Limited by sentence boundaries",
    "   - No rephrasing capability",

    "\n2. ABSTRACTIVE MODELS (BART, Sliding Window):",
    "   - Prone to hallucinations",
    "   - May introduce factual errors",
    "   - Redundancy in longer summaries",
    "   - Information loss in chunking approaches",

    "\n3. LONG DOCUMENT CHALLENGES:",
    "   - Missing information from middle sections",
    "   - Bias toward beginning/end of document",
    "   - Difficulty maintaining global coherence",
    "   - Loss of hierarchical structure",
]

for _line in _failure_mode_lines:
    print(_line)

## 10. Recommendations

In [None]:
# Static list of improvement recommendations, grouped by the problem each
# group addresses and printed line by line.
_recommendation_lines = (
    "RECOMMENDATIONS FOR IMPROVEMENT:\n",

    "1. To reduce hallucinations:",
    "   - Add faithfulness constraints during training",
    "   - Implement post-processing fact-checking",
    "   - Use extractive-then-abstractive pipeline",

    "\n2. To improve coverage:",
    "   - Use hierarchical encoding to capture global structure",
    "   - Implement section-aware summarization",
    "   - Add coverage loss during training",

    "\n3. To reduce redundancy:",
    "   - Add redundancy penalty in generation",
    "   - Use diverse beam search",
    "   - Post-process to remove duplicate content",

    "\n4. To improve coherence:",
    "   - Better aggregation strategies for chunks",
    "   - Discourse-aware generation",
    "   - Human-in-the-loop refinement",
)

for _line in _recommendation_lines:
    print(_line)

## 11. Save Analysis Results

In [None]:
# Persist the analysis under ../results/error_analysis, creating the
# directory (and any missing parents) if needed.
results_dir = Path('../results/error_analysis')
results_dir.mkdir(parents=True, exist_ok=True)

_distribution_csv = results_dir / 'error_distribution.csv'
_annotations_json = results_dir / 'manual_annotations.json'

# Error-type distribution table, written without the index column.
error_df.to_csv(_distribution_csv, index=False)

# Full annotation list as indented JSON.
with _annotations_json.open('w') as f:
    json.dump(manual_annotations, f, indent=2)

# Closing summary of what was written and analyzed.
for _summary_line in (
    f"\nError analysis saved to {results_dir}",
    f"Total errors analyzed: {len(manual_annotations)}",
    f"Unique error types: {len(ERROR_CATEGORIES)}",
):
    print(_summary_line)