# Paragraph-Level Data Processing for Thresh Annotation Interface

This notebook processes legal case checklist data at the paragraph level instead of the checklist item level.
Each instance represents one paragraph and contains every evidence-value pair whose earliest matched evidence span falls in that paragraph.

In [1]:
# Import necessary libraries
import json
import re
import os
import sys
from typing import List, Dict, Tuple, Optional, Any
from collections import defaultdict

# Show which interpreter the kernel is running so the environment is visible in the output.
print(f"Python: {sys.executable}")
# All relative paths used later in this notebook are resolved against this directory.
# NOTE(review): hard-coded absolute path; this raises FileNotFoundError on any machine
# where the directory does not exist — consider moving it to a configurable constant.
os.chdir('/srv/nlprx-lab/share6/douy/summarization-rl/annotation_interface/thresh_mod/')

Python: /coc/pskynet6/douy/legal-envs/bin/python3


## Utility Functions from Original Notebook

In [2]:
# Utility functions for JSON extraction and processing
def remove_json_comments(text):
    """Strip ``//`` single-line comments from JSON-like text.

    A ``//`` that falls inside a double-quoted string on the same line (for
    example inside a URL) is not treated as a comment. A line that had a
    comment removed is also stripped of trailing whitespace; every other line
    is returned untouched.
    """
    quoted = re.compile(r'"(?:[^"\\]|\\.)*"')

    def _strip_line(raw):
        # Character ranges occupied by string literals on this line.
        literal_spans = [match.span() for match in quoted.finditer(raw)]

        cut = raw.find('//')
        while cut != -1:
            if not any(lo <= cut < hi for lo, hi in literal_spans):
                # First slash pair outside any string starts the comment.
                return raw[:cut].rstrip()
            # This slash pair sits inside a string; try the next candidate.
            cut = raw.find('//', cut + 1)
        return raw

    return '\n'.join(_strip_line(raw) for raw in text.split('\n'))

def extract_json(text):
    """Parse the outermost ``{...}`` object found in ``text``.

    ``//`` comments are removed first. Returns the parsed object, or ``None``
    when no brace pair exists or the enclosed text is not valid JSON (in the
    latter case the decode error is printed).
    """
    stripped = remove_json_comments(text)

    first_brace = stripped.find('{')
    last_brace = stripped.rfind('}')

    # Both braces must be present and appear in the right order.
    if -1 in (first_brace, last_brace) or first_brace > last_brace:
        return None

    candidate = stripped[first_brace:last_brace + 1]
    try:
        return json.loads(candidate)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON format - {e}")
    return None

def extract_checklist_item_evidence(response):
    """Parse a model response into its checklist-extraction dictionary.

    Args:
        response: Raw model output expected to contain one JSON object.

    Returns:
        The parsed object, or an empty dict when the response could not be
        parsed (an error message is printed in that case).
    """
    parsed = extract_json(response)
    # Compare against None rather than truthiness: a well-formed but empty
    # object ("{}") is a valid answer and must not be reported as invalid.
    if parsed is None:
        print(f"Error: Invalid JSON format in {response}")
        return {}
    return parsed

def extract_summary(text: str) -> str:
    """Extract summary text that may be wrapped in a markdown code fence.

    Args:
        text: Raw summary, possibly of the form ```` ```lang\\n...\\n``` ````.

    Returns:
        The content of the first fenced block when the text contains at least
        one complete fence pair, otherwise the whole text; surrounding
        whitespace is stripped in both cases. Falsy input yields ``""``.
    """
    if not text:
        return ""

    if '```' in text:
        parts = text.split('```')
        # Three or more parts means at least one opening and one closing fence.
        if len(parts) >= 3:
            fenced = parts[1]
            # An opening fence may carry a language tag ("```markdown"); it sits
            # on the fence line itself and is not part of the summary. Only a
            # single bare token followed by a newline is treated as such a tag,
            # so inline fences like ```some text``` are left untouched.
            info, newline, body = fenced.partition('\n')
            if newline and re.fullmatch(r'[A-Za-z0-9_+.-]+', info.strip()):
                fenced = body
            return fenced.strip()

    return text.strip()

## Paragraph Detection and Processing Functions

In [3]:
def split_into_paragraphs(text: str) -> List[Dict[str, Any]]:
    """
    Break text into paragraphs, recording where each one sits in the original.

    The separator is a blank line ('\\n\\n') when the text contains one, else a
    single newline, else the whole text is one paragraph. Whitespace-only
    pieces are skipped and do not consume a paragraph number.

    Returns:
        List of dicts with keys:
        - 'text': paragraph text
        - 'start': start character index in original text
        - 'end': end character index in original text (exclusive)
        - 'paragraph_num': 0-based paragraph number
    """
    # Pick the coarsest separator that actually occurs in the text.
    if '\n\n' in text:
        separator = '\n\n'
    elif '\n' in text:
        separator = '\n'
    else:
        separator = None

    if separator is None:
        if not text.strip():
            return []
        return [{
            'text': text,
            'start': 0,
            'end': len(text),
            'paragraph_num': 0
        }]

    found: List[Dict[str, Any]] = []
    offset = 0
    for chunk in text.split(separator):
        chunk_end = offset + len(chunk)
        if chunk.strip():
            found.append({
                'text': chunk,
                'start': offset,
                'end': chunk_end,
                'paragraph_num': len(found)
            })
        # The next chunk begins right after the separator that followed this one.
        offset = chunk_end + len(separator)

    return found

def find_paragraph_for_indices(indices: List[Tuple[int, int]], paragraphs: List[Dict[str, Any]]) -> int:
    """
    Pick the paragraph that the earliest evidence span belongs to.

    Paragraphs are scanned in list order. A paragraph is chosen when the
    earliest start position lies inside it, or when a span beginning at that
    earliest position overlaps it (the span started in the gap before it).

    Args:
        indices: List of (start, end) character positions
        paragraphs: List of paragraph dictionaries with start/end positions

    Returns:
        -1 when there are no indices; otherwise the matching paragraph number,
        falling back to 0 when no paragraph matches.
    """
    if not indices:
        return -1

    first_start = min(begin for begin, _ in indices)
    # Furthest end among the spans that begin at the earliest position.
    reach = max(stop for begin, stop in indices if begin == first_start)

    for candidate in paragraphs:
        lo, hi = candidate['start'], candidate['end']
        if first_start >= hi:
            # Paragraph ends before the evidence begins.
            continue
        if lo <= first_start or reach > lo:
            return candidate['paragraph_num']

    # Nothing matched; default to the first paragraph.
    return 0

def detect_multi_paragraph_evidence(indices: List[Tuple[int, int]], paragraphs: List[Dict[str, Any]]) -> List[int]:
    """
    List every paragraph that at least one evidence span overlaps.

    Returns:
        Sorted paragraph numbers, without duplicates
    """
    hits = {
        para['paragraph_num']
        for span_start, span_end in indices
        for para in paragraphs
        # Half-open interval overlap test.
        if span_start < para['end'] and span_end > para['start']
    }
    return sorted(hits)

## Evidence Finding Functions

In [4]:
def find_evidence_indices(evidence_text: str, summary: str) -> List[Tuple[int, int]]:
    """
    Find all occurrences of evidence text in summary and return character indices.

    Evidence containing '...' is split on the ellipsis and every non-empty,
    stripped fragment is searched for independently. Overlapping occurrences
    are all reported.

    Args:
        evidence_text: Evidence snippet quoted from the summary.
        summary: Text to search in.

    Returns:
        List of (start, end) tuples where end is exclusive, grouped by
        fragment and ordered by position within each fragment. Blank or
        non-string evidence yields an empty list.
    """
    # Malformed model output (null / numeric evidence) simply has no matches.
    if not isinstance(evidence_text, str) or not isinstance(summary, str):
        return []

    if '...' in evidence_text:
        needles = [part.strip() for part in evidence_text.split('...') if part.strip()]
    elif evidence_text.strip():
        needles = [evidence_text]
    else:
        # str.find("") succeeds at every offset, which would produce one
        # zero-length span per character; blank evidence matches nothing.
        needles = []

    indices = []
    for needle in needles:
        cursor = summary.find(needle)
        while cursor != -1:
            indices.append((cursor, cursor + len(needle)))
            # Advance by one (not len(needle)) so overlapping hits are kept.
            cursor = summary.find(needle, cursor + 1)

    return indices

def merge_overlapping_indices(indices: List[Tuple[int, int]]) -> List[Tuple[int, int]]:
    """Collapse spans that overlap or touch into single spans, sorted by start."""
    if not indices:
        return []

    ordered = sorted(indices)
    result = ordered[:1]

    for cur_start, cur_end in ordered[1:]:
        prev_start, prev_end = result[-1]
        if cur_start > prev_end:
            # Strictly after the previous span: start a new one.
            result.append((cur_start, cur_end))
            continue
        # Overlapping or adjacent: extend the previous span if needed.
        result[-1] = (prev_start, max(prev_end, cur_end))

    return result

## Main Processing Function for Paragraph-Level Organization

In [5]:
def create_checklist_category_mapping():
    """
    Build the lookup from checklist item names to the category names used in the YML file.

    Each category is listed once together with the item-name spellings that
    map to it; the returned dict has one entry per spelling.
    """
    alias_groups = [
        # Basic Case Information
        ("filing_date", ("Filing Date", "Filing_Date")),
        ("parties", ("Who are the Parties", "Parties")),
        ("class_action", ("Class Action or Individual Plaintiffs", "Class_Action")),
        ("type_of_counsel", ("Type of Counsel", "Type_of_Counsel")),

        # Legal Basis
        ("cause_of_action", ("Cause of Action", "Cause_of_Action")),
        ("statutory_basis", (
            "Statutory or Constitutional Basis for the Case",
            "Statutory_Constitutional_Basis",
        )),
        ("remedy_sought", ("Remedy Sought", "Remedy_Sought")),

        # Judge Information
        ("judge_name", (
            "First and Last name of Judge",
            "Judge_Name",
            "First and Last Name of Judge",
        )),

        # Case Relations
        ("consolidated_cases", ("Consolidated Cases Noted", "Consolidated_Cases")),
        ("related_cases", (
            "Related Cases Listed by Their Case Code Number",
            "Related_Cases",
        )),

        # Filings and Rulings
        ("important_filings", ("Note Important Filings", "Important_Filings")),
        ("court_rulings", ("Court Rulings", "Court_Rulings")),
        ("reported_opinions", (
            "All Reported Opinions Cited with Shortened Bluebook Citation",
            "Reported_Opinions",
            "All Reported Opinions",
        )),

        # Trials and Appeals
        ("trials", ("Trials",)),
        ("appeals", ("Appeal", "Appeals")),

        # Decrees
        ("decree_terms", ("Significant Terms of Decrees", "Decree_Terms")),
        ("decree_dates", ("Dates of All Decrees", "Decree_Dates", "Dates_of_All_Decrees")),
        ("decree_duration", ("How Long Decrees will Last", "Decree_Duration")),

        # Settlement
        ("settlement_terms", ("Significant Terms of Settlement", "Settlement_Terms")),
        ("settlement_date", ("Date of Settlement", "Settlement_Date")),
        ("settlement_duration", ("How Long Settlement will Last", "Settlement_Duration")),
        ("court_enforced", (
            "Whether the Settlement is Court-enforced or Not",
            "Court_Enforced",
            "Whether Settlement is Court-enforced",
        )),
        ("settlement_disputes", ("Disputes Over Settlement Enforcement", "Settlement_Disputes")),

        # Monitor Information
        ("monitor_name", ("Name of the Monitor", "Monitor_Name")),
        ("monitor_reports", ("Monitor Reports", "Monitor_Reports", "Monitor's Reports")),

        # Case Facts
        ("factual_basis", ("Factual Basis of Case", "Factual_Basis")),

        # Additional variations: no dedicated category, so reuse decree dates
        ("decree_dates", ("Final_Judgment_Date",)),
    ]

    return {
        alias: category
        for category, aliases in alias_groups
        for alias in aliases
    }

def process_checklist_to_paragraph_thresh(
    case_id: str,
    summary: str,
    checklist_data: Dict[str, Any],
    model_name: str = "extracted_checklist"
) -> List[Dict]:
    """
    Process a single case with all its checklist extractions into paragraph-based thresh format.

    Every extracted evidence-value pair becomes one edit. The edit is attached
    to the paragraph chosen by find_paragraph_for_indices for its merged
    evidence spans. Pairs whose evidence cannot be located in the summary are
    dropped. Malformed pieces of model output (non-text answer, non-list
    'extracted', non-dict item, non-string evidence) are skipped instead of
    aborting the whole case.

    Args:
        case_id: Unique identifier for the case
        summary: The legal case summary text
        checklist_data: Dictionary mapping checklist item names to extraction results
            (either the raw model output string or a dict with an "answer" key)
        model_name: Name of the model used for extraction

    Returns:
        List of thresh format dictionaries, one per paragraph (paragraphs
        without any edit are still emitted, with an empty "edits" list).
        Character indices in "output_idx" refer to the full summary.
    """
    # Get the category mapping
    category_mapping = create_checklist_category_mapping()

    # Split summary into paragraphs
    paragraphs = split_into_paragraphs(summary)

    if not paragraphs:
        print(f"Warning: No paragraphs found in summary for case {case_id}")
        return []

    # Initialize storage for edits by paragraph
    paragraph_edits = defaultdict(list)

    # Process each checklist item
    for checklist_item_name, extraction in checklist_data.items():
        # Handle different formats of extraction
        if isinstance(extraction, str):
            model_output = extraction
        elif isinstance(extraction, dict) and "answer" in extraction:
            model_output = extraction["answer"]
        else:
            print(f"Warning: Unexpected format for {case_id}/{checklist_item_name}")
            continue

        # A null or non-text "answer" cannot be parsed as JSON text.
        if not isinstance(model_output, str):
            print(f"Warning: Unexpected format for {case_id}/{checklist_item_name}")
            continue

        # Parse the model output
        parsed_output = extract_checklist_item_evidence(model_output)
        extracted_items = parsed_output.get("extracted") or []
        if isinstance(extracted_items, dict):
            # A single item given without the surrounding list.
            extracted_items = [extracted_items]
        elif not isinstance(extracted_items, list):
            print(f"Warning: Unexpected format for {case_id}/{checklist_item_name}")
            continue

        # Process each extracted item
        for item in extracted_items:
            if not isinstance(item, dict):
                print(f"Warning: Unexpected format for {case_id}/{checklist_item_name}")
                continue

            value = item.get("value", "")
            evidence_list = item.get("evidence") or []

            # A bare string must be treated as ONE snippet; iterating it would
            # search the summary for every single character of the evidence.
            if isinstance(evidence_list, str):
                evidence_list = [evidence_list]
            elif not isinstance(evidence_list, (list, tuple)):
                evidence_list = []

            # Convert value to string if it's not
            if not isinstance(value, str):
                value = repr(value)

            # Find indices for all evidence snippets
            all_indices = []
            for evidence in evidence_list:
                # Blank or non-text snippets cannot be located in the summary.
                if not isinstance(evidence, str) or not evidence.strip():
                    continue
                indices = find_evidence_indices(evidence, summary)
                all_indices.extend(indices)

            # Skip if no evidence found
            if not all_indices:
                continue

            # Merge overlapping indices
            merged_indices = merge_overlapping_indices(all_indices)

            # Find which paragraph this evidence-value pair belongs to
            # Based on the FIRST (earliest) evidence position
            paragraph_num = find_paragraph_for_indices(merged_indices, paragraphs)

            if paragraph_num == -1:
                print(f"Warning: Could not find paragraph for evidence in {case_id}/{checklist_item_name}")
                continue

            # Detect if evidence spans multiple paragraphs (for metadata)
            spanned_paragraphs = detect_multi_paragraph_evidence(merged_indices, paragraphs)

            # Map checklist item name to category name
            category = category_mapping.get(checklist_item_name, "checklist_extraction")
            if category == "checklist_extraction":
                # If no specific mapping found, try to create one from the name
                # Convert to snake_case: "Filing Date" -> "filing_date"
                category_candidate = checklist_item_name.replace(" ", "_").replace("-", "_").lower()
                # Remove special characters
                category_candidate = ''.join(c if c.isalnum() or c == '_' else '' for c in category_candidate)
                # Use the candidate if it seems reasonable
                if category_candidate:
                    category = category_candidate
                    print(f"Note: Using generated category '{category}' for checklist item '{checklist_item_name}'")

            # Create edit for this evidence-value pair
            edit = {
                "category": category,
                "output_idx": [[start, end] for start, end in merged_indices],
                "annotation": {
                    "explanation": value,
                    "checklist_item": checklist_item_name,
                    # Only recorded when more than one paragraph is touched.
                    "spans_paragraphs": spanned_paragraphs if len(spanned_paragraphs) > 1 else None
                }
            }

            # Add to the appropriate paragraph
            paragraph_edits[paragraph_num].append(edit)

    # Create thresh entries for each paragraph
    thresh_entries = []

    for para in paragraphs:
        para_num = para['paragraph_num']
        edits = paragraph_edits.get(para_num, [])

        # Assign IDs to edits (ids restart at 0 within each paragraph)
        for i, edit in enumerate(edits):
            edit['id'] = i

        # Create thresh entry for this paragraph
        thresh_entry = {
            "id": f"{case_id}_p{para_num}",
            "metadata": {
                "case_id": case_id,
                "paragraph_num": para_num,
                "total_paragraphs": len(paragraphs),
                "model": model_name,
                "paragraph_text_preview": para['text'][:100] + "..." if len(para['text']) > 100 else para['text']
            },
            "source": "",  # No source text in our case
            "target": summary,  # Full summary for context
            "target_paragraph": para['text'],  # The specific paragraph
            "target_paragraph_indices": [para['start'], para['end']],  # Character indices of paragraph
            "edits": edits
        }

        thresh_entries.append(thresh_entry)

    return thresh_entries

## Batch Processing Functions

In [6]:
def process_paragraph_batch_separate_files(
    checklist_data_path: str,
    summary_data_path: str,
    output_folder: str,
    model_name: str = "extracted_checklist"
):
    """
    Process a batch of cases with checklist extractions at paragraph level.
    Each case file contains multiple instances, one for each paragraph.

    Two summary file layouts are accepted: a list of case dicts carrying
    "case_id" and "summary/long", or a dict whose "results" maps case ids to
    a summary string (or a dict with an "answer" field). Case ids are compared
    as strings, because the ids in the checklist file are JSON object keys.

    Args:
        checklist_data_path: Path to JSON file with model extractions
        summary_data_path: Path to JSON file with case summaries
        output_folder: Folder path to save individual case files
        model_name: Name of the model used for extraction

    Returns:
        Number of cases written to output_folder (0 when an input file has an
        unexpected layout).
    """
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # JSON is read and written as UTF-8 explicitly so the result does not
    # depend on the platform's default text encoding.
    with open(checklist_data_path, 'r', encoding='utf-8') as f:
        checklist_data = json.load(f)

    with open(summary_data_path, 'r', encoding='utf-8') as f:
        summary_data = json.load(f)

    # Process summaries based on data format
    summary_dict = {}

    # Check if data is a list (reference format) or dict (model generated format)
    if isinstance(summary_data, list):
        # Reference data format: list of cases with "summary/long" field
        for case in summary_data:
            case_id = case.get("case_id")
            summary = case.get("summary/long")
            if case_id not in (None, "") and summary:
                # Normalise to str: an integer id here would otherwise never
                # match the string keys of the checklist results.
                summary_dict[str(case_id)] = summary
    elif isinstance(summary_data, dict) and "results" in summary_data:
        # Model generated format: dict with "results" containing case_id -> summary mapping
        for case_id, summary in summary_data["results"].items():
            if isinstance(summary, dict):
                # Summary might be wrapped in dict with "answer" field
                extracted = extract_summary(summary.get("answer", ""))
            else:
                # Direct summary string
                extracted = extract_summary(summary)

            if extracted:
                summary_dict[str(case_id)] = extracted
    else:
        print(f"Warning: Unexpected summary data format")
        return 0

    if not isinstance(checklist_data, dict):
        print(f"Warning: Unexpected checklist data format")
        return 0

    # Process each case
    processed_count = 0
    total_paragraphs = 0
    results = checklist_data.get("results", {})

    for case_id, checklist in results.items():
        # Get the summary for this case
        summary = summary_dict.get(str(case_id))
        if not summary:
            print(f"Warning: No summary found for case {case_id}")
            continue

        # Process into paragraph-based instances
        paragraph_instances = process_checklist_to_paragraph_thresh(
            case_id=case_id,
            summary=summary,
            checklist_data=checklist,
            model_name=model_name
        )

        # Save all instances for this case to a single file
        if paragraph_instances:
            output_path = os.path.join(output_folder, f"{case_id}.json")
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(paragraph_instances, f, indent=2)

            processed_count += 1
            total_paragraphs += len(paragraph_instances)
            print(f"Processed {case_id}: {len(paragraph_instances)} paragraphs")

    print(f"\nTotal: Processed {processed_count} cases into {total_paragraphs} paragraph instances")
    print(f"Files saved to {output_folder}/")

    return processed_count

## Example Usage

In [7]:
# Example: Process a single case to demonstrate paragraph-level organization
example_case_id = "example_001"
# Three paragraphs separated by blank lines; the lines inside each paragraph
# are joined by single newlines, so the splitter yields exactly three instances.
example_summary = """This case involves a dispute over property rights between John Smith and Jane Doe.
The parties entered into a purchase agreement on January 10, 2023.

The court issued its first decree on March 15, 2023, ordering preliminary injunction.
The injunction prevented the sale of the property to third parties.
A second decree was issued on April 20, 2023, finalizing the property division.

The final judgment was entered on May 1, 2023.
Both parties were ordered to pay equal shares of court costs.
The case was officially closed on May 15, 2023."""

# Example checklist data with multiple items
# Each value is a raw model response (a JSON string) with an "extracted" list
# of {"evidence": [...], "value": ...} entries; every evidence string below is
# a verbatim sentence from example_summary.
example_checklist = {
    "Dates_of_All_Decrees": """{
        "reasoning": "The summary mentions multiple decree dates.",
        "extracted": [
            {
                "evidence": ["The court issued its first decree on March 15, 2023, ordering preliminary injunction."],
                "value": "March 15, 2023: preliminary injunction"
            },
            {
                "evidence": ["A second decree was issued on April 20, 2023, finalizing the property division."],
                "value": "April 20, 2023: property division finalization"
            }
        ]
    }""",
    "Filing_Date": """{
        "reasoning": "Purchase agreement date mentioned.",
        "extracted": [
            {
                "evidence": ["The parties entered into a purchase agreement on January 10, 2023."],
                "value": "January 10, 2023"
            }
        ]
    }""",
    "Final_Judgment_Date": """{
        "reasoning": "Final judgment and case closure dates found.",
        "extracted": [
            {
                "evidence": ["The final judgment was entered on May 1, 2023."],
                "value": "May 1, 2023"
            },
            {
                "evidence": ["The case was officially closed on May 15, 2023."],
                "value": "May 15, 2023: case closed"
            }
        ]
    }"""
}

# Process the example into paragraph-based format
# The result is a list with one thresh entry per paragraph of example_summary.
paragraph_entries = process_checklist_to_paragraph_thresh(
    case_id=example_case_id,
    summary=example_summary,
    checklist_data=example_checklist,
    model_name="example_model"
)

# Display results: one short report per paragraph instance
print(f"Generated {len(paragraph_entries)} paragraph-based instances:\n")
for i, entry in enumerate(paragraph_entries):
    print(f"Paragraph {i}: {entry['metadata']['paragraph_text_preview']}")
    print(f"  Number of edits: {len(entry['edits'])}")
    if entry['edits']:
        print(f"  Checklist items in this paragraph:")
        # Sorted so the listing is identical on every run; iterating a bare
        # set of strings depends on the interpreter's per-process hash seed.
        items = sorted({edit['annotation']['checklist_item'] for edit in entry['edits']})
        for item in items:
            print(f"    - {item}")
    print()

# Show detailed view of one paragraph instance
if paragraph_entries:
    print("\nDetailed view of first paragraph instance:")
    print(json.dumps(paragraph_entries[0], indent=2))

Generated 3 paragraph-based instances:

Paragraph 0: This case involves a dispute over property rights between John Smith and Jane Doe.
The parties enter...
  Number of edits: 1
  Checklist items in this paragraph:
    - Filing_Date

Paragraph 1: The court issued its first decree on March 15, 2023, ordering preliminary injunction.
The injunction...
  Number of edits: 2
  Checklist items in this paragraph:
    - Dates_of_All_Decrees

Paragraph 2: The final judgment was entered on May 1, 2023.
Both parties were ordered to pay equal shares of cour...
  Number of edits: 2
  Checklist items in this paragraph:
    - Final_Judgment_Date


Detailed view of first paragraph instance:
{
  "id": "example_001_p0",
  "metadata": {
    "case_id": "example_001",
    "paragraph_num": 0,
    "total_paragraphs": 3,
    "model": "example_model",
    "paragraph_text_preview": "This case involves a dispute over property rights between John Smith and Jane Doe.\nThe parties enter..."
  },
  "source": "",
  "t

## Process Actual Data Files

In [8]:
# Process actual data files into paragraph-level format
# Update these paths to your actual data locations
# All paths below are relative, so they resolve against the working directory
# set by os.chdir in the first cell.

# Used both to locate the checklist file and as the "model" metadata field.
model_name = "gpt-5-2025-08-07"

# Base name shared by the checklist file and the summary file.
file_name = "2024_example_cases_20"

# For reference summaries
checklist_path = f"../../summary_checklist_evidence/legal/multi_lexsum/{model_name}/{file_name}_thinking_medium.json"
summary_path = f"../../data/legal/multi_lexsum/{file_name}.json"
output_folder = "public/data/legal_extract_checklist_paragraph/legal_cases_reference"

# Process the files
# Guarded so the cell prints a hint instead of raising when the inputs are missing.
if os.path.exists(checklist_path) and os.path.exists(summary_path):
    processed_count = process_paragraph_batch_separate_files(
        checklist_data_path=checklist_path,
        summary_data_path=summary_path,
        output_folder=output_folder,
        model_name=model_name
    )
else:
    print("Data files not found. Please update the paths.")

Processed 46014: 3 paragraphs
Processed 45235: 3 paragraphs
Processed 45223: 10 paragraphs
Processed 45160: 6 paragraphs
Processed 45157: 8 paragraphs
Processed 45858: 7 paragraphs
Processed 45544: 8 paragraphs
Processed 46071: 10 paragraphs
Processed 45429: 4 paragraphs
Processed 45737: 9 paragraphs
Processed 43840: 37 paragraphs
Processed 44606: 7 paragraphs
Processed 17762: 8 paragraphs
Processed 46406: 5 paragraphs
Processed 46310: 11 paragraphs
Processed 17268: 11 paragraphs
Processed 46083: 14 paragraphs
Processed 45647: 6 paragraphs
Processed 43966: 16 paragraphs
Processed 17701: 12 paragraphs

Total: Processed 20 cases into 195 paragraph instances
Files saved to public/data/legal_extract_checklist_paragraph/legal_cases_reference/


## Analysis and Statistics

In [9]:
def analyze_paragraph_distribution(output_folder: str):
    """
    Print summary statistics for the per-case JSON files in a folder.

    Every file ending in '.json' is read as a list of paragraph instances.
    Reported figures: case and paragraph counts, average paragraphs per case,
    total and average edits per paragraph, number of paragraphs without edits,
    and number of edits flagged as spanning several paragraphs. Nothing is
    returned.
    """
    if not os.path.exists(output_folder):
        print(f"Output folder {output_folder} does not exist.")
        return

    paragraphs_per_case = []
    edits_per_paragraph = []
    multi_span_evidence = 0

    case_files = [name for name in os.listdir(output_folder) if name.endswith('.json')]
    for name in case_files:
        with open(os.path.join(output_folder, name), 'r') as handle:
            instances = json.load(handle)

        paragraphs_per_case.append(len(instances))

        for instance in instances:
            instance_edits = instance.get('edits', [])
            edits_per_paragraph.append(len(instance_edits))
            # An edit counts as multi-span when its annotation carries a
            # non-empty 'spans_paragraphs' value.
            multi_span_evidence += sum(
                1
                for edit in instance_edits
                if edit.get('annotation', {}).get('spans_paragraphs')
            )

    total_cases = len(paragraphs_per_case)
    total_paragraphs = sum(paragraphs_per_case)
    total_edits = sum(edits_per_paragraph)
    empty_paragraphs = edits_per_paragraph.count(0)

    # Case-level figures
    if total_cases > 0:
        print(f"Total cases processed: {total_cases}")
        print(f"Total paragraph instances: {total_paragraphs}")
        print(f"Average paragraphs per case: {total_paragraphs / total_cases:.2f}")

    # Paragraph-level figures
    if edits_per_paragraph:
        print(f"Total edits: {total_edits}")
        print(f"Average edits per paragraph: {total_edits / len(edits_per_paragraph):.2f}")
        print(f"Empty paragraphs (no edits): {empty_paragraphs}")
        print(f"Evidence spanning multiple paragraphs: {multi_span_evidence}")

# Run analysis on processed data
# NOTE(review): the statistics cover every .json file currently in this folder.
# The processing cell above reported 20 cases, while the recorded output here
# counts 47 — presumably files from earlier runs are still present; verify, or
# clear the folder before processing if only the latest run should be analysed.
analyze_paragraph_distribution("public/data/legal_extract_checklist_paragraph/legal_cases_reference")

Total cases processed: 47
Total paragraph instances: 420
Average paragraphs per case: 8.94
Total edits: 2852
Average edits per paragraph: 6.79
Empty paragraphs (no edits): 21
Evidence spanning multiple paragraphs: 392
