# 03b — Chunkr Semantic Enhancement

**Pipeline Position:** 01_blocks (OCR) → 02_cleanup (Domain) → 03_llm_cleanup (LLM) → **03b_chunkr_enhance** → 04_json_extraction

This stage takes cleaned OCR blocks from the previous pipeline stages and enhances them with:
- **Advanced layout analysis** and structure detection
- **Semantic chunking** optimized for RAG/LLM applications  
- **Cross-reference resolution** and text flow reconstruction
- **Enhanced metadata** extraction

## How it works:
1. Takes cleaned blocks from `03_llm_cleanup`
2. Reconstructs page content from OCR blocks
3. Sends consolidated content to local Chunkr instance
4. Receives back semantically chunked and structured content
5. Builds enhanced blocks from the Chunkr segments; if Chunkr processing fails, the original OCR blocks are kept and tagged with default semantic metadata
6. Outputs enhanced blocks ready for JSON extraction

## Prerequisites:
- Local Chunkr instance running (use `./setup_local_chunkr.sh`)
- Completed previous pipeline stages (01, 02, 03)

In [None]:
# Configuration
# --- Pipeline locations ---
blocks_dir = "outputs/run_001/03_llmcleaned"  # Input from LLM cleanup stage
output_dir = "outputs/run_001/03b_chunkr_enhanced"  # Output for next stage
chunkr_base_url = "http://localhost:8000"  # Local Chunkr instance

# --- Chunkr processing parameters ---
chunk_target_length = 512  # forwarded to Chunkr as chunk_processing.target_length
chunk_overlap = 50  # forwarded to Chunkr as chunk_processing.overlap
ocr_strategy = "Auto"  # Since we already have OCR, this is mainly for layout analysis

# --- Processing limits ---
max_retries = 3  # NOTE(review): not referenced by any code visible in this notebook
timeout_seconds = 300  # timeout (seconds) handed to the Chunkr upload request
chunkr_wait_seconds = 2  # pause (seconds) between task status polls

# Echo the effective settings so they are recorded in the cell output.
print(
    f"📁 Input: {blocks_dir}",
    f"📁 Output: {output_dir}",
    f"🔗 Chunkr: {chunkr_base_url}",
    sep="\n",
)

In [None]:
# Imports
import os
import json
import time
import requests
import tempfile
import re
from pathlib import Path
from typing import Dict, Any, List, Optional
import warnings
warnings.filterwarnings('ignore')

# Setup directories
# Turn both configured locations into absolute paths (with "~" expanded).
blocks_dir, output_dir = (
    Path(location).expanduser().resolve()
    for location in (blocks_dir, output_dir)
)
# The output location is created up front; an existing directory is fine.
output_dir.mkdir(parents=True, exist_ok=True)

print(f"✅ Input directory: {blocks_dir}")
print(f"✅ Output directory: {output_dir}")

In [None]:
# Health Check - Verify Chunkr is available
def check_chunkr_health(base_url: Optional[str] = None, timeout: float = 10) -> bool:
    """Check if the Chunkr service is available.

    Sends a GET request to ``<base_url>/health`` and treats HTTP 200 as healthy.

    Args:
        base_url: Service root to probe. When omitted, the notebook-level
            ``chunkr_base_url`` is used (looked up at call time).
        timeout: Seconds to wait for the health endpoint.

    Returns:
        True if the endpoint answered with status 200; False for any other
        status or when the request itself failed (the failure is printed).
    """
    target = chunkr_base_url if base_url is None else base_url
    try:
        response = requests.get(f"{target}/health", timeout=timeout)
    except (requests.RequestException, OSError, ValueError) as e:
        # Only transport/URL problems mean "service unavailable"; programming
        # errors (e.g. a NameError) are deliberately allowed to surface.
        print(f"❌ Chunkr health check failed: {e}")
        return False
    return response.status_code == 200

# Test Chunkr availability
if check_chunkr_health():
    print(f"✅ Chunkr service is healthy at {chunkr_base_url}")

    # Get additional info. This is optional: a failure here must not stop the
    # notebook, but it is reported instead of being silently discarded, and
    # interrupts (Ctrl-C / kernel interrupt) are no longer swallowed.
    try:
        response = requests.get(f"{chunkr_base_url}/llm/models", timeout=5)
        if response.status_code == 200:
            models = response.json()
            print(f"🤖 Available models: {models.get('models', [])}")
    except (requests.RequestException, ValueError, AttributeError) as e:
        # ValueError: body was not valid JSON; AttributeError: JSON was not an object.
        print(f"ℹ️ Could not list Chunkr models: {e}")
else:
    print(f"❌ Chunkr service not available at {chunkr_base_url}")
    print("💡 Please run: ./setup_local_chunkr.sh")
    print("💡 Or check if Docker services are running")

In [None]:
# Discover input files from previous pipeline stage
def discover_input_files(blocks_dir: Path) -> List[Path]:
    """Discover input JSON files from the previous pipeline stage.

    Patterns are tried from the most processed stage output to the least
    processed one. The first pattern that matches anything wins, so files from
    different stages are never mixed.

    Args:
        blocks_dir: Directory to search (not recursive).

    Returns:
        The matching files ordered by page number, or an empty list when no
        pattern matches (the JSON files that are present are listed as a hint).
    """
    # Look for files in order of preference
    patterns = [
        "page_*_blocks.llmcleaned.json",  # From 03_llm_cleanup
        "page_*_blocks.cleaned.json",     # From 02_cleanup
        "page_*_blocks.domain.json",      # From 02_cleanup (alt)
        "page_*_blocks.json"              # From 01_blocks (fallback)
    ]

    def page_order(path: Path):
        # Order numerically so page_2 precedes page_10 even without zero
        # padding; names without a page number go last, ordered by name.
        found = re.search(r'page_(\d+)', path.name)
        if found:
            return (0, int(found.group(1)), path.name)
        return (1, 0, path.name)

    for pattern in patterns:
        files = sorted(blocks_dir.glob(pattern), key=page_order)
        if files:
            print(f"📄 Found {len(files)} files with pattern: {pattern}")
            return files

    print(f"❌ No compatible block files found in {blocks_dir}")
    print("💡 Available files:")
    for f in sorted(blocks_dir.glob("*.json")):
        print(f"   {f.name}")
    return []

# Resolve the work list once; later cells read `input_files`.
input_files = discover_input_files(blocks_dir)
print(f"\n📊 Processing {len(input_files)} files")

# Show first few files (numbered from 1), then how many were left out.
for position, file_path in enumerate(input_files[:5], start=1):
    print(f"   {position}. {file_path.name}")
hidden_count = len(input_files) - 5
if hidden_count > 0:
    print(f"   ... and {hidden_count} more")

In [None]:
# Helper functions for content processing
def load_page_blocks(file_path: Path) -> Dict[str, Any]:
    """Load the block data of one page from a JSON file.

    Args:
        file_path: Path of the page JSON file (read as UTF-8).

    Returns:
        The decoded JSON object, or an empty dict when the file cannot be
        read, is not valid JSON/UTF-8, or does not contain a JSON object at
        the top level. Every failure is reported with an ``[ERROR]`` line.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"[ERROR] Failed to load {file_path}: {e}")
        return {}

    if not isinstance(data, dict):
        # Callers use dict methods (.get) on the result, so a list or scalar
        # payload is treated as a load failure instead of crashing later.
        print(
            f"[ERROR] Failed to load {file_path}: "
            f"expected a JSON object, got {type(data).__name__}"
        )
        return {}
    return data

def reconstruct_page_content(blocks: List[Dict[str, Any]],
                             line_tolerance: float = 10) -> str:
    """Reconstruct page content from blocks for Chunkr processing.

    Blocks are grouped into visual lines from top to bottom, and the blocks of
    each line are joined left to right with single spaces. Lines are joined
    with newlines.

    Args:
        blocks: Block dicts. The text is taken from the first non-empty of
            ``text_final``, ``text_llm`` and ``text``; blocks without text are
            skipped. ``bbox[0]`` / ``bbox[1]`` are used as the x / y position
            (assumed to be the block's left/top edge — TODO confirm against
            the OCR stage); a missing, null or too-short bbox counts as 0.
        line_tolerance: Maximum vertical distance from the first block of a
            line for another block to be placed on that same line.

    Returns:
        The reconstructed text, or an empty string if no block has text.
    """
    # Collect (y, x, text) for every block that carries text.
    positioned = []
    for block in blocks:
        text = (
            block.get('text_final') or
            block.get('text_llm') or
            block.get('text') or
            ''
        ).strip()
        if not text:
            continue

        bbox = block.get('bbox') or ()
        x = bbox[0] if len(bbox) > 0 else 0
        y = bbox[1] if len(bbox) > 1 else 0
        positioned.append((y, x, text))

    # Top to bottom first; the sort is stable, so equal positions keep input order.
    positioned.sort(key=lambda item: (item[0], item[1]))

    # Group into lines, anchored on the y of the first block of each line.
    grouped_lines = []
    current_line = []
    current_line_y = None
    for y, x, text in positioned:
        if current_line_y is None or abs(y - current_line_y) > line_tolerance:
            if current_line:
                grouped_lines.append(current_line)
            current_line = []
            current_line_y = y
        current_line.append((x, text))
    if current_line:
        grouped_lines.append(current_line)

    # Within a line, order by x: blocks whose tops differ by a few pixels
    # would otherwise be emitted in y-order and read right-to-left.
    return '\n'.join(
        ' '.join(text for _, text in sorted(line, key=lambda part: part[0]))
        for line in grouped_lines
    )

# Progress marker: shows in the cell output that the helper definitions above were executed.
print("✅ Helper functions defined")

In [None]:
# Test content reconstruction with first file
# NOTE: `reconstructed` defined here is reused by the Chunkr test cell further down.
if input_files:
    test_file = input_files[0]
    print(f"🧪 Testing content reconstruction with: {test_file.name}")

    page_data = load_page_blocks(test_file)
    # Tolerate a non-object payload or a null "blocks" entry instead of
    # crashing the preview.
    original_blocks = (page_data.get('blocks') or []) if isinstance(page_data, dict) else []

    print(f"📄 Original blocks: {len(original_blocks)}")

    if original_blocks:
        # Show first few blocks
        print("\n📝 Sample blocks:")
        for i, block in enumerate(original_blocks[:3]):
            # Same text priority as reconstruct_page_content; null values count as empty.
            text = (
                block.get('text_final') or
                block.get('text_llm') or
                block.get('text') or
                ''
            ).strip()
            # A missing or null confidence is shown as 0.00 rather than
            # breaking the float format below.
            confidence = block.get('confidence') or 0
            print(f"   {i+1}. '{text[:60]}{'...' if len(text) > 60 else ''}' (conf: {confidence:.2f})")

        # Reconstruct content
        reconstructed = reconstruct_page_content(original_blocks)
        print(f"\n📝 Reconstructed content ({len(reconstructed)} chars):")
        print(f"   {reconstructed[:200]}{'...' if len(reconstructed) > 200 else ''}")
    else:
        print("❌ No blocks found in test file")
else:
    print("❌ No input files to test")

In [None]:
# Chunkr processing functions
def create_temp_document(content: str, page_num: int) -> str:
    """Create a temporary UTF-8 text document for Chunkr processing.

    The file is created with ``delete=False``, so it outlives this function;
    the caller is responsible for removing it.

    Args:
        content: Text to write into the file.
        page_num: Page number, embedded in the file name suffix
            (``..._page_<page_num>.txt``).

    Returns:
        The path of the closed temporary file.

    Raises:
        OSError: If the file cannot be created or written.
        UnicodeEncodeError: If ``content`` cannot be encoded as UTF-8.
    """
    temp_file = tempfile.NamedTemporaryFile(
        mode='w',
        suffix=f'_page_{page_num}.txt',
        delete=False,
        encoding='utf-8'
    )
    try:
        # The context manager guarantees the handle is closed even if the
        # write fails.
        with temp_file:
            temp_file.write(content)
    except Exception:
        # The caller never learns the path when we raise, so remove the
        # half-written file here instead of leaking it.
        try:
            os.unlink(temp_file.name)
        except OSError:
            pass
        raise

    return temp_file.name

def _upload_to_chunkr(temp_file_path: str, page_num: int) -> Optional[str]:
    """Upload one page document to Chunkr and return the task id (None on failure)."""
    # The file only needs to stay open for the upload request itself.
    with open(temp_file_path, 'rb') as f:
        files = {'file': (f'page_{page_num}.txt', f, 'text/plain')}

        data = {
            'ocr_strategy': ocr_strategy,
            'config': json.dumps({
                "chunk_processing": {
                    "target_length": chunk_target_length,
                    "overlap": chunk_overlap
                },
                "segmentation_strategy": "LayoutAnalysis"
            })
        }

        # Upload to Chunkr
        print(f"   🔄 Uploading to Chunkr...")
        response = requests.post(
            f"{chunkr_base_url}/api/v1/task",
            files=files,
            data=data,
            timeout=timeout_seconds
        )

    if response.status_code != 200:
        print(f"   ❌ Upload failed: {response.status_code} - {response.text[:200]}")
        return None

    task_id = response.json().get('task_id')
    if not task_id:
        print(f"   ❌ No task_id received")
        return None
    return task_id


def _wait_for_chunkr_task(task_id: str) -> Optional[Dict[str, Any]]:
    """Poll a Chunkr task until it finishes; return its final payload or None."""
    # The polling budget follows the configured timeout instead of a fixed
    # attempt count, so raising `timeout_seconds` really allows longer runs.
    deadline = time.monotonic() + timeout_seconds
    attempt = 0

    while True:
        task_response = requests.get(
            f"{chunkr_base_url}/api/v1/task/{task_id}",
            timeout=30
        )

        if task_response.status_code != 200:
            print(f"   ❌ Failed to check status: {task_response.status_code}")
            return None

        task_status = task_response.json()
        status = str(task_status.get('status') or '').lower()

        if status == 'succeeded':
            print(f"   ✅ Processing completed")
            return task_status
        if status in ['failed', 'cancelled']:
            error_msg = task_status.get('message', 'Unknown error')
            print(f"   ❌ Task failed: {error_msg}")
            return None

        if attempt % 10 == 0:  # Print status every 10 attempts
            print(f"   ⏳ Status: {status}")
        if time.monotonic() >= deadline:
            break
        time.sleep(chunkr_wait_seconds)
        attempt += 1

    print(f"   ⏰ Processing timed out")
    return None


def process_with_chunkr(content: str, page_num: int) -> Optional[Dict[str, Any]]:
    """Send content to Chunkr for semantic processing.

    The content is written to a temporary text file, uploaded to
    ``<chunkr_base_url>/api/v1/task`` and the resulting task is polled every
    ``chunkr_wait_seconds`` seconds for up to ``timeout_seconds`` seconds.
    The temporary file is always removed afterwards.

    Args:
        content: Page text to process; blank content is rejected up front.
        page_num: Page number, used for the uploaded file name and messages.

    Returns:
        The JSON payload of the status response whose status is
        ``succeeded``, or None when the content is blank, the upload or a
        status check fails, the task fails/is cancelled, polling times out,
        or any exception occurs (all cases print a message).
    """
    if not content.strip():
        print(f"[WARN] Empty content for page {page_num}")
        return None

    temp_file_path = None
    try:
        # Create temporary document
        temp_file_path = create_temp_document(content, page_num)

        task_id = _upload_to_chunkr(temp_file_path, page_num)
        if task_id is None:
            return None

        print(f"   📋 Task ID: {task_id}")
        return _wait_for_chunkr_task(task_id)

    except Exception as e:
        # Per-page boundary: one bad page must not abort the whole batch.
        print(f"   ❌ Exception: {e}")
        return None
    finally:
        # Clean up temporary file (best effort; it may already be gone).
        if temp_file_path:
            try:
                os.unlink(temp_file_path)
            except OSError:
                pass

# Progress marker: shows in the cell output that the Chunkr functions above were defined.
print("✅ Chunkr processing functions defined")

In [None]:
# Test Chunkr processing with sample content
# Runs a single round-trip through Chunkr using the text reconstructed by the
# earlier test cell, and prints a short summary of what came back.
if not (input_files and check_chunkr_health()):
    print("⏭️ Skipping Chunkr test (no files or service unavailable)")
else:
    print("🧪 Testing Chunkr processing...")

    # `reconstructed` only exists if the earlier test cell found blocks.
    if not ('reconstructed' in locals() and reconstructed):
        print("❌ No content available for testing")
    else:
        print(f"📤 Testing with {len(reconstructed)} characters of content")

        # Page number 999 marks this upload as the test run.
        test_result = process_with_chunkr(reconstructed, 999)

        if not test_result:
            print("❌ Chunkr test failed")
        else:
            chunkr_output = test_result.get('output', {})
            sample_segments = chunkr_output.get('segments', [])
            file_type = chunkr_output.get('file_type', 'unknown')
            processing_time = chunkr_output.get('processing_time_s', 'unknown')

            print(f"\n✅ Chunkr test successful!")
            print(f"   📊 Generated {len(sample_segments)} semantic segments")
            print(f"   📄 File type: {file_type}")
            print(f"   ⏱️ Processing time: {processing_time}s")

            # Preview at most three segments, each truncated to 80 characters.
            print(f"\n📝 Sample segments:")
            for number, segment in enumerate(sample_segments[:3], start=1):
                content = segment.get('content', '').strip()
                seg_type = segment.get('segment_type', 'text')
                tail = '...' if len(content) > 80 else ''
                print(f"   {number}. [{seg_type}] {content[:80]}{tail}")

In [None]:
# Result merging function
def merge_chunkr_results(original_blocks: List[Dict[str, Any]],
                        chunkr_result: Optional[Dict[str, Any]],
                        page_num: int) -> List[Dict[str, Any]]:
    """Build the enhanced block list for one page.

    When Chunkr produced segments, one new block is created per segment; the
    original OCR blocks are not carried over in that case. When there is no
    Chunkr result, or it contains no segments, copies of the original blocks
    are returned with default semantic metadata so that the page text is
    never lost. The input blocks are not modified.

    Args:
        original_blocks: Blocks loaded from the previous stage.
        chunkr_result: Task payload returned by Chunkr, or None if
            processing failed.
        page_num: Page number, used to build ``semantic_chunk_id``.

    Returns:
        A new list of block dicts tagged with
        ``processing_stage == '03b_chunkr_enhanced'``.
    """
    # NOTE(review): segments are read from output['segments']; confirm that
    # this matches the response shape of the deployed Chunkr version.
    output = (chunkr_result or {}).get('output') or {}
    segments = output.get('segments') or []

    if not segments:
        # Chunkr failed or returned nothing usable: keep the original blocks.
        # Copies are returned so the caller's list is left untouched.
        return [
            {
                **block,
                'chunkr_processed': False,
                'semantic_chunk_id': None,
                'semantic_type': 'text',
                'processing_stage': '03b_chunkr_enhanced',
            }
            for block in original_blocks
        ]

    # Task-level metadata is identical for every block of the page.
    task_metadata = {
        'task_id': chunkr_result.get('task_id'),
        'file_type': output.get('file_type', 'text'),
        'total_segments': len(segments),
        'processing_time_s': output.get('processing_time_s'),
        'page_count': output.get('page_count', 1)
    }

    enhanced_blocks = []
    for i, segment in enumerate(segments):
        content = (segment.get('content') or '').strip()
        # Fall back to the segment's 'text' field when 'content' is empty.
        text = content or (segment.get('text') or '').strip()

        enhanced_blocks.append({
            # Chunkr semantic content
            'text': text,
            'text_chunkr': content,
            'text_final': text,  # Use Chunkr as final (same fallback as 'text')

            # Semantic metadata
            'semantic_chunk_id': f"page_{page_num}_chunk_{i}",
            'semantic_type': segment.get('segment_type', 'text'),
            'chunkr_processed': True,

            # Structure info from the segment; placeholders when Chunkr gives none
            'bbox': segment.get('bbox', [0, 0, 1, 1]),
            'confidence': segment.get('confidence', 0.95),

            # Processing metadata
            'chunkr_segment_id': segment.get('segment_id', f'seg_{i}'),
            'processing_stage': '03b_chunkr_enhanced',
            'enhanced_timestamp': time.time(),

            # Chunkr task metadata (one independent copy per block)
            'chunkr_metadata': dict(task_metadata),
        })

    return enhanced_blocks

# Progress marker: shows in the cell output that the merging function above was defined.
print("✅ Result merging function defined")

In [None]:
# Main processing loop
# For every input page: load its blocks, rebuild the page text, run it through
# Chunkr, build the enhanced blocks and write one "<input stem>.chunkr.json"
# file into `output_dir`.
if not input_files:
    print("❌ No input files found. Please check the input directory.")
elif not check_chunkr_health():
    print("❌ Chunkr service not available. Please run: ./setup_local_chunkr.sh")
else:
    print(f"🚀 Starting batch processing of {len(input_files)} files...")

    successful = 0
    failed = 0
    # Pages that were saved although Chunkr returned no result; they are part
    # of `successful` but are reported separately in the summary.
    chunkr_fallbacks = 0

    for i, file_path in enumerate(input_files):
        print(f"\n📄 [{i+1}/{len(input_files)}] Processing: {file_path.name}")

        try:
            # Load original blocks
            page_data = load_page_blocks(file_path)
            if not page_data:
                print(f"   ❌ Failed to load page data")
                failed += 1
                continue

            original_blocks = page_data.get('blocks', [])
            if not original_blocks:
                print(f"   ⚠️ No blocks found")
                failed += 1
                continue

            print(f"   📊 Loaded {len(original_blocks)} original blocks")

            # Extract page number from the file name (0 when the name has none)
            page_match = re.search(r'page_(\d+)', file_path.name)
            page_num = int(page_match.group(1)) if page_match else 0

            # Reconstruct page content
            page_content = reconstruct_page_content(original_blocks)
            if not page_content.strip():
                print(f"   ⚠️ No content reconstructed")
                failed += 1
                continue

            print(f"   📝 Reconstructed {len(page_content)} characters")

            # Process with Chunkr
            chunkr_result = process_with_chunkr(page_content, page_num)
            if chunkr_result is None:
                chunkr_fallbacks += 1
                print(f"   ⚠️ No Chunkr result; keeping original blocks")

            # Merge results
            enhanced_blocks = merge_chunkr_results(original_blocks, chunkr_result, page_num)

            print(f"   📦 Generated {len(enhanced_blocks)} enhanced blocks")

            # Create output data structure
            output_data = {
                'page_num': page_num,
                'original_file': str(file_path),
                'processing_stage': '03b_chunkr_enhanced',
                'timestamp': time.time(),
                'chunkr_success': chunkr_result is not None,
                'original_blocks_count': len(original_blocks),
                'enhanced_blocks_count': len(enhanced_blocks),
                'blocks': enhanced_blocks,
                'metadata': {
                    'chunkr_base_url': chunkr_base_url,
                    'chunk_target_length': chunk_target_length,
                    'chunk_overlap': chunk_overlap,
                    'ocr_strategy': ocr_strategy
                }
            }

            # Preserve original metadata
            for key in ['page_info', 'image_path', 'source_pdf']:
                if key in page_data:
                    output_data[key] = page_data[key]

            # Save enhanced blocks. Only the last extension is replaced, e.g.
            # page_1_blocks.llmcleaned.json -> page_1_blocks.llmcleaned.chunkr.json
            output_file = output_dir / f"{file_path.stem}.chunkr.json"
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(output_data, f, indent=2, ensure_ascii=False)

            print(f"   ✅ Saved: {output_file.name}")
            successful += 1

        except Exception as e:
            # Per-file boundary: report the problem and continue with the next page.
            print(f"   ❌ Exception: {e}")
            failed += 1

    # Summary
    print("\n" + "="*50)
    print(f"📊 Processing Summary:")
    print(f"   ✅ Successful: {successful}")
    print(f"   ❌ Failed: {failed}")
    print(f"   📄 Total: {successful + failed}")
    if chunkr_fallbacks:
        print(f"   ⚠️ Saved without Chunkr enhancement: {chunkr_fallbacks}")

    if successful > 0:
        print(f"\n📁 Enhanced blocks saved to: {output_dir}")
        print(f"🔄 Next step: Update 04_json_extraction.ipynb to use enhanced blocks")
        print(f"   Change input directory to: {output_dir}")

In [None]:
# Analysis of results
# Reads back up to five of the files written by this stage and prints block
# counts, the Chunkr success flag and one sample block for each of them.
if output_dir.exists():
    # Sorted so that "the first files" is the same selection on every run.
    enhanced_files = sorted(output_dir.glob("*.chunkr.json"))

    if enhanced_files:
        print(f"📈 Analysis of {len(enhanced_files)} enhanced files:\n")

        total_original = 0
        total_enhanced = 0
        successful_chunkr = 0

        sample_files = enhanced_files[:5]  # Analyze first 5 files
        for file_path in sample_files:
            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                original_count = data.get('original_blocks_count', 0)
                enhanced_count = data.get('enhanced_blocks_count', 0)
                chunkr_success = data.get('chunkr_success', False)

                total_original += original_count
                total_enhanced += enhanced_count
                if chunkr_success:
                    successful_chunkr += 1

                print(f"📄 {file_path.name}:")
                print(f"   Original blocks: {original_count}")
                print(f"   Enhanced blocks: {enhanced_count}")
                print(f"   Chunkr success: {'✅' if chunkr_success else '❌'}")

                # Show sample enhanced block
                blocks = data.get('blocks', [])
                if blocks:
                    sample_block = blocks[0]
                    text = sample_block.get('text') or ''
                    semantic_type = sample_block.get('semantic_type', 'text')
                    # Only mark the sample as truncated when it really was.
                    print(f"   Sample: [{semantic_type}] {text[:60]}{'...' if len(text) > 60 else ''}")
                print()

            except Exception as e:
                # Per-file boundary: one unreadable file must not stop the analysis.
                print(f"❌ Error analyzing {file_path.name}: {e}")

        # Report against the number of files actually looked at, which is
        # smaller than five when fewer files exist.
        print(f"📊 Summary (first {len(sample_files)} files):")
        print(f"   Total original blocks: {total_original}")
        print(f"   Total enhanced blocks: {total_enhanced}")
        print(f"   Successful Chunkr processing: {successful_chunkr}/{len(sample_files)}")

        if total_original > 0:
            ratio = total_enhanced / total_original
            print(f"   Enhancement ratio: {ratio:.2f}x")
    else:
        print("📁 No enhanced files found in output directory")
else:
    print("📁 Output directory not found")

## Next Steps

1. **Update 04_json_extraction.ipynb**: Change the input directory to use enhanced blocks:
   ```python
   blocks_dir = "outputs/run_001/03b_chunkr_enhanced"
   ```

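2. **Update file patterns**: Look for `.chunkr.json` files. This stage appends `.chunkr.json` to the input file's stem, so an input named `page_001_blocks.llmcleaned.json` is saved as `page_001_blocks.llmcleaned.chunkr.json`; the pattern therefore needs a wildcard after `_blocks`:
   ```python
   patterns = ["page_*_blocks*.chunkr.json", "page_*_blocks.llmcleaned.json", ...]
   ```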

3. **Leverage semantic metadata**: Use the enhanced semantic types and chunk IDs in your JSON extraction

4. **Monitor performance**: Compare extraction quality with and without Chunkr enhancement

## Benefits of Chunkr Enhancement

- **Better chunking**: Content is semantically segmented rather than arbitrarily split
- **Structure awareness**: Identifies tables, headings, paragraphs automatically
- **Cross-reference resolution**: Maintains document flow and relationships
- **Metadata enrichment**: Additional semantic information for downstream processing

## Troubleshooting

- **Service not available**: Run `./setup_local_chunkr.sh`
- **Processing timeouts**: Increase `timeout_seconds` or reduce `chunk_target_length`
- **Memory issues**: Process files in smaller batches
- **Quality issues**: Adjust Chunkr configuration parameters