# OCR Extraction with PaddleOCR
Extract text from document regions using PaddleOCR.

**Workflow:**
1. Load layout detection results from ALL layout methods (doclayout_yolo, doctr, surya)
2. Run OCR on each detected region
3. Export results to JSON (plus a plain-text copy), tagged with the layout and OCR library names

**Features:**
- High-accuracy PP-OCRv4 models
- Multilingual support
- Optimized for documents

**IMPORTANT:** 
- PaddleOCR 3.x has reinitialization issues. This notebook uses PaddleOCR 2.7.x for stability.
- If you get a "PDX has already been initialized" error, restart the kernel and run the cells from the beginning.
- Do NOT re-run cells after the initial run without restarting the kernel first.

In [None]:
# Install PaddleOCR 2.7.x and configure environment
import subprocess
import sys
import os

# CRITICAL: these switches turn OneDNN (MKL-DNN) off and have to be in the
# environment BEFORE the first paddle import; the aim is to avoid OneDNN
# errors on Windows.
os.environ.update({
    'FLAGS_use_mkldnn': '0',
    'FLAGS_use_onednn': '0',
    'MKLDNN_CACHE_CAPACITY': '0',
})

def install(package):
    """Install *package* quietly with pip, using the interpreter that runs this notebook.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package, "-q"]
    subprocess.check_call(pip_command)

def install_specific(package, version):
    """Install exactly ``package==version`` quietly with pip.

    Uses the interpreter that runs this notebook.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    requirement = f"{package}=={version}"
    pip_command = [sys.executable, "-m", "pip", "install", requirement, "-q"]
    subprocess.check_call(pip_command)

print("Checking Python version...")
print(f"Python {sys.version}")

# A PaddleOCR 3.x install is swapped for 2.7.3 before anything else happens.
# A missing paddleocr is fine at this point - it gets installed further down.
try:
    import paddleocr
except ImportError:
    pass
else:
    version = getattr(paddleocr, '__version__', '0.0.0')
    if version.startswith('3.'):
        print(f"PaddleOCR {version} detected (has reinitialization bug)")
        print("Downgrading to stable version 2.7.3...")
        # Best effort: the uninstall output is captured and its exit status is not checked
        subprocess.run(
            [sys.executable, "-m", "pip", "uninstall", "paddleocr", "paddlex", "-y"],
            capture_output=True,
        )
        install_specific("paddleocr", "2.7.3")
        print("Please RESTART THE KERNEL and run this cell again!")
        # The already-imported 3.x module stays loaded in this kernel, so stop here
        raise SystemExit("Restart kernel required")

# Make sure paddlepaddle is importable, installing it on first use
try:
    import paddle
except ImportError:
    print("Installing paddlepaddle...")
    install("paddlepaddle")
    import paddle

# Reported on both paths: already installed, or freshly installed above
print(f"PaddlePaddle version: {paddle.__version__}")

# Make sure paddleocr is importable, installing the pinned 2.7.3 release if it is missing
try:
    from paddleocr import PaddleOCR
    import paddleocr
except ImportError:
    print("Installing paddleocr 2.7.3...")
    install_specific("paddleocr", "2.7.3")
    from paddleocr import PaddleOCR
else:
    # The version is only reported when paddleocr was already present
    version = getattr(paddleocr, '__version__', 'installed')
    print(f"PaddleOCR version: {version}")

print("\nDependencies ready!")

In [None]:
import os
import json
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Suppress PaddleOCR logging
import logging
logging.getLogger('ppocr').setLevel(logging.ERROR)

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from IPython.display import display, Image as IPImage

print("Imports ready!")

In [None]:
# Configuration

# OCR Library name (it becomes part of every output file name)
OCR_LIBRARY = "paddleocr"

# OCR settings
OCR_LANG = "en"  # Language: 'en', 'ch', 'fr', 'german', 'korean', 'japan', etc.

# Folders: source images, layout detection JSON files, and where OCR output goes
INPUT_FOLDER = Path("filled_documents")
LAYOUT_RESULTS_FOLDER = Path("layout_results")
OUTPUT_FOLDER = Path("ocr_results")

# Create the output folder up front; an existing folder is left untouched
OUTPUT_FOLDER.mkdir(exist_ok=True, parents=True)

In [None]:
# Initialize PaddleOCR
import ssl

# NOTE(review): this swaps in the unverified context factory, which switches off
# HTTPS certificate verification for every client in this kernel that relies on
# the ssl module's default HTTPS context (e.g. urllib).
ssl._create_default_https_context = ssl._create_unverified_context

# Keep OneDNN (MKL-DNN) disabled to avoid errors on Windows
os.environ.update({'FLAGS_use_mkldnn': '0', 'FLAGS_use_onednn': '0'})

# Environment variables for PaddleOCR; a dataset endpoint that is already set is kept
os.environ.setdefault('HUB_DATASET_ENDPOINT', 'https://modelscope.cn/api/v1/datasets')
os.environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'

try:
    from paddleocr import PaddleOCR

    print("Initializing PaddleOCR...")

    # CPU-only engine for the configured language, with quiet logging
    ocr = PaddleOCR(
        lang=OCR_LANG,
        use_angle_cls=True,   # Detect text orientation
        use_gpu=False,
        enable_mkldnn=False,  # Disable MKL-DNN/OneDNN to avoid OneDNN errors
        show_log=False,
    )
except Exception as e:
    # Any failure (import or construction) is reported here instead of propagating
    PADDLEOCR_AVAILABLE = False
    print(f"PaddleOCR initialization failed: {e}")
    print("\nTroubleshooting:")
    print("  1. Restart the kernel and run cells from the beginning")
    print("  2. If still failing, try: pip install paddleocr==2.7.3")
else:
    PADDLEOCR_AVAILABLE = True
    print("PaddleOCR ready!")

In [None]:
def extract_layout_library(filename):
    """
    Extract the layout library name from a layout-results JSON filename.

    The library name is the dot-free text between the LAST ``_layout_`` marker
    and the ``.json`` extension. Anchoring on the last marker keeps a document
    name that itself contains ``_layout_`` from leaking into the result.

    Example: 'Demand Letter_batch_1_layout_doclayout_yolo.json' -> 'doclayout_yolo'

    Args:
        filename: File name (not a full path object) of the layout JSON file.

    Returns:
        The library name, or 'unknown' when the name carries no library suffix
        (old-format '*_layout.json' files and any other non-matching name).
    """
    # The greedy '.*' pushes the match to the final '_layout_' occurrence
    match = re.match(r'.*_layout_([^.]+)\.json$', filename)
    return match.group(1) if match else 'unknown'

# Gather the layout results produced by ALL layout detection methods
layout_files = sorted(LAYOUT_RESULTS_FOLDER.glob("*_layout_*.json"))

# Old-format files (no library suffix) go after the new-format ones,
# leaving out any file whose name was already collected above
old_format_files = sorted(
    candidate
    for candidate in LAYOUT_RESULTS_FOLDER.glob("*_layout.json")
    if all(candidate.name != collected.name for collected in layout_files)
)
layout_files.extend(old_format_files)

if layout_files:
    # The printed index is the position within layout_files
    print(f"Found {len(layout_files)} layout result files:")
    for i, f in enumerate(layout_files):
        lib = extract_layout_library(f.name)
        print(f"  {i}: {f.name} (layout: {lib})")
else:
    print(f"No layout JSON files found in '{LAYOUT_RESULTS_FOLDER}'.")
    print("Run the layout detection notebooks (04_*) first.")

In [None]:
def run_ocr_on_region(image, bbox):
    """Run PaddleOCR on one rectangular region of a page image.

    Args:
        image: Page image exposing a PIL-style ``crop((x1, y1, x2, y2))``.
        bbox: Mapping with the region corners under the keys ``x1``, ``y1``,
            ``x2`` and ``y2``.

    Returns:
        A dict with two entries:
        ``full_text`` - the recognized line texts joined by single spaces;
        ``lines`` - one dict per recognized line with ``text``, ``confidence``
        (rounded to 4 decimals) and ``bbox_in_region`` (axis-aligned integer
        box in the coordinates of the cropped region).
        A region without any area, or one with no recognized text, yields
        ``{"full_text": "", "lines": []}``.

    Raises:
        KeyError: if ``bbox`` lacks one of the four corner keys.
    """
    x1, y1, x2, y2 = bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']

    # Pillow rounds crop boxes to whole pixels, so judge the extent after
    # rounding. A zero-width/height or inverted box has nothing to read;
    # answer with an empty result instead of handing it to the OCR engine.
    if round(x2) - round(x1) <= 0 or round(y2) - round(y1) <= 0:
        return {"full_text": "", "lines": []}

    # Crop region
    # NOTE(review): callers in this notebook pass RGB images; confirm whether
    # PaddleOCR expects BGR channel order for ndarray input.
    region_array = np.array(image.crop((x1, y1, x2, y2)))

    # Run OCR
    result = ocr.ocr(region_array, cls=True)

    # PaddleOCR returns: [[[box, (text, confidence)], ...], ...]
    # An empty/None first entry is treated as "no text found".
    detections = result[0] if result and result[0] else []

    text_lines = []
    for line in detections:
        if line is None:
            continue

        box = line[0]        # [[x1,y1], [x2,y1], [x2,y2], [x1,y2]]
        text_info = line[1]  # (text, confidence)
        text, confidence = text_info[0], text_info[1]

        # Collapse the four-point polygon into its axis-aligned bounding box
        x_coords = [p[0] for p in box]
        y_coords = [p[1] for p in box]

        text_lines.append({
            "text": text,
            "confidence": round(float(confidence), 4),
            "bbox_in_region": {
                "x1": int(min(x_coords)),
                "y1": int(min(y_coords)),
                "x2": int(max(x_coords)),
                "y2": int(max(y_coords))
            }
        })

    return {
        "full_text": " ".join(entry["text"] for entry in text_lines),
        "lines": text_lines
    }

In [None]:
# ============================================================
# CONFIGURATION: Select layout file to process (for testing)
# ============================================================
SELECTED_LAYOUT_INDEX = 0  # Change this to select a different file (index from the listing above)

if layout_files:
    selected_layout = layout_files[SELECTED_LAYOUT_INDEX]
    layout_library = extract_layout_library(selected_layout.name)
    print(f"Selected: {selected_layout.name}")
    print(f"Layout library: {layout_library}")
    
    # Load layout results.
    # Read as UTF-8 explicitly; without it the platform default encoding is used.
    # NOTE(review): assumes the layout notebooks write UTF-8 (or plain ASCII) JSON - confirm.
    with open(selected_layout, 'r', encoding='utf-8') as f:
        layout_data = json.load(f)
    
    print(f"Document: {layout_data['document']}")
    print(f"Regions: {layout_data['num_regions']}")
    
    # Load corresponding image
    image_path = INPUT_FOLDER / layout_data['document']
    if image_path.exists():
        # convert() returns an independent in-memory copy, so the file handle
        # can be closed as soon as the conversion is done
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        print(f"Image loaded: {image.size}")
    else:
        # Drop an image left over from an earlier run of this cell; otherwise the
        # OCR cell would silently pair it with the newly loaded layout
        globals().pop('image', None)
        print(f"Image not found: {image_path}")
else:
    print("No layout files available")

In [None]:
# Run OCR on all detected regions (single file test)
if not globals().get('PADDLEOCR_AVAILABLE', False):
    # The initialization cell records success in PADDLEOCR_AVAILABLE; without
    # this check a failed or skipped initialization shows up as a NameError on `ocr`
    print("PaddleOCR is not available. Run the initialization cell first "
          "(restart the kernel if initialization failed).")
elif 'layout_data' in dir() and 'image' in dir():
    print(f"Running PaddleOCR on {len(layout_data['regions'])} regions...\n")
    
    # Document-level metadata is copied from the layout file; regions are filled below
    ocr_results = {
        "document": layout_data['document'],
        "layout_library": layout_library,
        "ocr_library": OCR_LIBRARY,
        "image_width": layout_data['image_width'],
        "image_height": layout_data['image_height'],
        "num_regions": len(layout_data['regions']),
        "regions": []
    }
    
    for region in layout_data['regions']:
        print(f"Processing region {region['id']}: {region['type']}...")
        
        # Run OCR on this region
        ocr_output = run_ocr_on_region(image, region['bbox'])
        
        # Combine layout info with OCR results
        region_result = {
            "id": region['id'],
            "type": region['type'],
            "layout_confidence": region['confidence'],
            "bbox": region['bbox'],
            "ocr": ocr_output
        }
        ocr_results['regions'].append(region_result)
        
        # Print preview (first 80 characters, with an ellipsis when truncated)
        full_text = ocr_output['full_text']
        text_preview = full_text[:80] + "..." if len(full_text) > 80 else full_text
        print(f"  Text: {text_preview}")
        print(f"  Lines: {len(ocr_output['lines'])}\n")
    
    print("OCR complete!")
else:
    print("Load layout data first (run the cells above)")

In [None]:
# Show the recognized text of every region from the single-file run
if 'ocr_results' in dir():
    banner = "=" * 60
    print(
        banner,
        f"OCR RESULTS SUMMARY (Layout: {layout_library}, OCR: {OCR_LIBRARY})",
        banner,
        sep="\n",
    )

    for region in ocr_results['regions']:
        # Region header, a divider, the text, then one blank line
        print(
            f"\n[Region {region['id']}] {region['type']}",
            "-" * 40,
            region['ocr']['full_text'],
            "",
            sep="\n",
        )

In [None]:
# Export results to JSON (single file)
if 'ocr_results' in dir():
    # The output name carries the document stem plus both the layout and OCR library
    doc_stem = Path(layout_data['document']).stem
    json_path = OUTPUT_FOLDER / f"{doc_stem}_ocr_{layout_library}_{OCR_LIBRARY}.json"

    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(ocr_results, f, indent=2, ensure_ascii=False)

    print(f"Saved OCR results to: {json_path}")

    # Plain-text companion: same name as the JSON file, with a .txt extension
    txt_path = json_path.with_suffix(".txt")
    with open(txt_path, 'w', encoding='utf-8') as f:
        # Header line and divider first, then one "[type]" section per region
        f.write(f"Layout: {layout_library}, OCR: {OCR_LIBRARY}\n" + "=" * 50 + "\n\n")
        for region in ocr_results['regions']:
            f.write(f"[{region['type']}]\n{region['ocr']['full_text']}\n\n")

    print(f"Saved text version to: {txt_path}")

In [None]:
# Batch process ALL layout files from ALL layout detection methods
if not layout_files:
    print("No layout files to process")
elif not globals().get('PADDLEOCR_AVAILABLE', False):
    # The initialization cell records success in PADDLEOCR_AVAILABLE; without
    # this check a failed or skipped initialization shows up as a NameError on `ocr`
    print("PaddleOCR is not available. Run the initialization cell first "
          "(restart the kernel if initialization failed).")
else:
    print(f"Processing {len(layout_files)} layout files with {OCR_LIBRARY}...\n")
    
    for layout_file in layout_files:
        layout_lib = extract_layout_library(layout_file.name)
        print(f"Processing: {layout_file.name} (layout: {layout_lib})")
        
        # Load layout.
        # Read as UTF-8 explicitly; without it the platform default encoding is used.
        # NOTE(review): assumes the layout notebooks write UTF-8 (or plain ASCII) JSON - confirm.
        with open(layout_file, 'r', encoding='utf-8') as f:
            layout = json.load(f)
        
        # Load image; a layout whose image is missing is reported and skipped
        img_path = INPUT_FOLDER / layout['document']
        if not img_path.exists():
            print(f"  Image not found: {img_path}")
            continue
        
        # convert() returns an independent in-memory copy, so the file handle is
        # released right away instead of staying open for the rest of the loop
        with Image.open(img_path) as source_image:
            img = source_image.convert("RGB")
        
        # Run OCR on all regions
        results = {
            "document": layout['document'],
            "layout_library": layout_lib,
            "ocr_library": OCR_LIBRARY,
            "image_width": layout['image_width'],
            "image_height": layout['image_height'],
            "num_regions": len(layout['regions']),
            "regions": []
        }
        
        for region in layout['regions']:
            ocr_output = run_ocr_on_region(img, region['bbox'])
            results['regions'].append({
                "id": region['id'],
                "type": region['type'],
                "layout_confidence": region['confidence'],
                "bbox": region['bbox'],
                "ocr": ocr_output
            })
        
        # Save JSON with both layout and OCR library in filename
        doc_stem = Path(layout['document']).stem
        json_path = OUTPUT_FOLDER / f"{doc_stem}_ocr_{layout_lib}_{OCR_LIBRARY}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        
        # Save text version: header line, divider, then one "[type]" section per region
        txt_path = OUTPUT_FOLDER / f"{doc_stem}_ocr_{layout_lib}_{OCR_LIBRARY}.txt"
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(f"Layout: {layout_lib}, OCR: {OCR_LIBRARY}\n")
            f.write("=" * 50 + "\n\n")
            for region in results['regions']:
                f.write(f"[{region['type']}]\n")
                f.write(region['ocr']['full_text'])
                f.write("\n\n")
        
        print(f"  Saved: {json_path.name}")
        print(f"  Regions: {len(results['regions'])}")
    
    print(f"\nDone! Results saved to {OUTPUT_FOLDER}")