# OCR Extraction with EasyOCR
Extract text from document regions using EasyOCR.

**Workflow:**
1. Load layout detection results from ALL layout methods (doclayout_yolo, doctr, surya)
2. Run OCR on each detected region
3. Export results to JSON with layout and OCR library info

In [1]:
# Install dependencies
import subprocess
import sys

def install(package):
    """Install ``package`` with pip, quietly, using the interpreter that runs this kernel.

    Raises:
        subprocess.CalledProcessError: if the pip process exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package, "-q"]
    subprocess.check_call(pip_command)

# Install EasyOCR
# Import first; only fall back to a pip install when the package is missing,
# so re-running this cell in a prepared environment does not touch pip.
try:
    import easyocr
except ImportError:
    print("Installing EasyOCR...")
    # NOTE(review): the install is unpinned, so the resolved EasyOCR version
    # may differ between runs/machines - consider pinning a version.
    install("easyocr")
    # Retry the import now that the package has been installed.
    import easyocr

print("Dependencies ready!")

Dependencies ready!


In [2]:
import os
import json
import re
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from IPython.display import display, Image as IPImage
import easyocr

print("Imports ready!")

Imports ready!


In [3]:
# Configuration
INPUT_FOLDER = Path("filled_documents")  # document images referenced by the layout JSON files
LAYOUT_RESULTS_FOLDER = Path("layout_results")  # layout detection JSON files are read from here
OUTPUT_FOLDER = Path("ocr_results")  # OCR JSON/text results are written here
OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)  # created up front; no error if it already exists

# OCR Library name (for output file naming)
OCR_LIBRARY = "easyocr"

# OCR settings
OCR_LANG = "en"  # EasyOCR language code, e.g. 'en', 'ch_sim', 'ch_tra', 'fr', 'de', 'ko', 'ja'

In [4]:
# Initialize EasyOCR
# First run will download models (~100MB)

import ssl

# Workaround for SSL certificate issues during the model download.
# SECURITY: this turns off HTTPS certificate verification, so it is applied only
# while the Reader is being constructed (EasyOCR fetches missing model files at
# that point) and the original context factory is restored afterwards, instead
# of leaving verification disabled for the rest of the kernel session.
_original_https_context = ssl._create_default_https_context
ssl._create_default_https_context = ssl._create_unverified_context

print("Initializing EasyOCR...")
try:
    reader = easyocr.Reader(
        [OCR_LANG],
        gpu=False  # Set to True if you have CUDA
    )
finally:
    # Always restore certificate verification, even if initialization fails.
    ssl._create_default_https_context = _original_https_context
print("EasyOCR ready!")

Using CPU. Note: This module is much faster with a GPU.


Initializing EasyOCR...
EasyOCR ready!


In [5]:
def extract_layout_library(filename):
    """
    Return the layout library encoded in a layout-result JSON filename.

    Names are expected to look like '<document>_layout_<library>.json', e.g.
    'Demand Letter_batch_1_layout_doclayout_yolo.json' -> 'doclayout_yolo'.

    Any other name - including the old '<document>_layout.json' format, which
    carries no library suffix - yields 'unknown'.
    """
    # The library is everything between '_layout_' and the '.json' suffix
    # (the captured part may not contain a dot).
    found = re.search(r'_layout_([^.]+)\.json$', filename)
    return found.group(1) if found else 'unknown'

# Collect the layout results produced by ALL layout detection methods
layout_files = sorted(LAYOUT_RESULTS_FOLDER.glob("*_layout_*.json"))

# Append old-format files ('*_layout.json') whose names are not already listed
known_names = {path.name for path in layout_files}
old_format_files = sorted(
    path for path in LAYOUT_RESULTS_FOLDER.glob("*_layout.json")
    if path.name not in known_names
)
layout_files += old_format_files

if layout_files:
    print(f"Found {len(layout_files)} layout result files:")
    for index, layout_path in enumerate(layout_files):
        lib = extract_layout_library(layout_path.name)
        print(f"  {index}: {layout_path.name} (layout: {lib})")
else:
    print(f"No layout JSON files found in '{LAYOUT_RESULTS_FOLDER}'.")
    print("Run the layout detection notebooks (04_*) first.")

Found 3 layout result files:
  0: Demand Letter_batch_1_layout_doclayout_yolo.json (layout: doclayout_yolo)
  1: Demand Letter_batch_1_layout_doctr.json (layout: doctr)
  2: Demand Letter_batch_1_layout_surya.json (layout: surya)


In [6]:
def run_ocr_on_region(image, bbox, ocr_reader=None):
    """Run OCR on a rectangular region of a page image.

    Args:
        image: Page image exposing ``crop((x1, y1, x2, y2))`` whose result can be
            converted with ``np.array`` (a PIL image in this notebook).
        bbox: Mapping with 'x1', 'y1', 'x2', 'y2' keys giving the region in
            image pixel coordinates.
        ocr_reader: Object with an EasyOCR-style ``readtext(array)`` method that
            returns ``(box, text, confidence)`` tuples. Defaults to the
            notebook-level ``reader``.

    Returns:
        dict with:
            "full_text": all recognized text joined with single spaces.
            "lines": one dict per detection with "text", "confidence"
                (rounded to 4 decimals) and "bbox_in_region" (axis-aligned box
                relative to the top-left corner of the cropped region).
        A region with zero or negative area yields empty text and no lines.
    """
    x1, y1, x2, y2 = bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']

    # A degenerate box has no pixels to read; skip it instead of letting the
    # crop / OCR call fail on an empty image.
    if x2 <= x1 or y2 <= y1:
        return {"full_text": "", "lines": []}

    if ocr_reader is None:
        ocr_reader = reader

    # Crop region and hand it to the OCR engine as an array
    region_array = np.array(image.crop((x1, y1, x2, y2)))

    text_lines = []
    for box, text, confidence in ocr_reader.readtext(region_array):
        # box is a quadrilateral of four [x, y] points. It is not guaranteed to
        # be axis-aligned (e.g. slanted text), so take the extremes over all
        # four points rather than trusting corners 0 and 2.
        xs = [point[0] for point in box]
        ys = [point[1] for point in box]
        text_lines.append({
            "text": text,
            # float() keeps the value a plain Python float for JSON export
            "confidence": round(float(confidence), 4),
            "bbox_in_region": {
                "x1": int(min(xs)),
                "y1": int(min(ys)),
                "x2": int(max(xs)),
                "y2": int(max(ys))
            }
        })

    # Combine all text
    full_text = " ".join(line["text"] for line in text_lines)

    return {
        "full_text": full_text,
        "lines": text_lines
    }

In [7]:
# ============================================================
# CONFIGURATION: Select layout file to process (for testing)
# ============================================================
SELECTED_LAYOUT_INDEX = 0  # Change this to select a different file

if layout_files:
    selected_layout = layout_files[SELECTED_LAYOUT_INDEX]
    layout_library = extract_layout_library(selected_layout.name)
    print(f"Selected: {selected_layout.name}")
    print(f"Layout library: {layout_library}")

    # Load layout results. Read explicitly as UTF-8 (the encoding this notebook
    # uses for its own JSON output) instead of the platform default.
    # NOTE(review): assumes the layout notebooks write UTF-8 JSON - confirm.
    with open(selected_layout, 'r', encoding='utf-8') as f:
        layout_data = json.load(f)

    print(f"Document: {layout_data['document']}")
    print(f"Regions: {layout_data['num_regions']}")

    # Load corresponding image
    image_path = INPUT_FOLDER / layout_data['document']
    if image_path.exists():
        # The context manager closes the file handle; convert() returns an
        # independent in-memory copy that stays usable afterwards.
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        print(f"Image loaded: {image.size}")
    else:
        # Drop any image left over from a previous run so later cells cannot
        # pair this layout with a different document's pixels.
        globals().pop('image', None)
        print(f"Image not found: {image_path}")
else:
    print("No layout files available")

Selected: Demand Letter_batch_1_layout_doclayout_yolo.json
Layout library: doclayout_yolo
Document: Demand Letter_batch_1.png
Regions: 10
Image loaded: (1224, 1584)


In [8]:
# Run OCR on every detected region of the selected layout (single file test)
single_file_inputs_ready = 'layout_data' in dir() and 'image' in dir()

if not single_file_inputs_ready:
    print("Load layout data first (run the cells above)")
else:
    detected_regions = layout_data['regions']
    print(f"Running OCR on {len(detected_regions)} regions...\n")

    # Document-level metadata; per-region results are appended below
    ocr_results = {
        "document": layout_data['document'],
        "layout_library": layout_library,
        "ocr_library": OCR_LIBRARY,
        "image_width": layout_data['image_width'],
        "image_height": layout_data['image_height'],
        "num_regions": len(detected_regions),
        "regions": []
    }

    for region in detected_regions:
        print(f"Processing region {region['id']}: {region['type']}...")

        ocr_output = run_ocr_on_region(image, region['bbox'])

        # Layout info and OCR output are stored side by side for each region
        ocr_results['regions'].append({
            "id": region['id'],
            "type": region['type'],
            "layout_confidence": region['confidence'],
            "bbox": region['bbox'],
            "ocr": ocr_output
        })

        # Preview: at most 80 characters, with an ellipsis when truncated
        recognized_text = ocr_output['full_text']
        if len(recognized_text) > 80:
            text_preview = recognized_text[:80] + "..."
        else:
            text_preview = recognized_text
        print(f"  Text: {text_preview}")
        print(f"  Lines: {len(ocr_output['lines'])}\n")

    print("OCR complete!")

Running OCR on 10 regions...

Processing region 1: plain text...
  Text: If you do not pay this judgment immediately, we will proceed to collect this jud...
  Lines: 2

Processing region 2: plain text...
  Text: Demand is hereby made for payment of the judgment amount and, if applicable, int...
  Lines: 2

Processing region 3: plain text...
  Text: If you cannot pay this judgment; you have the right to designate property to lev...
  Lines: 2

Processing region 4: plain text...
  Text: This notice is being sent to you as a courtesy with the intention of saving you ...
  Lines: 3

Processing region 5: title...
  Text: (Insert Constable'$ Letterhead)
  Lines: 1

Processing region 6: plain text...
  Text: John Doe Defendant/Judgment Debtor 123 Main St
  Lines: 3

Processing region 7: plain text...
  Text: January 15 2026
  Lines: 2

Processing region 8: plain text...
  Text: Dallas, FX Address
  Lines: 2

Processing region 9: title...
  Text: You are hereby notified to call (555_1414411 im

In [9]:
# Print the recognized text of every region from the single-file run
if 'ocr_results' in dir():
    banner = "=" * 60
    print(banner)
    print(f"OCR RESULTS SUMMARY (Layout: {layout_library}, OCR: {OCR_LIBRARY})")
    print(banner)

    separator = "-" * 40
    for region in ocr_results['regions']:
        region_text = region['ocr']['full_text']
        print(f"\n[Region {region['id']}] {region['type']}")
        print(separator)
        print(region_text)
        print()

OCR RESULTS SUMMARY (Layout: doclayout_yolo, OCR: easyocr)

[Region 1] plain text
----------------------------------------
If you do not pay this judgment immediately, we will proceed to collect this judgment by seizing any non-exempt property belonging to you according to law:


[Region 2] plain text
----------------------------------------
Demand is hereby made for payment of the judgment amount and, if applicable, interest, cour costs, attorney fees, and all other costs connected with collection ordered by this writ_


[Region 3] plain text
----------------------------------------
If you cannot pay this judgment; you have the right to designate property to levy upon to satisfy this judgment if payment is not forthcoming


[Region 4] plain text
----------------------------------------
This notice is being sent to you as a courtesy with the intention of saving you time and additional costs. All payments must be made through this office to ensure proper credit toward this judgment


[R

In [10]:
# Write the single-file OCR results to disk as JSON plus a plain-text companion
if 'ocr_results' in dir():
    # Output names carry both the layout library and the OCR library
    doc_stem = Path(layout_data['document']).stem
    json_path = OUTPUT_FOLDER / f"{doc_stem}_ocr_{layout_library}_{OCR_LIBRARY}.json"
    txt_path = OUTPUT_FOLDER / f"{doc_stem}_ocr_{layout_library}_{OCR_LIBRARY}.txt"

    with json_path.open('w', encoding='utf-8') as json_file:
        json.dump(ocr_results, json_file, indent=2, ensure_ascii=False)

    print(f"Saved OCR results to: {json_path}")

    # Text version: a two-line header, then one "[type]" + text block per region
    text_chunks = [
        f"Layout: {layout_library}, OCR: {OCR_LIBRARY}\n",
        "=" * 50 + "\n\n",
    ]
    for region in ocr_results['regions']:
        text_chunks.append(f"[{region['type']}]\n")
        text_chunks.append(region['ocr']['full_text'])
        text_chunks.append("\n\n")

    with txt_path.open('w', encoding='utf-8') as text_file:
        text_file.writelines(text_chunks)

    print(f"Saved text version to: {txt_path}")

Saved OCR results to: ocr_results\Demand Letter_batch_1_ocr_doclayout_yolo_easyocr.json
Saved text version to: ocr_results\Demand Letter_batch_1_ocr_doclayout_yolo_easyocr.txt


In [11]:
# Batch process ALL layout files from ALL layout detection methods.
# For each layout JSON: load the referenced image, OCR every region, then save
# a JSON result and a plain-text companion named after both libraries.
if layout_files:
    print(f"Processing {len(layout_files)} layout files with {OCR_LIBRARY}...\n")

    for layout_file in layout_files:
        layout_lib = extract_layout_library(layout_file.name)
        print(f"Processing: {layout_file.name} (layout: {layout_lib})")

        # Load layout. Read explicitly as UTF-8 (the encoding used for the
        # output files below) instead of the platform default.
        # NOTE(review): assumes the layout notebooks write UTF-8 JSON - confirm.
        with open(layout_file, 'r', encoding='utf-8') as f:
            layout = json.load(f)

        # Load image; a missing image skips this layout file only
        img_path = INPUT_FOLDER / layout['document']
        if not img_path.exists():
            print(f"  Image not found: {img_path}")
            continue

        # Close the file handle on every iteration; convert() returns an
        # independent in-memory copy that stays usable afterwards.
        with Image.open(img_path) as source_image:
            img = source_image.convert("RGB")

        # Run OCR on all regions
        results = {
            "document": layout['document'],
            "layout_library": layout_lib,
            "ocr_library": OCR_LIBRARY,
            "image_width": layout['image_width'],
            "image_height": layout['image_height'],
            "num_regions": len(layout['regions']),
            "regions": []
        }

        for region in layout['regions']:
            ocr_output = run_ocr_on_region(img, region['bbox'])
            results['regions'].append({
                "id": region['id'],
                "type": region['type'],
                "layout_confidence": region['confidence'],
                "bbox": region['bbox'],
                "ocr": ocr_output
            })

        # Save JSON with both layout and OCR library in filename
        doc_stem = Path(layout['document']).stem
        json_path = OUTPUT_FOLDER / f"{doc_stem}_ocr_{layout_lib}_{OCR_LIBRARY}.json"
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

        # Save text version
        txt_path = OUTPUT_FOLDER / f"{doc_stem}_ocr_{layout_lib}_{OCR_LIBRARY}.txt"
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(f"Layout: {layout_lib}, OCR: {OCR_LIBRARY}\n")
            f.write("=" * 50 + "\n\n")
            for region in results['regions']:
                f.write(f"[{region['type']}]\n")
                f.write(region['ocr']['full_text'])
                f.write("\n\n")

        print(f"  Saved: {json_path.name}")
        print(f"  Regions: {len(results['regions'])}")

    print(f"\nDone! Results saved to {OUTPUT_FOLDER}")
else:
    print("No layout files to process")

Processing 3 layout files with easyocr...

Processing: Demand Letter_batch_1_layout_doclayout_yolo.json (layout: doclayout_yolo)
  Saved: Demand Letter_batch_1_ocr_doclayout_yolo_easyocr.json
  Regions: 10
Processing: Demand Letter_batch_1_layout_doctr.json (layout: doctr)
  Saved: Demand Letter_batch_1_ocr_doctr_easyocr.json
  Regions: 180
Processing: Demand Letter_batch_1_layout_surya.json (layout: surya)
  Saved: Demand Letter_batch_1_ocr_surya_easyocr.json
  Regions: 23

Done! Results saved to ocr_results
