In [3]:
!apt-get install -y poppler-utils tesseract-ocr

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.


In [16]:
!pip install pytesseract easyocr paddlepaddle paddleocr pdf2image pillow jiwer Levenshtein openmim python-doctr ocrmypdf pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3


In [15]:
!apt-get install -y ghostscript

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ghostscript is already the newest version (9.55.0~dfsg1-0ubuntu5.10).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.


In [17]:
import time
import cv2
import numpy as np
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import fitz
import pytesseract
from paddleocr import PaddleOCR
import easyocr
from doctr.io import DocumentFile
from doctr.models import ocr_predictor
import ocrmypdf
from jiwer import wer, cer, Compose, RemovePunctuation, RemoveWhiteSpace, ToLowerCase

def preprocess_image(image_path):
    """Preprocess the image to improve OCR accuracy."""
    image = cv2.imread(image_path, cv2.IMREAD_COLOR)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
    image = cv2.resize(image, None, fx=300/72, fy=300/72, interpolation=cv2.INTER_CUBIC)  # Set resolution to 300 DPI
    # Apply Gaussian Blur to reduce noise
    image = cv2.GaussianBlur(image, (5, 5), 0)

    # Apply Otsu's thresholding for binarization
    _, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Apply morphological operations to remove small noise
    kernel = np.ones((1, 1), np.uint8)
    image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel)
    image = cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

    # Apply median blur for further noise reduction
    image = cv2.medianBlur(image, 3)

    preprocessed_path = "preprocessed_image.jpg"
    cv2.imwrite(preprocessed_path, image)
    return preprocessed_path

# Initialize OCR engines globally
paddle_ocr_engine = PaddleOCR(use_angle_cls=True, lang="en")
easyocr_reader = easyocr.Reader(['en'])
doctr_model = ocr_predictor(pretrained=True)

# Define text normalization pipeline
transform = Compose([
    RemovePunctuation(),
    RemoveWhiteSpace(replace_by_space=True),
    ToLowerCase()
])

def ocr_tesseract(image_path):
    """Performs OCR using Tesseract"""
    try:
        return pytesseract.image_to_string(image_path).strip()
    except Exception as e:
        print(f"Error in Tesseract OCR: {e}")
        return ""

def ocr_paddleocr(image_path):
    """Performs OCR using PaddleOCR"""
    try:
        if not paddle_ocr_engine:
            raise RuntimeError("PaddleOCR engine is not initialized.")
        result = paddle_ocr_engine.ocr(image_path, cls=True)
        return " ".join([line[1][0] for region in result for line in region])  # Extract recognized text
    except Exception as e:
        print(f"Error in PaddleOCR: {e}")
        return ""

def ocr_easyocr(image_path):
    """Performs OCR using EasyOCR"""

    try:
        if not easyocr_reader:
            raise RuntimeError("EasyOCR reader is not initialized.")
        result = easyocr_reader.readtext(image_path)
        return " ".join([line[1] for line in result])  # Extract recognized text
    except Exception as e:
        print(f"Error in EasyOCR: {e}")
        return ""


def ocr_doctr(image_path):
    """Performs OCR using DocTR with correct file input handling."""

    try:
        if not doctr_model:
            raise RuntimeError("DocTR model is not initialized.")
        doc = DocumentFile.from_images(image_path)  # Convert image for DocTR processing
        result = doctr_model(doc)  # Perform OCR
        recognized_text = " ".join([
            word.value for block in result.pages[0].blocks
            for line in block.lines for word in line.words
        ])
        return recognized_text
    except Exception as e:
        print(f"Error in DocTR OCR: {e}")
        return ""



def evaluate_ocr(image_path, ground_truth, framework):
    """
    Evaluates OCR output using the selected framework.

    :param image_path: Path to the input image.
    :param ground_truth: The actual text in the image (as a list or string).
    :param framework: OCR framework to use ('tesseract', 'paddleocr', 'easyocr', 'mmocr', 'doctr').
    :return: Dictionary containing WER, CER, Precision, Recall, Latency, and Layout Detection.
    """

    # Convert ground_truth list to a single string if necessary
    if isinstance(ground_truth, list):
        ground_truth = " ".join(ground_truth)

    # Select OCR framework
    ocr_function = {
        "tesseract": ocr_tesseract,
        "paddleocr": ocr_paddleocr,
        "easyocr": ocr_easyocr,
        "doctr": ocr_doctr,
    }.get(framework.lower())

    if ocr_function is None:
        raise ValueError(f"Unsupported OCR framework: {framework}")

    # Measure latency
    start_time = time.time()
    ocr_output = ocr_function(image_path)
    latency = time.time() - start_time

    # Normalize text
    ground_truth_normalized = transform(ground_truth)
    ocr_output_normalized = transform(ocr_output)

    # Compute Word Error Rate (WER) and Character Error Rate (CER)
    word_error_rate = wer(ground_truth_normalized, ocr_output_normalized)
    char_error_rate = cer(ground_truth_normalized, ocr_output_normalized)

    # Compute Precision and Recall
    truth_chars = set(ground_truth_normalized)
    ocr_chars = set(ocr_output_normalized)

    true_positive = len(truth_chars & ocr_chars)  # Correctly recognized characters
    false_positive = len(ocr_chars - truth_chars)  # Extra characters recognized
    false_negative = len(truth_chars - ocr_chars)  # Missing characters

    precision = true_positive / (true_positive + false_positive) if (true_positive + false_positive) > 0 else 0
    recall = true_positive / (true_positive + false_negative) if (true_positive + false_negative) > 0 else 0

    # Perform Layout Detection (simply counting words as an approximation)
    layout_detection = len(ocr_output.split())

    return {
        "WER": word_error_rate,
        "CER": char_error_rate,
        "Precision": precision,
        "Recall": recall,
        "Latency (seconds)": latency,
        "Layout Detection (approx. word count)": layout_detection,
        "OCR Output": ocr_output
    }

# Example usage
if __name__ == "__main__":
    image_path = "image.jpg"  # Provide the path to your test image
    ground_truth = "I see a light in the darkness"

    frameworks = ["tesseract", "paddleocr", "easyocr", "doctr"]
    results = {}

    for framework in frameworks:
        results = evaluate_ocr(image_path, ground_truth, framework)
        print(f"\nOCR Evaluation for {framework.upper()}:")
        for metric, value in results.items():
            print(f"{metric}: {value}")



[2025/02/08 10:27:47] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_c




OCR Evaluation for TESSERACT:
WER: 1.7142857142857142
CER: 1.1379310344827587
Precision: 0.7647058823529411
Recall: 1.0
Latency (seconds): 3.536595582962036
Layout Detection (approx. word count): 13
OCR Output: |SEE A LIGHT

 

ei
DARKNESS

 

ft
if
Ho
f i
rk
4g
if
[2025/02/08 10:28:00] ppocr DEBUG: dt_boxes num : 3, elapsed : 0.22078919410705566
[2025/02/08 10:28:00] ppocr DEBUG: cls num  : 3, elapsed : 0.018848657608032227
[2025/02/08 10:28:00] ppocr DEBUG: rec_res num  : 3, elapsed : 0.1311631202697754

OCR Evaluation for PADDLEOCR:
WER: 0.0
CER: 0.0
Precision: 1.0
Recall: 1.0
Latency (seconds): 0.954735279083252
Layout Detection (approx. word count): 7
OCR Output: I SEE A LIGHT IN THE DARKNESS

OCR Evaluation for EASYOCR:
WER: 0.2857142857142857
CER: 0.034482758620689655
Precision: 1.0
Recall: 1.0
Latency (seconds): 87.41713833808899
Layout Detection (approx. word count): 6
OCR Output: ISEE A LIGHT IN THE DARKNESS
Error in DocTR OCR: Given input size: (128x1x16). Calculated output