### Arabic OCR Experiment

This notebook compares different OCR methods for Arabic text recognition on receipt images, including Docling with multiple OCR backends.

### Imports and Configuration

In [None]:
# Standard library imports
import os
from pathlib import Path
from types import SimpleNamespace

# Third-party imports
import cv2
import numpy as np
import pandas as pd
import pytesseract
import easyocr
from PIL import Image
from dotenv import load_dotenv

# Docling imports
from docling.document_converter import DocumentConverter, ImageFormatOption
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    EasyOcrOptions,
    TesseractOcrOptions,
    TesseractCliOcrOptions
)
from docling.datamodel.base_models import InputFormat

# The notebook directory is taken to be the current working directory
NOTEBOOK_DIR = Path.cwd()
print(f"Notebook directory: {NOTEBOOK_DIR}")

# Prefer a .env file sitting next to the notebook; when there is none,
# fall back to load_dotenv() with no explicit path
env_path = NOTEBOOK_DIR / '.env'
if not env_path.exists():
    load_dotenv()
    print("✅ Loaded environment variables")
else:
    load_dotenv(env_path)
    print(f"✅ Loaded environment from: {env_path}")

print("✅ All imports successful!")

### Tesseract Configuration

In [None]:
class TesseractConfig:
    """Locate the Tesseract executable and register it with pytesseract.

    The executable is resolved from the TESSERACT_CMD environment variable
    when it is set, otherwise from PATH, otherwise from a short list of
    well-known install locations.
    """

    # Install locations probed when Tesseract is not reachable through PATH
    FALLBACK_PATHS = (
        '/usr/bin/tesseract',  # Linux system
        '/usr/local/bin/tesseract',  # Linux local
    )

    @staticmethod
    def find_tesseract():
        """Find the Tesseract executable.

        Resolution order:
          1. TESSERACT_CMD (from the .env file or the system environment).
             An absolute path is returned unchanged. A relative path is
             resolved against NOTEBOOK_DIR. A bare command name such as
             "tesseract" that does not exist inside NOTEBOOK_DIR is looked
             up on PATH instead of being turned into a non-existent
             notebook-local path.
          2. "tesseract" on PATH.
          3. The entries of FALLBACK_PATHS that exist on disk.

        Returns:
            The path of the executable as a string, or None when nothing
            was found. A path given through TESSERACT_CMD is returned even
            if the file does not exist.
        """
        import shutil  # local import: only this method needs PATH lookups

        # An empty or whitespace-only value is treated the same as "unset"
        configured = os.environ.get('TESSERACT_CMD', '').strip()
        if configured:
            if os.path.isabs(configured):
                return configured

            local_candidate = os.path.join(NOTEBOOK_DIR, configured)
            is_bare_name = os.path.basename(configured) == configured
            if is_bare_name and not os.path.exists(local_candidate):
                # Fall back to the notebook-relative path when PATH has no match
                return shutil.which(configured) or local_candidate
            return local_candidate

        # Actually verify that the command is reachable instead of assuming it is
        on_path = shutil.which('tesseract')
        if on_path:
            return on_path

        for path in TesseractConfig.FALLBACK_PATHS:
            if os.path.exists(path):
                return path

        return None

    @classmethod
    def configure(cls):
        """Configure the Tesseract path and report the tessdata directory.

        Sets pytesseract.pytesseract.tesseract_cmd to the executable found by
        find_tesseract() and prints the TESSDATA_PREFIX value when it is set
        (a warning is printed otherwise).

        Returns:
            True when an executable was found and registered, False when no
            Tesseract executable could be located.
        """
        tesseract_cmd = cls.find_tesseract()
        if not tesseract_cmd:
            print("❌ Warning: Tesseract not found")
            return False

        pytesseract.pytesseract.tesseract_cmd = tesseract_cmd
        print(f"✅ Using Tesseract at: {tesseract_cmd}")

        # TESSDATA_PREFIX is only read here; an empty value counts as missing
        tessdata_prefix = os.environ.get('TESSDATA_PREFIX')
        if tessdata_prefix:
            print(f"✅ TESSDATA_PREFIX set to: {tessdata_prefix}")
        else:
            print("⚠️ Warning: TESSDATA_PREFIX not set. Add it to .env file for Docling to work properly")

        return True

# Configure Tesseract for this session. configure() returns False when no
# Tesseract executable is found and True otherwise.
TesseractConfig.configure()

### OCR Processor Class with Multiple Methods

In [None]:
class OCRProcessor:
    """Run several OCR engines over an image and return the recognized text.

    Every public ``extract_*`` method takes an image path and returns a
    string: the recognized text on success, or a message starting with
    "Error:" when the engine failed. Errors are reported as text rather than
    raised so that one failing engine does not abort a comparison run.
    """

    # Page-segmentation configs retried, in order, when the first pass is empty
    FALLBACK_PSM_CONFIGS = ('--psm 3', '--psm 4', '--psm 11')

    def __init__(self):
        """Initialize the EasyOCR reader and the Docling converters."""
        self.easyocr_reader = easyocr.Reader(['ar', 'en'])
        print("✅ EasyOCR reader initialized")

        self._init_docling_converters()

    @staticmethod
    def _build_docling_converter(label, make_ocr_options):
        """Build one Docling image converter, or return None when that fails.

        Args:
            label: Backend name used in the printed status messages.
            make_ocr_options: Zero-argument callable returning the OCR options
                object; it is called inside the try block so that an error
                while building the options is reported like any other failure.
        """
        try:
            converter = DocumentConverter(
                format_options={
                    InputFormat.IMAGE: ImageFormatOption(
                        pipeline_options=PdfPipelineOptions(
                            do_ocr=True,
                            ocr_options=make_ocr_options(),
                        )
                    )
                }
            )
        except Exception as e:
            print(f"⚠️ Warning: Docling {label} converter failed to initialize: {e}")
            return None

        print(f"✅ Docling {label} converter initialized")
        return converter

    def _init_docling_converters(self):
        """Initialize the Docling converters, one per OCR backend.

        Each converter is built independently: a backend that fails to
        initialize is stored as None and does not prevent the remaining
        backends from being set up.
        """
        self.docling_easyocr_converter = self._build_docling_converter(
            'EasyOCR',
            lambda: EasyOcrOptions(
                lang=['ar', 'en'],
                use_gpu=False,
                confidence_threshold=0.5,
            ),
        )

        self.docling_tesseract_converter = self._build_docling_converter(
            'Tesseract',
            lambda: TesseractOcrOptions(lang=['ara', 'eng']),
        )

        # Reuse the executable path that is configured for pytesseract
        tesseract_path = pytesseract.pytesseract.tesseract_cmd
        print(f"  Using Tesseract path for Docling: {tesseract_path}")

        self.docling_tesseract_cli_converter = self._build_docling_converter(
            'TesseractCLI',
            lambda: TesseractCliOcrOptions(
                lang=['ara', 'eng'],
                path=tesseract_path,
            ),
        )

    @staticmethod
    def _decode_ocr_output(raw):
        """Return OCR output as str, decoding it first when it is bytes.

        UTF-8 is tried first, then cp1256 (Arabic Windows encoding); as a
        last resort UTF-8 with replacement characters is used. Values that
        are not bytes are returned unchanged.
        """
        if not isinstance(raw, bytes):
            return raw

        for encoding in ('utf-8', 'cp1256'):
            try:
                return raw.decode(encoding)
            except UnicodeDecodeError:
                continue
        return raw.decode('utf-8', errors='replace')

    def _run_tesseract(self, pil_image, config):
        """Run one pytesseract pass (Arabic + English) and return stripped text."""
        raw = pytesseract.image_to_string(
            pil_image,
            lang='ara+eng',
            config=config
        )
        return self._decode_ocr_output(raw).strip()

    def safe_pytesseract_extract(self, image, config='--psm 6'):
        """Extract text from an image array using pytesseract.

        Args:
            image: Image array accepted by ``PIL.Image.fromarray``.
            config: Tesseract config string for the first pass.

        Returns:
            The recognized text, "No text detected" when every pass came back
            empty, or "Error: <message>" when an exception occurred.
        """
        try:
            pil_image = Image.fromarray(image)
            text = self._run_tesseract(pil_image, config)

            # If text is empty, try alternative PSM modes
            if not text:
                for psm_mode in self.FALLBACK_PSM_CONFIGS:
                    if psm_mode == config:
                        # Already tried as the first pass; repeating it is pointless
                        continue
                    text = self._run_tesseract(pil_image, psm_mode)
                    if text:
                        break

            return text if text else "No text detected"

        except Exception as e:
            return f"Error: {str(e)}"

    def _pytesseract_from_file(self, image_path, preprocess):
        """Load an image with OpenCV, preprocess it and OCR it with pytesseract.

        Args:
            image_path: Path of the image file.
            preprocess: Callable that receives the array returned by
                ``cv2.imread`` and returns the array to hand to pytesseract.
        """
        try:
            img = cv2.imread(str(image_path))
            if img is None:
                return "Error: Could not read image"

            return self.safe_pytesseract_extract(preprocess(img))
        except Exception as e:
            return f"Error: {e}"

    @staticmethod
    def _run_docling(converter, label, image_path):
        """Convert an image with a Docling converter and export it as markdown."""
        if converter is None:
            return f"Error: Docling {label} converter is not available"
        try:
            result = converter.convert(str(image_path))
            return result.document.export_to_markdown()
        except Exception as e:
            return f"Error: {e}"

    # Standard OCR Methods
    def extract_easyocr(self, image_path):
        """Extract text using EasyOCR, one detected text fragment per line."""
        try:
            results = self.easyocr_reader.readtext(str(image_path))
            # Index 1 of each result entry is the recognized text
            return "\n".join(result[1] for result in results)
        except Exception as e:
            return f"Error: {e}"

    def extract_pytesseract(self, image_path):
        """Extract text using Pytesseract on the BGR-to-RGB converted image."""
        return self._pytesseract_from_file(
            image_path,
            lambda img: cv2.cvtColor(img, cv2.COLOR_BGR2RGB),
        )

    def extract_opencv(self, image_path):
        """Extract text using OpenCV preprocessing (grayscale + Otsu threshold) + Pytesseract."""
        def binarize(img):
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            return binary

        return self._pytesseract_from_file(image_path, binarize)

    def extract_enhanced(self, image_path):
        """Extract text with enhanced preprocessing (blur + adaptive threshold)."""
        def enhance(img):
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            blurred = cv2.GaussianBlur(gray, (5, 5), 0)
            return cv2.adaptiveThreshold(
                blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
            )

        return self._pytesseract_from_file(image_path, enhance)

    # Docling OCR Methods
    def extract_docling_easyocr(self, image_path):
        """Extract text using Docling with EasyOCR backend"""
        converter = getattr(self, 'docling_easyocr_converter', None)
        return self._run_docling(converter, 'EasyOCR', image_path)

    def extract_docling_tesseract(self, image_path):
        """Extract text using Docling with Tesseract backend"""
        converter = getattr(self, 'docling_tesseract_converter', None)
        return self._run_docling(converter, 'Tesseract', image_path)

    def extract_docling_tesseract_cli(self, image_path):
        """Extract text using Docling with TesseractCLI backend"""
        converter = getattr(self, 'docling_tesseract_cli_converter', None)
        return self._run_docling(converter, 'TesseractCLI', image_path)

# Initialize OCR processor (the constructor builds the EasyOCR reader and the Docling converters)
ocr_processor = OCRProcessor()

### Results Processing Class

In [None]:
class OCRResults:
    """Collect, tabulate and print the OCR output for a folder of images."""

    # (key in the result dict, summary-table column, heading in the detailed output)
    STANDARD_METHODS = (
        ('easyocr_text', 'EasyOCR', 'EasyOCR'),
        ('pytesseract_text', 'Pytesseract', 'Pytesseract'),
        ('opencv_text', 'OpenCV', 'OpenCV'),
        ('enhanced_text', 'Enhanced', 'Enhanced'),
    )
    DOCLING_METHODS = (
        ('docling_easyocr_text', 'Docling-EasyOCR', 'Docling-EasyOCR'),
        ('docling_tesseract_text', 'Docling-Tesseract', 'Docling-Tesseract'),
        ('docling_tesseract_cli_text', 'Docling-TessCLI', 'Docling-TesseractCLI'),
    )

    # Longest text shown in a summary-table cell before it is cut and "..." is appended
    PREVIEW_LENGTH = 100

    def __init__(self, images_dir, max_images=10, patterns=('*.jpg',)):
        """Initialize with images directory.

        Args:
            images_dir: Directory that holds the images.
            max_images: Maximum number of images to keep after sorting;
                None keeps all of them.
            patterns: Glob patterns (relative to ``images_dir``) selecting the
                image files.
        """
        self.images_dir = Path(images_dir)
        # A set removes files that are matched by more than one pattern
        matches = {
            path
            for pattern in patterns
            for path in self.images_dir.glob(pattern)
        }
        self.image_paths = sorted(matches)[:max_images]
        self.results = []

        print(f"Working directory: {Path.cwd()}")
        print(f"Images directory: {self.images_dir}")
        print(f"Found {len(self.image_paths)} images to process")
        if not self.images_dir.is_dir():
            print(f"⚠️ Warning: Images directory does not exist: {self.images_dir}")

    @classmethod
    def _preview(cls, text):
        """Return ``text`` cut to PREVIEW_LENGTH characters plus "..." when it is longer."""
        if len(text) > cls.PREVIEW_LENGTH:
            return text[:cls.PREVIEW_LENGTH] + '...'
        return text

    def process_images(self, ocr_processor, include_docling=True):
        """Process all images with different OCR methods.

        Results from a previous call are discarded first, so calling this
        method again (for example by re-running a notebook cell) does not
        duplicate rows.

        Args:
            ocr_processor: Object providing the ``extract_*`` methods.
            include_docling: Also run the three Docling-based extractors.
        """
        self.results = []

        for image_path in self.image_paths:
            print(f"Processing: {image_path.name}")

            result = {
                "image_path": image_path.name,
                "easyocr_text": ocr_processor.extract_easyocr(image_path),
                "pytesseract_text": ocr_processor.extract_pytesseract(image_path),
                "opencv_text": ocr_processor.extract_opencv(image_path),
                "enhanced_text": ocr_processor.extract_enhanced(image_path),
            }

            # Add Docling methods if requested
            if include_docling:
                result["docling_easyocr_text"] = ocr_processor.extract_docling_easyocr(image_path)
                result["docling_tesseract_text"] = ocr_processor.extract_docling_tesseract(image_path)
                result["docling_tesseract_cli_text"] = ocr_processor.extract_docling_tesseract_cli(image_path)

            self.results.append(result)

    def create_dataframe(self, include_docling=True):
        """Create a pandas DataFrame with one row per image and truncated text previews.

        Docling columns are added only when ``include_docling`` is true and
        the stored result actually contains Docling output.
        """
        results_data = []
        for res in self.results:
            row = {'Image': res['image_path']}
            for key, column, _ in self.STANDARD_METHODS:
                row[column] = self._preview(res[key])

            # Add Docling columns if available
            if include_docling and 'docling_easyocr_text' in res:
                for key, column, _ in self.DOCLING_METHODS:
                    row[column] = self._preview(res[key])

            results_data.append(row)

        return pd.DataFrame(results_data)

    def display_results(self, include_docling=True):
        """Print the summary table followed by the full text per image and method."""
        df = self.create_dataframe(include_docling)
        print("\n" + "=" * 120)
        print("OCR RESULTS SUMMARY")
        print("=" * 120)
        print(df.to_string(index=False))

        print("\n" + "=" * 120)
        print("DETAILED RESULTS")
        print("=" * 120)

        for i, res in enumerate(self.results):
            print(f"\n--- IMAGE {i+1}: {res['image_path']} ---")
            for key, _, heading in self.STANDARD_METHODS:
                print(f"\n[{heading}]\n{res[key]}")

            if include_docling and 'docling_easyocr_text' in res:
                for key, _, heading in self.DOCLING_METHODS:
                    print(f"\n[{heading}]\n{res[key]}")

            print("-" * 120)

# Initialize results processor; the image folder comes from the DATASET_IMAGES_DIR
# environment variable and falls back to ./datasets/train/images when it is unset
images_dir = os.getenv('DATASET_IMAGES_DIR', './datasets/train/images')
results_processor = OCRResults(images_dir)

### Available OCR Methods

**Standard Methods:**
- EasyOCR (direct)
- Pytesseract (direct)
- OpenCV + Pytesseract
- Enhanced preprocessing + Pytesseract

**Docling Methods:**
- Docling with EasyOCR backend
- Docling with Tesseract backend
- Docling with TesseractCLI backend

### Run OCR Processing

In [None]:
# Process all images with ALL methods (standard + Docling); the per-image
# results are stored on results_processor.results
results_processor.process_images(ocr_processor, include_docling=True)

# Display results: a truncated summary table followed by the full text per image and method
results_processor.display_results(include_docling=True)