# OCR Models Evaluation on French and English Datasets


In [1]:
# # Install required packages
# !pip install pytesseract easyocr paddlepaddle paddleocr kraken pillow opencv-python
# !pip install google-cloud-vision  
# !pip install lxml beautifulsoup4 
# !pip install matplotlib seaborn pandas numpy scikit-learn

In [2]:
import os
import json
import random
import xml.etree.ElementTree as ET
from pathlib import Path
import pandas as pd
import numpy as np
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Dict, Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# OCR Libraries
import pytesseract
import easyocr
from paddleocr import PaddleOCR


import kraken
from kraken import pageseg
from kraken import rpred


from google.cloud import vision

In [3]:
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/duyle/Documents/Case-Study2/active-sun-455914-a1-141238abf96c.json'


## Dataset Configuration

In [4]:
FRENCH_DATASET_PATH = "/home/duyle/Documents/Case-Study2/French_OCR_dataset/"
ENGLISH_DATASET_PATH = "/home/duyle/Documents/Case-Study2/English_OCR_dataset/"

SAMPLES_PER_DATASET = 10
TOTAL_SAMPLES = SAMPLES_PER_DATASET * 2

print(f"French dataset path: {FRENCH_DATASET_PATH}")
print(f"English dataset path: {ENGLISH_DATASET_PATH}")
print(f"Total samples to process: {TOTAL_SAMPLES}")

French dataset path: /home/duyle/Documents/Case-Study2/French_OCR_dataset/
English dataset path: /home/duyle/Documents/Case-Study2/English_OCR_dataset/
Total samples to process: 20


## Data Loading and Sampling Functions

In [5]:
def get_french_samples(dataset_path, num_samples):
    samples = []
    
    image_files = [f for f in os.listdir(dataset_path) if f.endswith('.jpg')]
    
    valid_pairs = []
    
    for img_file in image_files:
        base_name = img_file.replace('_default.jpg', '')  
        xml_file = None
        
        possible_xml = [
            f"{base_name}_default.xml",  
            f"{base_name[:-1]}g_default.xml",
        ]
        
        for xml_name in possible_xml:
            if os.path.exists(os.path.join(dataset_path, xml_name)):
                xml_file = xml_name
                break
        
        if xml_file:
            valid_pairs.append((img_file, xml_file))
    
    
    selected_pairs = random.sample(valid_pairs, min(num_samples, len(valid_pairs)))
    
    for img_file, xml_file in selected_pairs:
        samples.append({
            'dataset': 'French',
            'image_path': os.path.join(dataset_path, img_file),
            'annotation_path': os.path.join(dataset_path, xml_file),
            'image_name': img_file,
            'annotation_name': xml_file
        })
    
    return samples

def get_english_samples(dataset_path, num_samples):
    """Get random samples from English OCR dataset (JSON format)"""
    samples = []
    
    images_path = os.path.join(dataset_path, 'images')
    annotations_path = os.path.join(dataset_path, 'annotations')
    
    # Get all image files
    image_files = [f for f in os.listdir(images_path) if f.endswith('.png')]
    selected_files = random.sample(image_files, min(num_samples, len(image_files)))
    
    for img_file in selected_files:
        # Find corresponding JSON file
        base_name = img_file.replace('.png', '')
        json_file = f"{base_name}.json"
        
        json_path = os.path.join(annotations_path, json_file)
        if os.path.exists(json_path):
            samples.append({
                'dataset': 'English',
                'image_path': os.path.join(images_path, img_file),
                'annotation_path': json_path,
                'image_name': img_file,
                'annotation_name': json_file
            })
    
    return samples

In [6]:
def extract_text_from_french_xml(xml_path):
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
        
        # Find all String elements with CONTENT attribute
        texts = []
        for string_elem in root.findall('.//{http://www.loc.gov/standards/alto/ns-v4#}String'):
            content = string_elem.get('CONTENT')
            if content:
                texts.append(content)
        
        return ' '.join(texts)
    except Exception as e:
        print(f"Error parsing XML {xml_path}: {e}")
        return ""

def extract_text_from_english_json(json_path):
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        texts = []
        for form_item in data.get('form', []):
            text = form_item.get('text', '').strip()
            if text:
                texts.append(text)
        
        return ' '.join(texts)
    except Exception as e:
        print(f"Error parsing JSON {json_path}: {e}")
        return ""

In [7]:


def extract_boxes_from_french_xml(xml_path):
    """Extract bounding boxes and text from French dataset XML file"""
    try:
        tree = ET.parse(xml_path)
        root = tree.getroot()
        
        boxes = []
        # Find all String elements with coordinates and content
        for string_elem in root.findall('.//{http://www.loc.gov/standards/alto/ns-v4#}String'):
            content = string_elem.get('CONTENT')
            hpos = string_elem.get('HPOS')
            vpos = string_elem.get('VPOS')
            width = string_elem.get('WIDTH')
            height = string_elem.get('HEIGHT')
            
            if content and hpos and vpos and width and height:
                x1 = int(float(hpos))
                y1 = int(float(vpos))
                x2 = x1 + int(float(width))
                y2 = y1 + int(float(height))
                
                boxes.append({
                    'text': content,
                    'box': [x1, y1, x2, y2]
                })
        
        return boxes
    except Exception as e:
        print(f"Error parsing XML boxes {xml_path}: {e}")
        return []

def extract_boxes_from_english_json(json_path):
    """Extract bounding boxes and text from English dataset JSON file"""
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        
        boxes = []
        for form_item in data.get('form', []):
            text = form_item.get('text', '').strip()
            box = form_item.get('box', [])
            
            if text and len(box) == 4:
                boxes.append({
                    'text': text,
                    'box': box 
                })
        
        return boxes
    except Exception as e:
        print(f"Error parsing JSON boxes {json_path}: {e}")
        return []



## Sample Dataset and Load Ground Truth

In [8]:
# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)

# Get samples from both datasets
french_samples = get_french_samples(FRENCH_DATASET_PATH, SAMPLES_PER_DATASET)
english_samples = get_english_samples(ENGLISH_DATASET_PATH, SAMPLES_PER_DATASET)

all_samples = french_samples + english_samples

print(f"French samples: {len(french_samples)}")
print(f"English samples: {len(english_samples)}")
print(f"Total samples: {len(all_samples)}")

# Load ground truth text and bounding boxes for all samples
for sample in all_samples:
    if sample['dataset'] == 'French':
        sample['ground_truth'] = extract_text_from_french_xml(sample['annotation_path'])
        sample['ground_truth_boxes'] = extract_boxes_from_french_xml(sample['annotation_path'])
    else:
        sample['ground_truth'] = extract_text_from_english_json(sample['annotation_path'])
        sample['ground_truth_boxes'] = extract_boxes_from_english_json(sample['annotation_path'])


French samples: 10
English samples: 10
Total samples: 20


## OCR Model Implementations

In [14]:
# Global variables to store initialized models
easyocr_reader = None
paddleocr_reader = None
google_client = None

# Check model availability
models_available = {
    'tesseract': True,
    'easyocr': True,
    'paddleocr': PaddleOCR is not None,
    'kraken': kraken is not None,
    'google_vision': vision is not None
}

def initialize_ocr_models():
    """Initialize all available OCR models"""
    global easyocr_reader, paddleocr_reader, google_client
    
    print("Initializing OCR models...")
    
    # Initialize EasyOCR
    try:
        if models_available['easyocr']:
            easyocr_reader = easyocr.Reader(['en', 'fr'])
            print("✓ EasyOCR initialized")
    except Exception as e:
        print(f"✗ EasyOCR initialization failed: {e}")
        models_available['easyocr'] = False
    
    # Initialize PaddleOCR
    try:
        if models_available['paddleocr']:
            paddleocr_reader = PaddleOCR(use_angle_cls=True, lang='en')
            print("✓ PaddleOCR initialized")
    except Exception as e:
        print(f"✗ PaddleOCR initialization failed: {e}")
        models_available['paddleocr'] = False
    
    # Initialize Google Cloud Vision
    try:
        if models_available['google_vision']:
            google_client = vision.ImageAnnotatorClient()
            print("✓ Google Cloud Vision initialized")
    except Exception as e:
        print(f"✗ Google Cloud Vision initialization failed: {e}")
        models_available['google_vision'] = False
    
    available_models = [k for k, v in models_available.items() if v]
    print(f"\nAvailable models: {available_models}")
    return available_models

def tesseract_ocr(image_path):
    """Run Tesseract OCR on an image"""
    try:
        image = Image.open(image_path)
        text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        print(f"Tesseract error on {image_path}: {e}")
        return ""

def easyocr_ocr(image_path):
    """Run EasyOCR on an image"""
    global easyocr_reader
    try:
        if not easyocr_reader:
            print("EasyOCR not initialized. Run initialize_ocr_models() first.")
            return ""
        results = easyocr_reader.readtext(image_path)
        text = ' '.join([result[1] for result in results])
        return text.strip()
    except Exception as e:
        print(f"EasyOCR error on {image_path}: {e}")
        return ""

def paddleocr_ocr(image_path):
    """Run PaddleOCR on an image with image resizing to prevent memory issues"""
    global paddleocr_reader
    try:
        if not paddleocr_reader:
            print("PaddleOCR not initialized. Run initialize_ocr_models() first.")
            return ""
        
        # Load and resize image to prevent memory issues
        image = Image.open(image_path)
        
        # Resize if image is too large
        max_size = 3500  # Adjust this value as needed
        if max(image.size) > max_size:
            ratio = max_size / max(image.size)
            new_size = (int(image.size[0] * ratio), int(image.size[1] * ratio))
            image = image.resize(new_size, Image.Resampling.LANCZOS)
            
            # Convert to numpy array for PaddleOCR
            import numpy as np
            image_array = np.array(image)
            results = paddleocr_reader.ocr(image_array)
        else:
            results = paddleocr_reader.ocr(image_path)
        
        text_parts = []
        for line in results:
            if line:
                for word_info in line:
                    if len(word_info) > 1:
                        text_parts.append(word_info[1][0])
        
        return ' '.join(text_parts).strip()
    except Exception as e:
        print(f"PaddleOCR error on {image_path}: {e}")
        return ""

def kraken_ocr(image_path):
    """Run Kraken OCR on an image"""
    try:
        if not kraken:
            return "Kraken not available"
        return "Kraken requires model setup"
    except Exception as e:
        print(f"Kraken error on {image_path}: {e}")
        return ""

def google_vision_ocr(image_path):
    """Run Google Cloud Vision OCR on an image"""
    global google_client
    try:
        if not google_client:
            print("Google Cloud Vision not initialized. Run initialize_ocr_models() first.")
            return ""
        
        with open(image_path, 'rb') as image_file:
            content = image_file.read()
        
        image = vision.Image(content=content)
        response = google_client.text_detection(image=image)
        texts = response.text_annotations
        
        if texts:
            return texts[0].description.strip()
        return ""
    except Exception as e:
        print(f"Google Vision error on {image_path}: {e}")
        return ""



## Evaluation Metrics

In [15]:
# Bounding box evaluation functions
def calculate_iou(box1, box2):
    """Calculate Intersection over Union (IoU) between two bounding boxes"""
    # box format: [x1, y1, x2, y2]
    x1_inter = max(box1[0], box2[0])
    y1_inter = max(box1[1], box2[1])
    x2_inter = min(box1[2], box2[2])
    y2_inter = min(box1[3], box2[3])
    
    if x1_inter >= x2_inter or y1_inter >= y2_inter:
        return 0.0
    
    intersection = (x2_inter - x1_inter) * (y2_inter - y1_inter)
    
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    
    union = area1 + area2 - intersection
    
    return intersection / union if union > 0 else 0.0

def get_ocr_boxes_with_text(ocr_model_name, image_path):
    """Get bounding boxes and text from OCR models that support it"""
    boxes = []
    
    try:
        if ocr_model_name == 'easyocr':
            global easyocr_reader
            if easyocr_reader:
                results = easyocr_reader.readtext(image_path)
                for result in results:
                    # EasyOCR returns: [bbox_points, text, confidence]
                    bbox_points = result[0]
                    text = result[1]
                    # Convert bbox points to [x1, y1, x2, y2]
                    x_coords = [point[0] for point in bbox_points]
                    y_coords = [point[1] for point in bbox_points]
                    x1, x2 = min(x_coords), max(x_coords)
                    y1, y2 = min(y_coords), max(y_coords)
                    boxes.append({'text': text, 'box': [x1, y1, x2, y2]})
        
        elif ocr_model_name == 'paddleocr':
            global paddleocr_reader
            if paddleocr_reader:
                results = paddleocr_reader.ocr(image_path)
                for line in results:
                    if line:
                        for word_info in line:
                            if len(word_info) > 1:
                                # PaddleOCR returns: [bbox_points, (text, confidence)]
                                bbox_points = word_info[0]
                                text = word_info[1][0]
                                # Convert bbox points to [x1, y1, x2, y2]
                                x_coords = [point[0] for point in bbox_points]
                                y_coords = [point[1] for point in bbox_points]
                                x1, x2 = min(x_coords), max(x_coords)
                                y1, y2 = min(y_coords), max(y_coords)
                                boxes.append({'text': text, 'box': [x1, y1, x2, y2]})
        
        elif ocr_model_name == 'google_vision':
            global google_client
            if google_client:
                with open(image_path, 'rb') as image_file:
                    content = image_file.read()
                
                image = vision.Image(content=content)
                response = google_client.text_detection(image=image)
                texts = response.text_annotations
                
                # Skip the first annotation (full text) and process individual words
                for annotation in texts[1:]:
                    text = annotation.description
                    vertices = annotation.bounding_poly.vertices
                    
                    # Convert vertices to [x1, y1, x2, y2]
                    x_coords = [vertex.x for vertex in vertices]
                    y_coords = [vertex.y for vertex in vertices]
                    x1, x2 = min(x_coords), max(x_coords)
                    y1, y2 = min(y_coords), max(y_coords)
                    boxes.append({'text': text, 'box': [x1, y1, x2, y2]})
        
        # For other models (tesseract, kraken), we only have text
        # so we can't do spatial evaluation
        
    except Exception as e:
        print(f"Error getting boxes from {ocr_model_name}: {e}")
    
    return boxes

def evaluate_spatial_accuracy(predicted_boxes, ground_truth_boxes, iou_threshold=0.5):
    """Evaluate spatial accuracy using bounding box matching"""
    if not predicted_boxes or not ground_truth_boxes:
        return {
            'detection_precision': 0.0,
            'detection_recall': 0.0,
            'detection_f1': 0.0,
            'avg_iou': 0.0,
            'matched_pairs': 0,
            'total_predicted': len(predicted_boxes),
            'total_ground_truth': len(ground_truth_boxes)
        }
    
    # Find best matches between predicted and ground truth boxes
    matched_pairs = []
    used_gt_indices = set()
    
    for pred_idx, pred_box in enumerate(predicted_boxes):
        best_iou = 0.0
        best_gt_idx = -1
        
        for gt_idx, gt_box in enumerate(ground_truth_boxes):
            if gt_idx in used_gt_indices:
                continue
            
            iou = calculate_iou(pred_box['box'], gt_box['box'])
            if iou > best_iou and iou >= iou_threshold:
                best_iou = iou
                best_gt_idx = gt_idx
        
        if best_gt_idx != -1:
            matched_pairs.append({
                'pred_idx': pred_idx,
                'gt_idx': best_gt_idx,
                'iou': best_iou,
                'pred_text': pred_box['text'],
                'gt_text': ground_truth_boxes[best_gt_idx]['text']
            })
            used_gt_indices.add(best_gt_idx)
    
    # Calculate metrics
    num_matches = len(matched_pairs)
    detection_precision = num_matches / len(predicted_boxes) if predicted_boxes else 0.0
    detection_recall = num_matches / len(ground_truth_boxes) if ground_truth_boxes else 0.0
    detection_f1 = 2 * (detection_precision * detection_recall) / (detection_precision + detection_recall) if (detection_precision + detection_recall) > 0 else 0.0
    avg_iou = sum(pair['iou'] for pair in matched_pairs) / num_matches if num_matches > 0 else 0.0
    
    return {
        'detection_precision': detection_precision,
        'detection_recall': detection_recall,
        'detection_f1': detection_f1,
        'avg_iou': avg_iou,
        'matched_pairs': num_matches,
        'total_predicted': len(predicted_boxes),
        'total_ground_truth': len(ground_truth_boxes),
        'matches': matched_pairs
    }

def evaluate_ocr_with_spatial(predicted_text, ground_truth_text, predicted_boxes, ground_truth_boxes):
    """Comprehensive OCR evaluation including both text and spatial metrics"""
    # Text-based evaluation (existing)
    text_metrics = evaluate_ocr_result(predicted_text, ground_truth_text)
    
    # Spatial evaluation (new)
    spatial_metrics = evaluate_spatial_accuracy(predicted_boxes, ground_truth_boxes)
    
    # Combined metrics
    combined_metrics = {
        # Text metrics
        'text_similarity': text_metrics['similarity'],
        'text_precision': text_metrics['precision'],
        'text_recall': text_metrics['recall'],
        'text_f1': text_metrics['f1'],
        
        # Spatial metrics
        'detection_precision': spatial_metrics['detection_precision'],
        'detection_recall': spatial_metrics['detection_recall'],
        'detection_f1': spatial_metrics['detection_f1'],
        'avg_iou': spatial_metrics['avg_iou'],
        'matched_boxes': spatial_metrics['matched_pairs'],
        'total_predicted_boxes': spatial_metrics['total_predicted'],
        'total_ground_truth_boxes': spatial_metrics['total_ground_truth'],
        
        # Combined score (average of text F1 and detection F1)
        'combined_f1': (text_metrics['f1'] + spatial_metrics['detection_f1']) / 2,
        
        # Additional info
        'predicted_length': text_metrics.get('predicted_length', 0),
        'ground_truth_length': text_metrics.get('ground_truth_length', 0),
        'spatial_matches': spatial_metrics.get('matches', [])
    }
    
    return combined_metrics

In [16]:
from difflib import SequenceMatcher
import re

def clean_text(text):
    """Clean text for comparison"""
    text = re.sub(r'\s+', ' ', text.strip())
    return text.lower()

def calculate_similarity(text1,text2):
    """Calculate similarity between two texts using SequenceMatcher"""
    clean1 = clean_text(text1)
    clean2 = clean_text(text2)
    return SequenceMatcher(None, clean1, clean2).ratio()

def calculate_word_accuracy(predicted, ground_truth):
    """Calculate word-level precision, recall, and F1-score"""
    pred_words = set(clean_text(predicted).split())
    gt_words = set(clean_text(ground_truth).split())
    
    if not gt_words:
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
    
    if not pred_words:
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}
    
    intersection = pred_words.intersection(gt_words)
    
    precision = len(intersection) / len(pred_words) if pred_words else 0.0
    recall = len(intersection) / len(gt_words) if gt_words else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
    
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

def evaluate_ocr_result(predicted: str, ground_truth: str) -> Dict[str, float]:
    """Comprehensive evaluation of OCR result"""
    similarity = calculate_similarity(predicted, ground_truth)
    word_metrics = calculate_word_accuracy(predicted, ground_truth)
    
    return {
        'similarity': similarity,
        'precision': word_metrics['precision'],
        'recall': word_metrics['recall'],
        'f1': word_metrics['f1'],
        'predicted_length': len(predicted),
        'ground_truth_length': len(ground_truth)
    }

## Run OCR Evaluation on All Samples

In [17]:
import time

def run_single_ocr_evaluation(samples, ocr_model_name):
    results = []
    
    ocr_functions = {
        'tesseract': tesseract_ocr,
        'easyocr': easyocr_ocr,
        'paddleocr': paddleocr_ocr,
        'kraken': kraken_ocr,
        'google_vision': google_vision_ocr
    }
    
    ocr_function = ocr_functions[ocr_model_name]
    
    print(f"Running {ocr_model_name} on {len(samples)} imgs")
    
    for i, sample in enumerate(samples):
        print(f"Processing {i+1}/{len(samples)}: {sample['image_name']}")
        
        sample_result = {
            'sample_id': i,
            'dataset': sample['dataset'],
            'image_name': sample['image_name'],
            'ground_truth': sample['ground_truth'],
            'ocr_result': {}
        }
        
        start_time = time.time()
        
        predicted_text = ocr_function(sample['image_path'])
        processing_time = time.time() - start_time
        
        # Evaluate results
        metrics = evaluate_ocr_result(predicted_text, sample['ground_truth'])
        
        sample_result['ocr_result'] = {
            'model': ocr_model_name,
            'predicted_text': predicted_text,
            'processing_time': processing_time,
            'metrics': metrics
        }
        print(f"  Similarity: {metrics['similarity']:.3f}, P: {metrics['precision']:.3f}, R: {metrics['recall']:.3f}, F1: {metrics['f1']:.3f}, Time: {processing_time:.2f}s")
        

        
        results.append(sample_result)
    
    return results

def run_single_ocr_evaluation_with_spatial(samples, ocr_model_name):
    """Run OCR evaluation with both text and spatial metrics"""
    results = []
    
    ocr_functions = {
        'tesseract': tesseract_ocr,
        'easyocr': easyocr_ocr,
        'paddleocr': paddleocr_ocr,
        'kraken': kraken_ocr,
        'google_vision': google_vision_ocr
    }
    
    ocr_function = ocr_functions[ocr_model_name]
    
    # Check if model supports spatial evaluation
    spatial_supported = ocr_model_name in ['easyocr', 'paddleocr', 'google_vision']
    
    print(f"Running {ocr_model_name} on {len(samples)} samples...")
    if spatial_supported:
        print(f"Spatial evaluation supported")
    else:
        print(f"Text-only evaluation (no bounding boxes)")
    
    for i, sample in enumerate(samples):
        print(f"Processing {i+1}/{len(samples)}: {sample['image_name']}")
        
        sample_result = {
            'sample_id': i,
            'dataset': sample['dataset'],
            'image_name': sample['image_name'],
            'ground_truth': sample['ground_truth'],
            'ground_truth_boxes': sample.get('ground_truth_boxes', []),
            'ocr_result': {}
        }
        
        start_time = time.time()
        
        # Get OCR text
        predicted_text = ocr_function(sample['image_path'])
        processing_time = time.time() - start_time
        
        # Get OCR boxes if supported
        predicted_boxes = []
        if spatial_supported:
            predicted_boxes = get_ocr_boxes_with_text(ocr_model_name, sample['image_path'])
        
        # Evaluate with spatial metrics if available
        if spatial_supported and sample.get('ground_truth_boxes'):
            metrics = evaluate_ocr_with_spatial(
                predicted_text, 
                sample['ground_truth'],
                predicted_boxes,
                sample['ground_truth_boxes']
            )
            evaluation_type = 'spatial'
        else:
            # Fall back to text-only evaluation
            metrics = evaluate_ocr_result(predicted_text, sample['ground_truth'])
            evaluation_type = 'text_only'
        
        sample_result['ocr_result'] = {
            'model': ocr_model_name,
            'predicted_text': predicted_text,
            'predicted_boxes': predicted_boxes,
            'processing_time': processing_time,
            'metrics': metrics,
            'evaluation_type': evaluation_type
        }
        
        # Print results
        if evaluation_type == 'spatial':
            print(f"  Text F1: {metrics['text_f1']:.3f}, Detection F1: {metrics['detection_f1']:.3f}, Combined: {metrics['combined_f1']:.3f}, IoU: {metrics['avg_iou']:.3f}")
        else:
            print(f"  Similarity: {metrics['similarity']:.3f}, F1: {metrics['f1']:.3f}, Time: {processing_time:.2f}s")
        
        results.append(sample_result)
    
    return results

In [18]:
initialize_ocr_models()

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Initializing OCR models...
✓ EasyOCR initialized


which: no ccache in (/home/duyle/.pyenv/versions/3.10.16/bin:/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/home/duyle/.local/share/flatpak/exports/bin:/var/lib/flatpak/exports/bin:/usr/lib/jvm/default/bin:/usr/bin/site_perl:/usr/bin/vendor_perl:/usr/bin/core_perl:/var/lib/snapd/snap/bin:/home/duyle/.local/share/bin)
[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in /home/duyle/.paddlex/official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 82510.90it/s]
[32mCreating model: ('UVDoc', None)[0m
[33mThe model(UVDoc) is not supported to run in MKLDNN mode! Using `paddle` instead![0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in /home/duyle/.paddlex/official_models.[0m
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 15797.76it/s]
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32

✓ PaddleOCR initialized
✓ Google Cloud Vision initialized

Available models: ['tesseract', 'easyocr', 'paddleocr', 'kraken', 'google_vision']


['tesseract', 'easyocr', 'paddleocr', 'kraken', 'google_vision']

In [None]:
paddleocr_results = run_single_ocr_evaluation(all_samples, 'paddleocr')

Running paddleocr on 20 imgs
Processing 1/20: 14_e26ee_default.jpg
  Similarity: 0.000, P: 0.000, R: 0.000, F1: 0.000, Time: 267.55s
Processing 2/20: 4_938dc_default.jpg


KeyboardInterrupt: 

In [18]:
spatial_paddleocr_results = run_single_ocr_evaluation_with_spatial(all_samples, 'paddleocr')

Running paddleocr on 20 samples...
Spatial evaluation supported
Processing 1/20: 14_e26ee_default.jpg
PaddleOCR error on /home/duyle/Documents/Case-Study2/French_OCR_dataset/14_e26ee_default.jpg: PaddleOCR.predict() got an unexpected keyword argument 'cls'


[33mResized image size (4582x3131) exceeds max_side_limit of 4000. Resizing to fit within limit.[0m


KeyboardInterrupt: 

In [None]:
google_results = run_single_ocr_evaluation(all_samples, 'google_vision')
spatial_google_results = run_single_ocr_evaluation_with_spatial(all_samples, 'google_vision')

In [None]:
tesseract_results = run_single_ocr_evaluation(all_samples, 'tesseract')

## Test Spatial Evaluation with Bounding Boxes

In [None]:
# Test bounding box extraction on a sample
sample = all_samples[0]
print(f"Testing on: {sample['image_name']} ({sample['dataset']})")
print(f"Ground truth boxes: {len(sample.get('ground_truth_boxes', []))}")

# Show first few ground truth boxes
for i, box in enumerate(sample.get('ground_truth_boxes', [])[:3]):
    print(f"  Box {i+1}: {box['box']} -> '{box['text'][:50]}...'")

In [None]:
# Test with spatial evaluation (EasyOCR supports bounding boxes)
print("=== Testing EasyOCR with Spatial Evaluation ===")
easyocr_spatial_results = run_single_ocr_evaluation_with_spatial(all_samples[:3], 'easyocr')

In [None]:
# Compare text-only vs spatial evaluation
print("\n=== Comparison: Text-only vs Spatial Evaluation ===")
sample_result = easyocr_spatial_results[0]
metrics = sample_result['ocr_result']['metrics']

if sample_result['ocr_result']['evaluation_type'] == 'spatial':
    print(f"Sample: {sample_result['image_name']}")
    print(f"Text Metrics:")
    print(f"  - Text F1: {metrics['text_f1']:.3f}")
    print(f"  - Text Precision: {metrics['text_precision']:.3f}")
    print(f"  - Text Recall: {metrics['text_recall']:.3f}")
    print(f"Spatial Metrics:")
    print(f"  - Detection F1: {metrics['detection_f1']:.3f}")
    print(f"  - Detection Precision: {metrics['detection_precision']:.3f}")
    print(f"  - Detection Recall: {metrics['detection_recall']:.3f}")
    print(f"  - Average IoU: {metrics['avg_iou']:.3f}")
    print(f"  - Matched Boxes: {metrics['matched_boxes']}/{metrics['total_ground_truth_boxes']}")
    print(f"Combined F1 Score: {metrics['combined_f1']:.3f}")
else:
    print("Spatial evaluation not available for this sample")

In [None]:
# Test Google Vision with spatial evaluation
print("=== Testing Google Vision with Spatial Evaluation ===")
google_spatial_results = run_single_ocr_evaluation_with_spatial(all_samples[:2], 'google_vision')

## Results Analysis and Visualization

In [None]:
def create_results_dataframe(results: List[Dict]) -> pd.DataFrame:
    """Convert results to pandas DataFrame for analysis"""
    rows = []
    
    for result in results:
        for model_name, model_result in result['ocr_results'].items():
            row = {
                'sample_id': result['sample_id'],
                'dataset': result['dataset'],
                'image_name': result['image_name'],
                'model': model_name,
                'processing_time': model_result.get('processing_time', 0),
                'similarity': model_result['metrics']['similarity'],
                'precision': model_result['metrics']['precision'],
                'recall': model_result['metrics']['recall'],
                'f1': model_result['metrics']['f1'],
                'predicted_length': model_result['metrics'].get('predicted_length', 0),
                'ground_truth_length': model_result['metrics'].get('ground_truth_length', 0),
                'has_error': 'error' in model_result
            }
            rows.append(row)
    
    return pd.DataFrame(rows)

# Create DataFrame
df_results = create_results_dataframe(evaluation_results)
print(f"Results DataFrame shape: {df_results.shape}")
print(f"Available models: {df_results['model'].unique()}")
df_results.head()

In [None]:
# Summary statistics by model
summary_stats = df_results.groupby('model').agg({
    'similarity': ['mean', 'std', 'min', 'max'],
    'precision': ['mean', 'std'],
    'recall': ['mean', 'std'],
    'f1': ['mean', 'std'],
    'processing_time': ['mean', 'std']
}).round(4)

print("=== OCR Model Performance Summary ===")
summary_stats

In [None]:
# Performance by dataset (French vs English)
dataset_performance = df_results.groupby(['dataset', 'model']).agg({
    'similarity': 'mean',
    'f1': 'mean',
    'processing_time': 'mean'
}).round(4)

print("=== Performance by Dataset ===")
dataset_performance

In [None]:
# Create visualizations
plt.style.use('default')
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# 1. F1 Score comparison
sns.boxplot(data=df_results, x='model', y='f1', ax=axes[0,0])
axes[0,0].set_title('F1 Score by OCR Model')
axes[0,0].tick_params(axis='x', rotation=45)

# 2. Similarity comparison
sns.boxplot(data=df_results, x='model', y='similarity', ax=axes[0,1])
axes[0,1].set_title('Text Similarity by OCR Model')
axes[0,1].tick_params(axis='x', rotation=45)

# 3. Processing time comparison
sns.boxplot(data=df_results, x='model', y='processing_time', ax=axes[1,0])
axes[1,0].set_title('Processing Time by OCR Model')
axes[1,0].set_ylabel('Time (seconds)')
axes[1,0].tick_params(axis='x', rotation=45)

# 4. Performance by dataset
sns.barplot(data=df_results, x='model', y='f1', hue='dataset', ax=axes[1,1])
axes[1,1].set_title('F1 Score by Model and Dataset')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Model Ranking and Recommendations

In [None]:
# Calculate overall ranking
model_ranking = df_results.groupby('model').agg({
    'f1': 'mean',
    'similarity': 'mean',
    'precision': 'mean',
    'recall': 'mean',
    'processing_time': 'mean'
}).round(4)

# Add pricing information
pricing_info = {
    'tesseract': 'Free',
    'easyocr': 'Free',
    'paddleocr': 'Free',
    'kraken': 'Free',
    'google_vision': '$1.50 per 1,000 images'
}

model_ranking['pricing'] = model_ranking.index.map(pricing_info)

# Sort by F1 score
model_ranking_sorted = model_ranking.sort_values('f1', ascending=False)

print("=== OCR Model Ranking (by F1 Score) ===")
model_ranking_sorted

## Export Results to JSON

In [None]:
# Create comprehensive results report
final_report = {
    'evaluation_summary': {
        'total_samples': len(all_samples),
        'french_samples': len(french_samples),
        'english_samples': len(english_samples),
        'models_tested': list(df_results['model'].unique()),
        'evaluation_date': pd.Timestamp.now().isoformat()
    },
    'model_performance': model_ranking_sorted.to_dict('index'),
    'dataset_performance': dataset_performance.to_dict(),
    'pricing_information': pricing_info,
    'detailed_results': evaluation_results
}

# Save to JSON file
output_file = 'ocr_evaluation_results.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(final_report, f, indent=2, ensure_ascii=False)

print(f"Results saved to {output_file}")
print(f"File size: {os.path.getsize(output_file) / 1024:.1f} KB")