In [37]:
import os
import re
import time

import easyocr
import numpy as np
import pandas as pd
import pytesseract
import torch
from datasets import load_dataset
from jiwer import wer, cer
from PIL import Image
from tqdm import tqdm

In [38]:
# Allocator setting for PyTorch's CUDA caching allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Lift every pandas display limit so frames are shown without truncation.
for display_option in ('display.max_columns', 'display.max_rows', 'display.max_colwidth', 'display.width'):
    pd.set_option(display_option, None)

In [39]:
# Hugging Face Hub dataset identifiers to benchmark, in evaluation order.
datasets_to_benchmark = [
    "ahmedheakl/arocrbench_synthesizear",
    "ahmedheakl/arocrbench_patsocr",
    "ahmedheakl/arocrbench_arabicocr",
    "ahmedheakl/arocrbench_hindawi",
]

In [40]:
def remove_diacritics(text):
    """Return ``text`` with Arabic diacritical marks stripped.

    The removed code points are U+0617-U+061A, U+064B-U+065F and U+0670;
    every other character is left untouched.
    """
    marks = (
        '\u0617', '\u0618', '\u0619', '\u061A',
        '\u064B', '\u064C', '\u064D', '\u064E',
        '\u064F', '\u0650', '\u0651', '\u0652',
        '\u0653', '\u0654', '\u0655', '\u0656',
        '\u0657', '\u0658', '\u0659', '\u065A',
        '\u065B', '\u065C', '\u065D', '\u065E',
        '\u065F', '\u0670',
    )
    # One regex character class containing every mark listed above.
    char_class = f"[{''.join(marks)}]"
    return re.compile(char_class).sub('', text)

def remove_english_letters(text):
    """Drop every ASCII Latin letter (a-z, A-Z) from ``text``; all other characters are kept."""
    return re.compile(r'[a-zA-Z]').sub('', text)

def clean_text(text):
    """Collapse whitespace runs in ``text`` into single spaces and trim both ends."""
    collapsed = text
    # Newlines/tabs first, then any remaining whitespace run, each replaced by one space.
    for whitespace_pattern in (r'[\n\t]+', r'\s+'):
        collapsed = re.sub(whitespace_pattern, ' ', collapsed)
    return collapsed.strip()



In [41]:
# Arabic-only EasyOCR reader, created once at module level and reused by benchmark_dataset().
easy_ocr = easyocr.Reader(['ar'])

def benchmark_dataset(dataset_name, split="train"):
    """Run EasyOCR and Tesseract over every labelled sample of one dataset.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Dataset split to load (defaults to ``"train"``).

    Returns:
        A list with one dict per sample whose ground truth is non-empty. Each dict
        has the keys ``dataset``, ``image`` (converted to RGB), ``ground_truth``,
        ``easyocr`` and ``tesseract`` (each ``{"text": str, "time": seconds}``) and
        ``status`` (``"success"`` or ``"error"``).

    Raises:
        ValueError: If the dataset has neither a ``text`` nor an ``answer`` column.
    """
    print(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name, split=split)

    # Read the schema rather than dataset[0]: nothing has to be decoded and an
    # empty split no longer fails with an IndexError.
    columns = dataset.column_names
    ground_truth_key = next((key for key in ("text", "answer") if key in columns), None)
    if ground_truth_key is None:
        raise ValueError(f"No suitable ground truth key ('text' or 'answer') found in dataset: {dataset_name}")

    # Prepare samples
    results = []
    for sample in tqdm(dataset, desc=f"Preparing samples from {dataset_name}"):
        ground_truth = sample[ground_truth_key]
        # Skip samples with empty ground truth; str() keeps a non-string label
        # from crashing on .strip().
        if not ground_truth or not str(ground_truth).strip():
            continue
        image = sample["image"]
        if image.mode != "RGB":
            image = image.convert("RGB")
        results.append({
            "dataset": dataset_name,
            "image": image,
            "ground_truth": ground_truth,
            "easyocr": {"text": None, "time": 0},
            "tesseract": {"text": None, "time": 0},
            "status": "pending"
        })

    skipped_samples = 0
    for i, record in tqdm(enumerate(results), total=len(results), desc=f"Benchmarking OCR on {dataset_name}"):
        torch.cuda.empty_cache()
        image = record["image"]

        try:
            # Convert PIL Image to NumPy array for EasyOCR
            image_np = np.array(image)

            # EasyOCR. perf_counter() is monotonic and high-resolution, so the
            # measured interval is not disturbed by system clock adjustments.
            start_time = time.perf_counter()
            easy_result = easy_ocr.readtext(image_np)
            easy_time = time.perf_counter() - start_time
            # Element 1 of each detection is the recognized text; a single space
            # stands in when nothing was detected.
            easy_text = " ".join(res[1] for res in easy_result) if easy_result else " "
            record["easyocr"] = {"text": easy_text, "time": easy_time}

            # TesseractOCR
            start_time = time.perf_counter()
            tess_result = pytesseract.image_to_string(image, lang='ara')
            tess_time = time.perf_counter() - start_time
            tess_text = tess_result if tess_result.strip() else " "
            record["tesseract"] = {"text": tess_text, "time": tess_time}

            record["status"] = "success"

        except Exception as e:
            # Deliberately best-effort: one failing sample is reported and flagged
            # instead of aborting the whole benchmark run.
            print(f"Error processing sample {i} in {dataset_name}: {e}")
            record["status"] = "error"
            skipped_samples += 1

    print(f"Processed {len(results) - skipped_samples} samples successfully, skipped {skipped_samples} in {dataset_name}.")
    return results

In [42]:
# Benchmark each configured dataset in turn and pool the per-sample records.
all_results = []
for dataset_name in datasets_to_benchmark:
    all_results += benchmark_dataset(dataset_name)

Loading dataset: ahmedheakl/arocrbench_synthesizear


Preparing samples from ahmedheakl/arocrbench_synthesizear: 100%|██████████| 500/500 [00:00<00:00, 1130.54it/s]
Benchmarking OCR on ahmedheakl/arocrbench_synthesizear: 100%|██████████| 500/500 [01:26<00:00,  5.81it/s]


Processed 500 samples successfully, skipped 0 in ahmedheakl/arocrbench_synthesizear.
Loading dataset: ahmedheakl/arocrbench_patsocr


Preparing samples from ahmedheakl/arocrbench_patsocr: 100%|██████████| 500/500 [00:00<00:00, 687.22it/s]
Benchmarking OCR on ahmedheakl/arocrbench_patsocr: 100%|██████████| 500/500 [01:53<00:00,  4.39it/s]


Processed 500 samples successfully, skipped 0 in ahmedheakl/arocrbench_patsocr.
Loading dataset: ahmedheakl/arocrbench_arabicocr


Preparing samples from ahmedheakl/arocrbench_arabicocr: 100%|██████████| 50/50 [00:00<00:00, 1124.53it/s]
Benchmarking OCR on ahmedheakl/arocrbench_arabicocr: 100%|██████████| 50/50 [00:33<00:00,  1.49it/s]


Processed 50 samples successfully, skipped 0 in ahmedheakl/arocrbench_arabicocr.
Loading dataset: ahmedheakl/arocrbench_hindawi


Preparing samples from ahmedheakl/arocrbench_hindawi: 100%|██████████| 200/200 [00:01<00:00, 156.47it/s]
Benchmarking OCR on ahmedheakl/arocrbench_hindawi: 100%|██████████| 200/200 [03:59<00:00,  1.20s/it]

Processed 200 samples successfully, skipped 0 in ahmedheakl/arocrbench_hindawi.





In [47]:
# One row per benchmarked sample; columns come from the record dict keys.
df = pd.DataFrame(data=all_results)

In [48]:
# Show every sample whose OCR run did not finish successfully.
df.loc[df['status'].ne('success')]

Unnamed: 0,dataset,image,ground_truth,easyocr,tesseract,status


I think these weren't included in the benchmark:
- "ahmedheakl/arocrbench_isippt",
- "ahmedheakl/arocrbench_arabicocr",
- "ahmedheakl/arocrbench_hindawi",
- "ahmedheakl/arocrbench_evarest"

In [49]:

# Snapshot of the raw benchmark records, written before any text normalization.
df.to_csv(path_or_buf="all_nonllms.csv")

In [50]:
# Flatten each engine's {"text", "time"} record into plain columns.
for ocr_method in ["easyocr", "tesseract"]:
    # `or ""` keeps a failed sample (text is None) from being scored as the
    # literal string "None".
    df[f"{ocr_method}_text"] = df[ocr_method].apply(lambda x: clean_text(str(x["text"] or "")))
    df[f"{ocr_method}_time"] = df[ocr_method].apply(lambda x: x["time"])
    df[f"{ocr_method}_text_no_diacritics"] = df[f"{ocr_method}_text"].apply(remove_diacritics)

# fillna() has to run before astype(str): once cast, a missing value is already a
# non-null string and fillna() would have nothing left to replace.
df["ground_truth"] = df["ground_truth"].fillna("").astype(str).apply(remove_english_letters).apply(clean_text)
df["ground_truth_no_diacritics"] = df["ground_truth"].apply(remove_diacritics)

# Calculate metrics
for dataset_name in datasets_to_benchmark:
    subset_df = df[df["dataset"] == dataset_name]
    print(f"\nResults for {dataset_name}:")

    # Nothing to score (e.g. every sample was skipped for an empty ground truth).
    if subset_df.empty:
        print("  no samples to score")
        continue

    # References are shared by both engines, so build the lists once.
    references = subset_df["ground_truth"].tolist()
    references_no_diacritics = subset_df["ground_truth_no_diacritics"].tolist()

    for ocr_method in ["easyocr", "tesseract"]:
        hypotheses = subset_df[f"{ocr_method}_text"].tolist()
        hypotheses_no_diacritics = subset_df[f"{ocr_method}_text_no_diacritics"].tolist()

        # jiwer expects the reference first and the hypothesis second.
        wer_score = wer(references, hypotheses)
        cer_score = cer(references, hypotheses)
        wer_score_t = wer(references_no_diacritics, hypotheses_no_diacritics)
        cer_score_t = cer(references_no_diacritics, hypotheses_no_diacritics)

        print(f"{ocr_method}:")
        print(f"  WER: {wer_score:.2f}, CER: {cer_score:.2f}")
        print(f"  WER (no diacritics): {wer_score_t:.2f}, CER (no diacritics): {cer_score_t:.2f}")



Results for ahmedheakl/arocrbench_synthesizear:
easyocr:
  WER: 1.06, CER: 0.59
  WER (no diacritics): 0.76, CER (no diacritics): 0.46
tesseract:
  WER: 1.06, CER: 0.42
  WER (no diacritics): 0.76, CER (no diacritics): 0.33

Results for ahmedheakl/arocrbench_patsocr:
easyocr:
  WER: 0.74, CER: 0.55
  WER (no diacritics): 0.74, CER (no diacritics): 0.55
tesseract:
  WER: 0.31, CER: 0.14
  WER (no diacritics): 0.31, CER (no diacritics): 0.14

Results for ahmedheakl/arocrbench_arabicocr:
easyocr:
  WER: 0.76, CER: 0.56
  WER (no diacritics): 0.76, CER (no diacritics): 0.56
tesseract:
  WER: 0.03, CER: 0.01
  WER (no diacritics): 0.03, CER (no diacritics): 0.01

Results for ahmedheakl/arocrbench_hindawi:
easyocr:
  WER: 0.77, CER: 0.43
  WER (no diacritics): 0.74, CER (no diacritics): 0.41
tesseract:
  WER: 0.52, CER: 0.33
  WER (no diacritics): 0.50, CER (no diacritics): 0.31


In [51]:

# Export the frame including the normalized text columns. The path is kept in one
# variable so the confirmation message always names the file actually written.
cleaned_results_path = "all_nonllms_cleaned.csv"
df.to_csv(cleaned_results_path, index=False)
print(f"Results saved to '{cleaned_results_path}'")

Results saved to 'arabic_ocr_benchmark_results.csv'


In [52]:
# Preview the first five rows, including the derived text and timing columns.
df.head(n=5)

Unnamed: 0,dataset,image,ground_truth,easyocr,tesseract,status,easyocr_text,easyocr_time,easyocr_text_no_diacritics,tesseract_text,tesseract_time,tesseract_text_no_diacritics,ground_truth_no_diacritics
0,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=639x114 at 0x7557E425C790>,"وَإِذا ما سَأَلَتْنِي عَن مَعْنَى لَفَظَهُ ""عرب"" عِنْدَ","{'text': 'لفظهً "" عرب "" عندً عن واذا ما سألتني معنى', 'time': 0.12821722030639648}","{'text': 'وَإِذا ما سَألْْيِي عَن مَعْنَى لَفَظَهُ ""عرب"" عِنْدَ ', 'time': 0.054384469985961914}",success,"لفظهً "" عرب "" عندً عن واذا ما سألتني معنى",0.128217,"لفظه "" عرب "" عند عن واذا ما سألتني معنى","وَإِذا ما سَألْْيِي عَن مَعْنَى لَفَظَهُ ""عرب"" عِنْدَ",0.054384,"وإذا ما سأليي عن معنى لفظه ""عرب"" عند","وإذا ما سألتني عن معنى لفظه ""عرب"" عند"
1,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1182x147 at 0x755820245950>,أَمّا فَهُم النُصُوصِ وَاِسْتِنْباط مَعانِيها بِوَجْهٍ صَحِيحٌ دقيق،,"{'text': '_ النضوص واستنباط قعانيها بؤجه ضجيح دقيق . أما فهم', 'time': 0.10543584823608398}","{'text': 'أما فَهُم النْصْوصٍ واشتئباط قعانيها بوكو صَحيحٌ دقيق, ', 'time': 0.07516288757324219}",success,_ النضوص واستنباط قعانيها بؤجه ضجيح دقيق . أما فهم,0.105436,_ النضوص واستنباط قعانيها بؤجه ضجيح دقيق . أما فهم,"أما فَهُم النْصْوصٍ واشتئباط قعانيها بوكو صَحيحٌ دقيق,",0.075163,"أما فهم النصوص واشتئباط قعانيها بوكو صحيح دقيق,",أما فهم النصوص واستنباط معانيها بوجه صحيح دقيق،
2,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=994x163 at 0x7556DCDBDD50>,تُثِير فِيها الاسود، وَهُوَ ما يَتَعارَض مَعَ اِكْتِشافِ,"{'text': 'تثير فيها الاسود وهو ما يتعارض مع اكتشاف', 'time': 0.05660748481750488}","{'text': 'تَثْير فيها الاسود. وَهَوٍ ما تَتَعارَض مق اكْتشافيٍ ', 'time': 0.07706618309020996}",success,تثير فيها الاسود وهو ما يتعارض مع اكتشاف,0.056607,تثير فيها الاسود وهو ما يتعارض مع اكتشاف,تَثْير فيها الاسود. وَهَوٍ ما تَتَعارَض مق اكْتشافيٍ,0.077066,تثير فيها الاسود. وهو ما تتعارض مق اكتشافي,تثير فيها الاسود، وهو ما يتعارض مع اكتشاف
3,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1315x224 at 0x7557F35EC850>,الجماعي، وَكادَت تَصِل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّة لِسَيْطَرَةِ أَكْبَرَ,"{'text': 'وكادت تصل إلى المزحلة الاشتاكيًة لسيط ة أفبر الجماعي ,', 'time': 0.104278564453125}","{'text': 'لَى المَرْحَلَةِ الإشترايّة لسَيْطرة أَخُبرَ الجماعي. وَكَادَت تصل إِلَى المَْحَلَةِ الاشترايّة لِسَيْط ', 'time': 0.09968733787536621}",success,"وكادت تصل إلى المزحلة الاشتاكيًة لسيط ة أفبر الجماعي ,",0.104279,"وكادت تصل إلى المزحلة الاشتاكية لسيط ة أفبر الجماعي ,",لَى المَرْحَلَةِ الإشترايّة لسَيْطرة أَخُبرَ الجماعي. وَكَادَت تصل إِلَى المَْحَلَةِ الاشترايّة لِسَيْط,0.099687,لى المرحلة الإشتراية لسيطرة أخبر الجماعي. وكادت تصل إلى المحلة الاشتراية لسيط,الجماعي، وكادت تصل إلى المرحلة الاشتراكية لسيطرة أكبر
4,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1224x129 at 0x7557B4671D10>,مَعَهُ مَنْدِيلا فِيهِ جردقتان وَقَطَعَ لَحْم سَكْباج مُبَرَّد,"{'text': 'معهً منديلا فيه جردقتان وقطع سحباج آخم مبرد', 'time': 0.11063003540039062}","{'text': 'مَعهَ منديلا فيه جردقتان وَفَطعَ لخم سه باج مبرد ', 'time': 0.07319283485412598}",success,معهً منديلا فيه جردقتان وقطع سحباج آخم مبرد,0.11063,معه منديلا فيه جردقتان وقطع سحباج آخم مبرد,مَعهَ منديلا فيه جردقتان وَفَطعَ لخم سه باج مبرد,0.073193,معه منديلا فيه جردقتان وفطع لخم سه باج مبرد,معه منديلا فيه جردقتان وقطع لحم سكباج مبرد


: 

In [52]:
# Final export. Path and message share one variable so they cannot drift apart.
final_results_path = "nonllms_ocr_benchmark_results.csv"
df.to_csv(final_results_path, index=False)
print(f"Results saved to '{final_results_path}'")

Results saved to 'qari_ocr_benchmark_results.csv'
