In [10]:
import os
import pandas as pd
from datasets import load_dataset
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from jiwer import wer, cer
from tqdm import tqdm
import re

In [11]:
# Configure PyTorch's CUDA allocator through its environment variable.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Remove every pandas display limit so wide frames and long strings are shown in full.
for _display_option in (
    "display.max_columns",
    "display.max_rows",
    "display.max_colwidth",
    "display.width",
):
    pd.set_option(_display_option, None)

In [12]:
# Dataset ids to evaluate, in the order they are run.
datasets_to_benchmark = [
    "ahmedheakl/arocrbench_synthesizear",
    "ahmedheakl/arocrbench_patsocr",
    "ahmedheakl/arocrbench_arabicocr",
    "ahmedheakl/arocrbench_hindawi",
]

In [13]:
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info
# Checkpoint id shared by the model weights and the processor.
MODEL_ID = "MBZUAI/AIN"

# dtype and device placement are both left to transformers ("auto").
model = Qwen2VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)


config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/56.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/7 [00:00<?, ?it/s]

model-00001-of-00007.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00007.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00007.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00005-of-00007.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00007.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00007-of-00007.safetensors:   0%|          | 0.00/3.38G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/4.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/392 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

In [14]:
# Text part of the user turn: the OCR instruction sent with every image.
ocr_instruction = {
    "type": "text",
    "text": "Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate",
}

# Single user turn: an image placeholder followed by the instruction.
messages = [
    {"role": "user", "content": [{"type": "image"}, ocr_instruction]},
]

# Rendered once here; the same prompt string is reused for every sample.
text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

In [15]:
def qwen_extract_ocr(text_prompt, image, max_new_tokens=1024):
    """Run the vision-language model on one image and return the decoded text.

    Args:
        text_prompt: Prompt string, already rendered through the chat template.
        image: Image to transcribe; it is passed through ``resize_image``
            (factor 28, max 1024x1024) before being handed to the processor.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 1024, the value that was previously hard-coded.

    Returns:
        The decoded generation(s) joined with a single space.
    """
    image = resize_image(image, min_factor=28, max_size=(1024, 1024))
    inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
    # Follow the device the model was actually placed on (it is loaded with
    # device_map="auto") instead of assuming a bare "cuda" target.
    inputs = inputs.to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # Each output row starts with the prompt tokens; slice them off so only
    # the newly generated continuation is decoded.
    generated_ids = [
        full_sequence[len(prompt_ids):]
        for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    return " ".join(output_text)

In [16]:
def remove_diacritics(text):
    """Return ``text`` with Arabic diacritical marks deleted.

    The characters removed are U+0617-U+061A, U+064B-U+065F and U+0670;
    every other character is left untouched.
    """
    marks = (
        '\u0617' '\u0618' '\u0619' '\u061A'
        '\u064B' '\u064C' '\u064D' '\u064E' '\u064F' '\u0650'
        '\u0651' '\u0652' '\u0653' '\u0654' '\u0655' '\u0656'
        '\u0657' '\u0658' '\u0659' '\u065A' '\u065B' '\u065C'
        '\u065D' '\u065E' '\u065F' '\u0670'
    )
    # A single character class matching any one of the marks above.
    return re.sub('[' + marks + ']', '', text)

def remove_english_letters(text):
    """Delete every ASCII letter (a-z, A-Z) from ``text``; all other characters are kept."""
    return re.sub(r'[a-zA-Z]', '', text)

def clean_text(text):
    """Collapse whitespace in ``text`` to single spaces and trim both ends.

    Newline/tab runs are replaced first, then any remaining whitespace run,
    so the result contains no consecutive whitespace characters.
    """
    collapsed = text
    for whitespace_pattern in (r'[\n\t]+', r'\s+'):
        collapsed = re.sub(whitespace_pattern, ' ', collapsed)
    return collapsed.strip()



In [17]:
import math 
def _round_up_to_factor(value, factor):
    """Round ``value`` up to a multiple of ``factor``, never returning less than ``factor``."""
    return max(factor, math.ceil(value / factor) * factor)


def resize_image(image, min_factor=28, max_size=(1024, 1024)):
    """Resize ``image`` so both sides are multiples of ``min_factor`` and fit ``max_size``.

    The target size is chosen to stay close to the original aspect ratio.
    Each side ends up as a multiple of ``min_factor`` and is never smaller
    than ``min_factor``, so a very elongated image can no longer produce a
    zero-sized target.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` and a
            ``resize((w, h), resample)`` method (a PIL image in this notebook).
        min_factor: Value both output dimensions must be divisible by.
        max_size: ``(max_width, max_height)`` bound for the output.

    Returns:
        The original ``image`` object when no resize is needed, otherwise the
        result of ``image.resize`` with LANCZOS resampling.
    """
    width, height = image.size
    factor = min_factor
    aspect_ratio = width / height

    # Start from the original size, rounded up to multiples of factor.
    new_width = _round_up_to_factor(max(factor, width), factor)
    new_height = _round_up_to_factor(max(factor, height), factor)

    # Rounding distorts the ratio; re-derive one side from the other.
    if new_width / new_height > aspect_ratio:
        new_width = _round_up_to_factor(int(new_height * aspect_ratio), factor)
    else:
        new_height = _round_up_to_factor(int(new_width / aspect_ratio), factor)

    # Clamp to the largest multiple of factor that fits max_size, width first
    # and then height, recomputing the other side each time.
    if new_width > max_size[0]:
        new_width = max_size[0] - (max_size[0] % factor)
        new_height = _round_up_to_factor(int(new_width / aspect_ratio), factor)
    if new_height > max_size[1]:
        new_height = max_size[1] - (max_size[1] % factor)
        new_width = _round_up_to_factor(int(new_height * aspect_ratio), factor)

    if (new_width, new_height) != (width, height):
        image = image.resize((new_width, new_height), Image.LANCZOS)
    return image

In [18]:
def benchmark_dataset(dataset_name, split="train"):
    """Run OCR over every sample of a dataset and collect the predictions.

    Args:
        dataset_name: Dataset id passed to ``load_dataset``.
        split: Split to load (default ``"train"``).

    Returns:
        A list with one dict per sample, holding the keys ``dataset``,
        ``image`` (converted to RGB), ``ground_truth``, ``ain`` (the model
        output, or a "Skipped"/"Error" marker string) and ``status``
        (``"success"``, ``"skipped"``, ``"error"``; ``"pending"`` if the
        sample was never reached).

    Raises:
        ValueError: If the dataset has neither a ``text`` nor an ``answer``
            column to use as ground truth.
    """
    print(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name, split=split)

    # Read the schema from the dataset itself rather than from dataset[0],
    # so an empty split does not fail with an IndexError.
    columns = dataset.column_names
    ground_truth_key = next((key for key in ("text", "answer") if key in columns), None)
    if ground_truth_key is None:
        raise ValueError(f"No suitable ground truth key ('text' or 'answer') found in dataset: {dataset_name}")

    # Prepare samples
    results = []
    for sample in tqdm(dataset, desc=f"Preparing samples from {dataset_name}"):
        image = sample["image"]
        if image.mode != "RGB":
            image = image.convert("RGB")
        results.append({
            "dataset": dataset_name,
            "image": image,
            "ground_truth": sample[ground_truth_key],
            "ain": None,
            "status": "pending",  # Updated below with the processing outcome
        })

    skipped_samples = 0
    failed_samples = 0
    for i, record in enumerate(tqdm(results, desc=f"Running ain on {dataset_name}")):
        try:
            record["ain"] = qwen_extract_ocr(text_prompt, record["image"])
            record["status"] = "success"
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print(f"Skipping sample {i} in {dataset_name} due to CUDA out of memory error.")
                record["ain"] = "Skipped - CUDA OOM"
                record["status"] = "skipped"
                skipped_samples += 1
            else:
                print(f"Error processing sample {i} in {dataset_name}: {str(e)}")
                record["ain"] = f"Error - {str(e)}"
                record["status"] = "error"
                failed_samples += 1

    # Samples that raised a non-OOM error must not be reported as successes.
    succeeded_samples = len(results) - skipped_samples - failed_samples
    print(f"Processed {succeeded_samples} samples successfully, skipped {skipped_samples} due to memory issues in {dataset_name}.")
    if failed_samples:
        print(f"{failed_samples} samples failed with other errors in {dataset_name}.")
    return results

In [19]:
# Run each benchmark in turn and pool the per-sample records into one flat list.
all_results = []
for dataset_name in datasets_to_benchmark:
    all_results += benchmark_dataset(dataset_name)

Loading dataset: ahmedheakl/arocrbench_synthesizear


Preparing samples from ahmedheakl/arocrbench_synthesizear: 100%|██████████| 500/500 [00:00<00:00, 1112.94it/s]
Running ain on ahmedheakl/arocrbench_synthesizear: 100%|██████████| 500/500 [18:48<00:00,  2.26s/it]


Processed 500 samples successfully, skipped 0 due to memory issues in ahmedheakl/arocrbench_synthesizear.
Loading dataset: ahmedheakl/arocrbench_patsocr


Preparing samples from ahmedheakl/arocrbench_patsocr: 100%|██████████| 500/500 [00:00<00:00, 683.37it/s]
Running ain on ahmedheakl/arocrbench_patsocr: 100%|██████████| 500/500 [13:23<00:00,  1.61s/it]


Processed 500 samples successfully, skipped 0 due to memory issues in ahmedheakl/arocrbench_patsocr.
Loading dataset: ahmedheakl/arocrbench_arabicocr


Preparing samples from ahmedheakl/arocrbench_arabicocr: 100%|██████████| 50/50 [00:00<00:00, 741.43it/s]
Running ain on ahmedheakl/arocrbench_arabicocr: 100%|██████████| 50/50 [04:23<00:00,  5.28s/it]


Processed 50 samples successfully, skipped 0 due to memory issues in ahmedheakl/arocrbench_arabicocr.
Loading dataset: ahmedheakl/arocrbench_hindawi


Preparing samples from ahmedheakl/arocrbench_hindawi: 100%|██████████| 200/200 [00:01<00:00, 174.91it/s]
Running ain on ahmedheakl/arocrbench_hindawi: 100%|██████████| 200/200 [1:03:44<00:00, 19.12s/it]

Processed 200 samples successfully, skipped 0 due to memory issues in ahmedheakl/arocrbench_hindawi.





In [20]:
# One row per benchmarked sample; the columns are the keys of the record dicts
# (dataset, image, ground_truth, ain, status).
df = pd.DataFrame(all_results)

In [21]:
# Show every sample whose OCR run did not finish with status "success".
df.loc[df["status"] != "success"]

Unnamed: 0,dataset,image,ground_truth,ain,status


In [22]:

# Checkpoint the raw (uncleaned) predictions; a later cell reloads this file.
# NOTE(review): the "image" column holds image objects, so it is presumably
# written as their text repr rather than as usable image data - confirm.
df.to_csv("uncleand_ain.csv")

In [23]:
# Preview the first rows to sanity-check ground truth against the model output.
df.head()

Unnamed: 0,dataset,image,ground_truth,ain,status
0,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=639x114 at 0x7EA68961B490>,"وَإِذا ما سَأَلَتْنِي عَن مَعْنَى لَفَظَهُ ""عرب"" عِنْدَ","وَإِذا ما سَأَلَتْنِي عَن مَعْنَى لَفَظَهُ ""عرب"" عِنْدَ",success
1,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1182x147 at 0x7EA68961B9D0>,أَمّا فَهُم النُصُوصِ وَاِسْتِنْباط مَعانِيها بِوَجْهٍ صَحِيحٌ دقيق،,أَمّا فَهُم النُصُوصِ وَاِسْتِنْباط مَعانِيها بِوَجْهٍ صَحِيحٌ دقيق،,success
2,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=994x163 at 0x7EA68961BED0>,تُثِير فِيها الاسود، وَهُوَ ما يَتَعارَض مَعَ اِكْتِشافِ,تُثِير فِيها الاسود، وَهُوَ ما يَتَعارَض مَعَ اِكْتِشافِ,success
3,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1315x224 at 0x7EA6301D6690>,الجماعي، وَكادَت تَصِل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّة لِسَيْطَرَةِ أَكْبَرَ,الجماعي، وَكادَت تَصِل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّة لِسَيْطَرَةِ أَكْبَرَ,success
4,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1224x129 at 0x7EA689678210>,مَعَهُ مَنْدِيلا فِيهِ جردقتان وَقَطَعَ لَحْم سَكْباج مُبَرَّد,مَعَهُ مَنْدِيلا فِيهِ جردقتان وَقَطَعَ لَحْم سَكْباج مُبَرَّد,success


In [24]:
import pandas as pd
# Reload the checkpointed predictions; the unnamed first CSV column is used
# as the DataFrame index.
df = pd.read_csv("uncleand_ain.csv",index_col="Unnamed: 0")

In [25]:
# Normalise the reference and prediction columns the same way, then derive a
# diacritic-free copy of each ("<column>_t") for the relaxed metrics.
for _column in ("ground_truth", "ain"):
    # fillna() must run BEFORE astype(str): casting first turns NaN (an empty
    # CSV field) into the literal string "nan", which fillna cannot catch.
    df[_column] = (
        df[_column]
        .fillna("")
        .astype(str)
        .apply(remove_english_letters)
        .apply(clean_text)
    )
    df[f"{_column}_t"] = df[_column].apply(remove_diacritics)

In [26]:
# List the distinct dataset names present in the results.
df['dataset'].unique()

array(['ahmedheakl/arocrbench_synthesizear',
       'ahmedheakl/arocrbench_patsocr', 'ahmedheakl/arocrbench_arabicocr',
       'ahmedheakl/arocrbench_hindawi'], dtype=object)

In [27]:
# (label, reference column, prediction column) for each reported metric row.
metric_variants = (
    ("ain", "ground_truth", "ain"),
    ("ain (no diacritics)", "ground_truth_t", "ain_t"),
)

for dataset_name in datasets_to_benchmark:
    dataset_rows = df.loc[df["dataset"] == dataset_name]
    print(f"\nResults for {dataset_name}:")

    for label, reference_column, prediction_column in metric_variants:
        references = dataset_rows[reference_column].tolist()
        predictions = dataset_rows[prediction_column].tolist()
        # Word error rate first, then character error rate, over the subset.
        word_error = wer(references, predictions)
        char_error = cer(references, predictions)
        print(f"{label} - WER: {word_error:.2f}, CER: {char_error:.2f}")


Results for ahmedheakl/arocrbench_synthesizear:
ain - WER: 0.17, CER: 0.04
ain (no diacritics) - WER: 0.02, CER: 0.00

Results for ahmedheakl/arocrbench_patsocr:
ain - WER: 0.01, CER: 0.00
ain (no diacritics) - WER: 0.01, CER: 0.00

Results for ahmedheakl/arocrbench_arabicocr:
ain - WER: 0.01, CER: 0.00
ain (no diacritics) - WER: 0.01, CER: 0.00

Results for ahmedheakl/arocrbench_hindawi:
ain - WER: 0.14, CER: 0.07
ain (no diacritics) - WER: 0.09, CER: 0.05


In [28]:
# Persist the cleaned results (without the index column) and confirm the path.
results_path = "ain_ocr_benchmark_results.csv"
df.to_csv(results_path, index=False)
print(f"Results saved to '{results_path}'")

Results saved to 'ain_ocr_benchmark_results.csv'


In [32]:
# Print the first rows of the columns that matter for a quick visual check.
sample_columns = ["dataset", "ground_truth", "ain"]
print("\nSample of results:")
print(df[sample_columns].head())


Sample of results:
                              dataset  \
0  ahmedheakl/arocrbench_synthesizear   
1  ahmedheakl/arocrbench_synthesizear   
2  ahmedheakl/arocrbench_synthesizear   
3  ahmedheakl/arocrbench_synthesizear   
4  ahmedheakl/arocrbench_synthesizear   

                                                                       ground_truth  \
0                           وَإِذا ما سَأَلَتْنِي عَن مَعْنَى لَفَظَهُ "عرب" عِنْدَ   
1              أَمّا فَهُم النُصُوصِ وَاِسْتِنْباط مَعانِيها بِوَجْهٍ صَحِيحٌ دقيق،   
2                          تُثِير فِيها الاسود، وَهُوَ ما يَتَعارَض مَعَ اِكْتِشافِ   
3  الجماعي، وَكادَت تَصِل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّة لِسَيْطَرَةِ أَكْبَرَ   
4                    مَعَهُ مَنْدِيلا فِيهِ جردقتان وَقَطَعَ لَحْم سَكْباج مُبَرَّد   

                                                                              qari  
0                            وإِذا ما سَأَلْتَني عَن مَعْنِي لَفَظَهُ "عرب" عَنْدَ  
1                     أما فهُم النُّصوص 