In [1]:
import triton
triton.__version__

'3.1.0'

In [2]:
import os
import pandas as pd
from datasets import load_dataset
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from jiwer import wer, cer
from tqdm import tqdm
import re

In [3]:
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [4]:
datasets_to_benchmark = [    
    "ahmedheakl/arocrbench_synthesizear",
    "ahmedheakl/arocrbench_patsocr",
    "ahmedheakl/arocrbench_arabicocr",
    "ahmedheakl/arocrbench_hindawi",
]

In [None]:
processor = AutoProcessor.from_pretrained("NAMAA-Space/Qari-OCR-0.2.2-Arabic-2B-Instruct", use_fast=True,)
model = AutoModelForImageTextToText.from_pretrained("NAMAA-Space/Qari-OCR-0.2.2-Arabic-2B-Instruct", torch_dtype="auto", device_map="cuda:1",)

In [6]:
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate"}
        ]
    }
]
text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

In [7]:
def qwen_extract_ocr(text_prompt, image):
    image = resize_image(image, min_factor=28, max_size=(1024, 1024))
    inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
    inputs = inputs.to("cuda:1")
    output_ids = model.generate(**inputs, max_new_tokens=1024)
    generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return " ".join(output_text)

In [8]:
def remove_diacritics(text):
    diacritics = [
        '\u0617', '\u0618', '\u0619', '\u061A',
        '\u064B', '\u064C', '\u064D', '\u064E', '\u064F', '\u0650',
        '\u0651', '\u0652', '\u0653', '\u0654', '\u0655', '\u0656',
        '\u0657', '\u0658', '\u0659', '\u065A', '\u065B', '\u065C',
        '\u065D', '\u065E', '\u065F', '\u0670'
    ]
    pattern = '[' + ''.join(diacritics) + ']'
    return re.sub(pattern, '', text)

def remove_english_letters(text):
    pattern = r'[a-zA-Z]'
    return re.sub(pattern, '', text)

def clean_text(text):
    cleaned = re.sub(r'[\n\t]+', ' ', text)
    cleaned = re.sub(r'\s+', ' ', cleaned)
    return cleaned.strip()



In [9]:
import math 
def resize_image(image, min_factor=28, max_size=(1024, 1024)):
    width, height = image.size
    factor = min_factor  # Must be divisible by this (processor's patch size * merge_size)
    
    # Calculate aspect ratio
    aspect_ratio = width / height
    
    # Ensure dimensions are at least factor and divisible by factor
    new_width = max(factor, width)
    new_height = max(factor, height)
    
    # Adjust to be multiples of factor while preserving aspect ratio as closely as possible
    new_width = math.ceil(new_width / factor) * factor
    new_height = math.ceil(new_height / factor) * factor
    
    # Recalculate to maintain aspect ratio
    if new_width / new_height > aspect_ratio:
        new_width = int(new_height * aspect_ratio)
        new_width = math.ceil(new_width / factor) * factor  # Ensure divisible by factor
    else:
        new_height = int(new_width / aspect_ratio)
        new_height = math.ceil(new_height / factor) * factor  # Ensure divisible by factor
    
    # Ensure within max_size
    if new_width > max_size[0] or new_height > max_size[1]:
        if new_width > max_size[0]:
            new_width = max_size[0] - (max_size[0] % factor)  # Largest multiple of factor <= max_size[0]
            new_height = int(new_width / aspect_ratio)
            new_height = math.ceil(new_height / factor) * factor
        if new_height > max_size[1]:
            new_height = max_size[1] - (max_size[1] % factor)  # Largest multiple of factor <= max_size[1]
            new_width = int(new_height * aspect_ratio)
            new_width = math.ceil(new_width / factor) * factor
    
    # Resize if necessary
    if (new_width, new_height) != (width, height):
        image = image.resize((new_width, new_height), Image.LANCZOS)
    return image

In [10]:
def benchmark_dataset(dataset_name, split="train"):
    print(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name, split=split)
    
    results = []
    sample_keys = dataset[0].keys()  
    ground_truth_key = "text" if "text" in sample_keys else "answer" if "answer" in sample_keys else None
    if ground_truth_key is None:
        raise ValueError(f"No suitable ground truth key ('text' or 'answer') found in dataset: {dataset_name}")
    
    # Prepare samples
    for sample in tqdm(dataset, desc=f"Preparing samples from {dataset_name}"):
        image = sample["image"]
        if image.mode != "RGB":
            image = image.convert("RGB")
        ground_truth = sample[ground_truth_key]
        results.append({
            "dataset": dataset_name,
            "image": image,
            "ground_truth": ground_truth,
            "qari2": None,
            "status": "pending"  # Add status to track processing outcome
        })
    
    skipped_samples = 0
    for i, sample in tqdm(enumerate(results), total=len(results), desc=f"Running Qari on {dataset_name}"):
        # torch.cuda.empty_cache()
        image = sample["image"]
        try:
            text_result = qwen_extract_ocr(text_prompt, image)
            results[i]["qari2"] = text_result
            results[i]["status"] = "success"
        except RuntimeError as e:
            if "CUDA out of memory" in str(e):
                print(f"Skipping sample {i} in {dataset_name} due to CUDA out of memory error.")
                results[i]["qari2"] = "Skipped - CUDA OOM"
                results[i]["status"] = "skipped"
                skipped_samples += 1
            else:
                print(f"Error processing sample {i} in {dataset_name}: {str(e)}")
                results[i]["qari2"] = f"Error - {str(e)}"
                results[i]["status"] = "error"
    
    print(f"Processed {len(results) - skipped_samples} samples successfully, skipped {skipped_samples} due to memory issues in {dataset_name}.")
    return results

In [11]:
all_results = []
for dataset_name in datasets_to_benchmark:
    dataset_results = benchmark_dataset(dataset_name)
    all_results.extend(dataset_results)

Loading dataset: ahmedheakl/arocrbench_synthesizear


Preparing samples from ahmedheakl/arocrbench_synthesizear: 100%|██████████| 500/500 [00:00<00:00, 875.17it/s]
Running Qari on ahmedheakl/arocrbench_synthesizear: 100%|██████████| 500/500 [54:43<00:00,  6.57s/it]  


Processed 500 samples successfully, skipped 0 due to memory issues in ahmedheakl/arocrbench_synthesizear.
Loading dataset: ahmedheakl/arocrbench_patsocr


Preparing samples from ahmedheakl/arocrbench_patsocr: 100%|██████████| 500/500 [00:00<00:00, 685.52it/s]
Running Qari on ahmedheakl/arocrbench_patsocr: 100%|██████████| 500/500 [1:00:41<00:00,  7.28s/it]


Processed 500 samples successfully, skipped 0 due to memory issues in ahmedheakl/arocrbench_patsocr.
Loading dataset: ahmedheakl/arocrbench_arabicocr


Preparing samples from ahmedheakl/arocrbench_arabicocr: 100%|██████████| 50/50 [00:00<00:00, 781.64it/s]
Running Qari on ahmedheakl/arocrbench_arabicocr: 100%|██████████| 50/50 [10:11<00:00, 12.22s/it]


Processed 50 samples successfully, skipped 0 due to memory issues in ahmedheakl/arocrbench_arabicocr.
Loading dataset: ahmedheakl/arocrbench_hindawi


Preparing samples from ahmedheakl/arocrbench_hindawi: 100%|██████████| 200/200 [00:01<00:00, 140.61it/s]
Running Qari on ahmedheakl/arocrbench_hindawi: 100%|██████████| 200/200 [2:44:45<00:00, 49.43s/it]  

Processed 200 samples successfully, skipped 0 due to memory issues in ahmedheakl/arocrbench_hindawi.





In [12]:
df = pd.DataFrame(all_results)

In [13]:
df[df['status']!='success']

Unnamed: 0,dataset,image,ground_truth,qari2,status


In [14]:

df.to_csv("uncleand_qari2.csv")

In [15]:
df.head()

Unnamed: 0,dataset,image,ground_truth,qari2,status
0,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=639x114 at 0x775534276B10>,"وَإِذا ما سَأَلَتْنِي عَن مَعْنَى لَفَظَهُ ""عرب"" عِنْدَ","وَإِذا ما سَأَلْتَنِي عَن مَعْنَى لَفْظَهُ ""عرب"" عِنْدَ",success
1,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1182x147 at 0x775534276ED0>,أَمّا فَهُم النُصُوصِ وَاِسْتِنْباط مَعانِيها بِوَجْهٍ صَحِيحٌ دقيق،,أمّا فَهْم النُّصوصِ وَإِسْتِنْباطِ مَعانِيها بِوَجْهٍ ضَحِيحٌ دقيقٍ،,success
2,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=994x163 at 0x775534277450>,تُثِير فِيها الاسود، وَهُوَ ما يَتَعارَض مَعَ اِكْتِشافِ,تُثِير فِيها الاسود، وَهْوَ ما يَتَعارَض مَعَ إِكْتِشافِ,success
3,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1315x224 at 0x7755342779D0>,الجماعي، وَكادَت تَصِل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّة لِسَيْطَرَةِ أَكْبَرَ,وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى المَرْحَلَةِ الاِشْتِراكِيَّةِ لِسَيْطَرَةِ أَكْبَرَ الجماعي، وَكَادَت تَصل إِلَى الم,success
4,ahmedheakl/arocrbench_synthesizear,<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1224x129 at 0x775534277F50>,مَعَهُ مَنْدِيلا فِيهِ جردقتان وَقَطَعَ لَحْم سَكْباج مُبَرَّد,مَعَهْ مَنْدِيلا فِيهِ جردقتان وَقَطَعَ لَحْم سَكْباج مْبَرَّد,success


In [16]:
from evaluate import load

cer = load('cer')
wer = load('wer')


In [17]:


def evaluatemodel(dataset):
    preds = []
    refs = []

    for i in range(len(dataset)):
        preds.append(dataset.iloc[i]['qari2'])
        refs.append(dataset.iloc[i]['ground_truth'])

    wer_score = wer.compute(predictions=preds, references=refs)

    cer_score = cer.compute(predictions=preds, references=refs)
    return {"wer": wer_score, "cer": cer_score}


In [18]:
for dataset_name in datasets_to_benchmark:
    subset_df = df[df["dataset"] == dataset_name]
    print(dataset_name)
    print(evaluatemodel(subset_df))


ahmedheakl/arocrbench_synthesizear
{'wer': 0.9248366013071896, 'cer': 0.7236747667384394}
ahmedheakl/arocrbench_patsocr
{'wer': 1.1536427025727338, 'cer': 1.1630813307318038}
ahmedheakl/arocrbench_arabicocr
{'wer': 0.0694304010785305, 'cer': 0.016119812507145306}
ahmedheakl/arocrbench_hindawi
{'wer': 0.49612378770283416, 'cer': 0.28948339582074223}


In [19]:
for dataset_name in datasets_to_benchmark:
    subset_df = df[df["dataset"] == dataset_name]
    print(dataset_name)
    print(evaluatemodel(subset_df))


ahmedheakl/arocrbench_synthesizear
{'wer': 0.9248366013071896, 'cer': 0.7236747667384394}
ahmedheakl/arocrbench_patsocr
{'wer': 1.1536427025727338, 'cer': 1.1630813307318038}
ahmedheakl/arocrbench_arabicocr
{'wer': 0.0694304010785305, 'cer': 0.016119812507145306}
ahmedheakl/arocrbench_hindawi
{'wer': 0.49612378770283416, 'cer': 0.28948339582074223}


In [20]:
evaluatemodel(df)

{'wer': 0.6195266777735059, 'cer': 0.44736391053669955}

In [21]:
evaluatemodel(df)

{'wer': 0.6195266777735059, 'cer': 0.44736391053669955}

In [22]:
import pandas as pd
df = pd.read_csv("uncleand_qari2.csv",index_col="Unnamed: 0")

In [23]:
df["ground_truth"] = df["ground_truth"].astype(str).fillna("").apply(remove_english_letters).apply(clean_text).astype(str)
df["qari2"] = df["qari2"].astype(str).fillna("").apply(remove_english_letters).apply(clean_text).astype(str)
df["ground_truth_t"] = df["ground_truth"].astype(str).fillna("").apply(remove_diacritics).astype(str)
df["qari2_t"] = df["qari2"].astype(str).fillna("").apply(remove_diacritics).astype(str)

In [1]:
df['dataset'].unique()

NameError: name 'df' is not defined

In [25]:
for dataset_name in datasets_to_benchmark:
    subset_df = df[df["dataset"] == dataset_name]
    print(f"\nResults for {dataset_name}:")
    
    wer_score = wer(subset_df["ground_truth"].tolist(), subset_df["qari2"].tolist())
    cer_score = cer(subset_df["ground_truth"].tolist(), subset_df["qari2"].tolist())
    print(f"Qari2 - WER: {wer_score:.2f}, CER: {cer_score:.2f}")
    
    wer_score_t = wer(subset_df["ground_truth_t"].tolist(), subset_df["qari2_t"].tolist())
    cer_score_t = cer(subset_df["ground_truth_t"].tolist(), subset_df["qari2_t"].tolist())
    print(f"Qari2 (no diacritics) - WER: {wer_score_t:.2f}, CER: {cer_score_t:.2f}")


Results for ahmedheakl/arocrbench_synthesizear:


TypeError: 'WER' object is not callable

In [None]:
df.to_csv("qari2_ocr_benchmark_results.csv", index=False)
print("Results saved to 'qari2_ocr_benchmark_results.csv'")

In [None]:
print("\nSample of results:")
print(df[["dataset", "ground_truth", "qari2"]].head())