In [1]:
import os
import pandas as pd
from datasets import load_dataset
from PIL import Image
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from jiwer import wer, cer
from tqdm import tqdm
import re

In [2]:
# Allocator setting for PyTorch's CUDA memory allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Lift every pandas display limit so long OCR strings are shown untruncated.
for display_option in ("display.max_columns", "display.max_rows", "display.max_colwidth", "display.width"):
    pd.set_option(display_option, None)

In [3]:
# Dataset identifiers handed to load_dataset; they all share one prefix,
# so only the distinguishing suffix is listed.
datasets_to_benchmark = [
    f"ahmedheakl/arocrbench_{suffix}"
    for suffix in (
        "synthesizear",
        "patsocr",
        "historyar",
        "historicalbooks",
        "khattparagraph",
        "adab",
        "muharaf",
        "onlinekhatt",
        "khatt",
        "isippt",
        "arabicocr",
        "hindawi",
        "evarest",
    )
]

In [4]:
# The checkpoint id and target device are named once so the processor and the
# model are always loaded from the same checkpoint.
MODEL_ID = "NAMAA-Space/Qari-OCR-0.1-VL-2B-Instruct"
DEVICE = "cuda:1"

processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, torch_dtype="auto", device_map=DEVICE)

In [5]:
# Single user turn: an image placeholder followed by the transcription instruction.
messages = [
    dict(
        role="user",
        content=[
            dict(type="image"),
            dict(
                type="text",
                text="Below is the image of one page of a document. Just return the plain text representation of this document as if you were reading it naturally. Do not hallucinate",
            ),
        ],
    )
]
# Render the chat once; the resulting prompt string is reused for every image.
text_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

In [6]:
def qwen_extract_ocr(text_prompt, image):
    """Run the OCR model on one image and return the decoded text.

    Args:
        text_prompt: Prompt string passed to the processor together with the image.
        image: Image to transcribe.

    Returns:
        The decoded generation (at most 1024 new tokens) with the fixed
        "The Arabic text in the image is:" preamble removed.
    """
    inputs = processor(text=[text_prompt], images=[image], padding=True, return_tensors="pt")
    # Follow wherever the model was actually placed instead of repeating a
    # hard-coded device string that could drift from the loading cell.
    inputs = inputs.to(model.device)
    output_ids = model.generate(**inputs, max_new_tokens=1024)
    # Drop the first len(prompt) tokens of each sequence so only the newly
    # generated continuation is decoded.
    generated_ids = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    return " ".join(output_text).replace("The Arabic text in the image is:\n\n", "")

In [7]:
def remove_diacritics(text):
    """Return ``text`` with the listed Arabic combining marks removed.

    The removed code points are U+0617-U+061A, U+064B-U+065F and U+0670.
    """
    mark_codepoints = [*range(0x0617, 0x061B), *range(0x064B, 0x0660), 0x0670]
    char_class = '[' + ''.join(map(chr, mark_codepoints)) + ']'
    return re.sub(char_class, '', text)

def remove_english_letters(text):
    """Return ``text`` with every ASCII letter (a-z, A-Z) deleted."""
    ascii_letters = re.compile(r'[a-zA-Z]')
    return ascii_letters.sub('', text)

def clean_text(text):
    """Collapse newlines, tabs and whitespace runs to single spaces and trim the ends."""
    collapsed = text
    # Newline/tab runs first, then any remaining whitespace run.
    for whitespace_pattern in (r'[\n\t]+', r'\s+'):
        collapsed = re.sub(whitespace_pattern, ' ', collapsed)
    return collapsed.strip()



In [8]:
def benchmark_dataset(dataset_name, split="train"):
    """Run the OCR model over every sample of a dataset and collect the outputs.

    Args:
        dataset_name: Dataset identifier passed to ``load_dataset``.
        split: Split to load; defaults to ``"train"``.

    Returns:
        A list with one dict per sample holding ``dataset``, ``image``
        (converted to RGB), ``ground_truth``, ``qari`` (the model output, or a
        "Skipped"/"Error" placeholder message) and ``status`` (``"success"``,
        ``"skipped"`` or ``"error"``).

    Raises:
        ValueError: If the dataset has neither a ``text`` nor an ``answer`` column.
    """
    print(f"Loading dataset: {dataset_name}")
    dataset = load_dataset(dataset_name, split=split)

    # Read the schema instead of dataset[0]: no sample has to be loaded just to
    # list its keys, and an empty split no longer raises IndexError.
    available_columns = dataset.column_names
    ground_truth_key = next((key for key in ("text", "answer") if key in available_columns), None)
    if ground_truth_key is None:
        raise ValueError(f"No suitable ground truth key ('text' or 'answer') found in dataset: {dataset_name}")

    # Prepare samples
    results = []
    for sample in tqdm(dataset, desc=f"Preparing samples from {dataset_name}"):
        image = sample["image"]
        if image.mode != "RGB":
            image = image.convert("RGB")
        results.append({
            "dataset": dataset_name,
            "image": image,
            "ground_truth": sample[ground_truth_key],
            "qari": None,
            "status": "pending"  # Updated below to success / skipped / error
        })

    skipped_samples = 0
    failed_samples = 0
    for i, record in enumerate(tqdm(results, desc=f"Running Qari on {dataset_name}")):
        # Release cached CUDA blocks between samples.
        torch.cuda.empty_cache()
        try:
            record["qari"] = qwen_extract_ocr(text_prompt, record["image"])
            record["status"] = "success"
        except RuntimeError as e:
            # Best-effort run: a failing sample is recorded and the loop goes on.
            if "CUDA out of memory" in str(e):
                print(f"Skipping sample {i} in {dataset_name} due to CUDA out of memory error.")
                record["qari"] = "Skipped - CUDA OOM"
                record["status"] = "skipped"
                skipped_samples += 1
            else:
                print(f"Error processing sample {i} in {dataset_name}: {str(e)}")
                record["qari"] = f"Error - {str(e)}"
                record["status"] = "error"
                failed_samples += 1

    # Samples that failed for non-memory reasons are no longer reported as successes.
    succeeded_samples = len(results) - skipped_samples - failed_samples
    print(
        f"Processed {succeeded_samples} samples successfully, "
        f"skipped {skipped_samples} due to memory issues, "
        f"{failed_samples} failed with other errors in {dataset_name}."
    )
    return results

In [None]:
# Accumulate the per-sample records of every dataset into one flat list.
all_results = []
for name in datasets_to_benchmark:
    all_results += benchmark_dataset(name)

Loading dataset: ahmedheakl/arocrbench_synthesizear


Preparing samples from ahmedheakl/arocrbench_synthesizear: 100%|██████████| 500/500 [00:00<00:00, 1135.05it/s]
Running Qari on ahmedheakl/arocrbench_synthesizear:   3%|▎         | 16/500 [01:00<39:10,  4.86s/it]

In [None]:
# One row per benchmarked sample; the columns come from the keys of the result dicts.
df = pd.DataFrame(all_results)

In [None]:
# Snapshot of the outputs before any text normalisation. The "image" column
# holds in-memory image objects, which a CSV can only store as an object repr,
# so it is left out of the file.
df.drop(columns=["image"], errors="ignore").to_csv("uncleand_qari.csv")

In [None]:
# Normalise reference and prediction the same way: strip ASCII letters,
# collapse whitespace, then keep a diacritics-free copy in "<column>_t".
for text_column in ("ground_truth", "qari"):
    df[text_column] = df[text_column].apply(remove_english_letters).apply(clean_text)
    df[f"{text_column}_t"] = df[text_column].apply(remove_diacritics)

In [None]:
for dataset_name in datasets_to_benchmark:
    subset_df = df[df["dataset"] == dataset_name]
    print(f"\nResults for {dataset_name}:")

    # Score only genuine model outputs: skipped/error rows carry a placeholder
    # message in "qari" rather than a transcription. Rows whose reference is
    # empty after normalisation are dropped too, because jiwer raises on empty
    # reference strings.
    is_scorable = (subset_df["status"] == "success") & subset_df["ground_truth_t"].str.strip().ne("")
    scorable_df = subset_df[is_scorable]
    excluded = len(subset_df) - len(scorable_df)
    if excluded:
        print(f"Excluded {excluded} of {len(subset_df)} samples (failed inference or empty reference)")
    if scorable_df.empty:
        print("No scorable samples; metrics skipped.")
        continue

    references = scorable_df["ground_truth"].tolist()
    predictions = scorable_df["qari"].tolist()
    wer_score = wer(references, predictions)
    cer_score = cer(references, predictions)
    print(f"Qari - WER: {wer_score:.2f}, CER: {cer_score:.2f}")

    references_t = scorable_df["ground_truth_t"].tolist()
    predictions_t = scorable_df["qari_t"].tolist()
    wer_score_t = wer(references_t, predictions_t)
    cer_score_t = cer(references_t, predictions_t)
    print(f"Qari (no diacritics) - WER: {wer_score_t:.2f}, CER: {cer_score_t:.2f}")

In [None]:
# The "image" column holds in-memory image objects, which a CSV can only store
# as an object repr, so it is left out of the saved results.
df.drop(columns=["image"], errors="ignore").to_csv("qari_ocr_benchmark_results.csv", index=False)
print("Results saved to 'qari_ocr_benchmark_results.csv'")

In [None]:
# Show the first rows of the columns needed for a manual spot check.
print(
    "\nSample of results:",
    df.loc[:, ["dataset", "ground_truth", "qari"]].head(),
    sep="\n",
)