## Local

In [None]:
from huggingface_hub import list_models

def get_models_by_multi_filters():
    """Print the ids of up to ten models that match three filters at once.

    Calls `list_models` with task "image-classification", library "pytorch"
    and trained_dataset "imagenet", materialises the whole result into a list,
    and prints one "- <modelId>" line for each of the first ten entries.
    Nothing is printed when the query yields no models. Returns None.
    """
    matches = list(
        list_models(
            task="image-classification",
            library="pytorch",
            trained_dataset="imagenet",
        )
    )
    if not matches:
        return
    for entry in matches[:10]:
        print(f"- {entry.modelId}")

def get_qwen_models(search_string):
    """Print how many models match `search_string` and the ids of the first ten.

    Args:
        search_string: Value forwarded unchanged as the `filter` argument of
            `list_models` (the function name says "qwen", but any value works).

    Prints nothing when no model is returned. Returns None.
    """
    # Drain the iterable into a list so it can be counted and sliced.
    found = list(list_models(filter=search_string))
    if not found:
        return
    print(f"Found {len(found)} models (may be paginated):")
    # Only the first ten entries are shown.
    for entry in found[:10]:
        print(f"- {entry.modelId}")

def get_gpt2_models():
    """Print the count and the first ten ids of models returned for task "gpt2".

    Prints "No GPT models found." when the query returns nothing.
    Returns None.
    """
    # NOTE(review): "gpt2" is passed as the `task` value; it looks like a model
    # name rather than a task category -- confirm this is the intended filter.
    found = list(list_models(task="gpt2"))
    if not found:
        print("No GPT models found.")
        return
    print(f"Found {len(found)} GPT models. Printing First 10:")
    for entry in found[:10]:
        print(f"- {entry.modelId}")

def get_top_10_models():
    """Print the ten models returned by a downloads-sorted `list_models` query.

    Requests at most ten models sorted by "downloads" with direction=-1 and
    prints one line per model: id, creation timestamp, download count and
    pipeline tag. Prints nothing when the query returns no models.
    Returns None.
    """
    top_models = list(list_models(sort="downloads", direction=-1, limit=10))
    # Start at index 0: the previous [1:10] slice dropped the first (top-ranked)
    # entry and therefore printed only nine rows.
    for entry in top_models[:10]:
        print(f"- {entry.modelId}   {entry.created_at}   {entry.downloads}   {entry.pipeline_tag}")



In [None]:
"""
Application to provide benchmark timers for code.
Usage:
# from my_timer_class import MyTimer
from my_timer_func import my_timer
import time

@MyTimer3(name="decorator")
@my_timer
"""

import functools
import time

def my_timer(orig_func):
    """Decorator that reports how long each call to `orig_func` takes.

    Args:
        orig_func: The callable to wrap.

    Returns:
        A wrapper with the metadata of `orig_func` (via `functools.wraps`) that
        forwards all arguments, prints the elapsed `time.perf_counter()` time
        with four decimals, and returns whatever `orig_func` returned.
    """
    @functools.wraps(orig_func)
    def wrapper_timer(*args, **kwargs):
        started = time.perf_counter()
        result = orig_func(*args, **kwargs)
        elapsed_time = time.perf_counter() - started
        print(f"Elapsed time to run {orig_func.__name__}: {elapsed_time:0.4f} seconds")
        return result

    return wrapper_timer


class MyTimer():
    """Context manager that prints how long its `with` block took.

    Two durations are printed on exit: one measured with `time.time()` and one
    with `time.perf_counter()`.

    Usage:
        from MyTimer import MyTimer
        with MyTimer():
            func(x, y)
    """

    def __init__(self):
        # Seed the start marks at construction so the attributes always exist,
        # even if a caller reads them before entering the block.
        self.start = time.time()
        self.start_p = time.perf_counter()

    def __enter__(self):
        # Restart both clocks on entry. Timing from construction would count
        # any work done between `MyTimer()` and `with`, and would make a reused
        # instance report the time since it was first created.
        self.start = time.time()
        self.start_p = time.perf_counter()
        return self

    def __exit__(self, *args, **kwargs):
        end = time.time()
        end_p = time.perf_counter()
        runtime = end - self.start
        runtime_p = end_p - self.start_p
        msg = 'The function took {time} seconds to complete'
        print(msg.format(time=runtime))
        msg_p = 'The function took {time} perf seconds to complete'
        print(msg_p.format(time=runtime_p))
        # Implicitly returns None, so an exception raised in the block propagates.

In [None]:
!pip install qwen_vl_utils

In [None]:
import huggingface_hub
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

In [None]:
# List models for the "Qwen" filter: prints the match count and the first ten ids.
get_qwen_models("Qwen")


Found 187 models (may be paginated):
- ystemsrx/Qwen2-Boundless
- DavidAU/AI_Autocorrect__Auto-Creative-Enhancement__Auto-Low-Quant-Optimization__gguf-exl2-hqq-SOFTWARE
- anemll/anemll-Qwen3-4B-ctx1024_0.3.0
- A2H0H0R1/Qwen-7B-Chat-Int4-Qlora-biology
- Existance/Qwen1-5-4B-Chat-hindi-sft-v2
- Alibaba-NLP/gte-Qwen1.5-7B-instruct
- MoMonir/gte-Qwen1.5-7B-instruct-GGUF
- agier9/gte-Qwen1.5-7B-instruct-Q5_K_M-GGUF
- SandLogicTechnologies/Nxcode-CQ-7B-orpo-GGUF
- Sri-Vigneshwar-DJ/hawky-ai-Qwen2-Math-72B-Instruct-GGUF


In [None]:
# Load the Qwen2.5-VL 3B instruct checkpoint onto the GPU.
# The commented-out lines are alternative device placements ("auto" / "cpu").
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    # "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
    # "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="cpu"
    "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="cuda"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Load the processor that belongs to the same checkpoint as `model`.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


In [None]:
# larger_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     # "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="auto"
#     # "Qwen/Qwen2.5-VL-3B-Instruct", torch_dtype="auto", device_map="cpu"
#     "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype="auto", device_map="cuda"
# )

### Bank Statement

In [None]:
# Bank statement: choose the input image and the JSON-schema extraction prompt.
# NOTE: the ID, Passport and Bill cells below assign the same `image` / `prompt`
# names, so whichever of these cells ran last decides what the model sees.
# image = r"..\images\dl1.jpg"
# prompt = "Extract all text found on the image, including handwritten signatures"

# image = r"/content/UAE_id.jpg"
image = r"/content/BankStatement2.jpeg"
prompt = """
You are a financial data extraction agent.

Your task is to:

1. Extract **all transactions** from the provided bank statement image.
2. Return the data in **valid, minified JSON format** with the following structure:

{
  "account_details": {
    "account_holder_name": "",
    "account_number": "",
    "iban_number": "",
    "currency": "",
    "account_type": "",
    "branch": "",
    "statement_period_from": "",
    "statement_period_to": ""
  },
  "transactions": [
    {
      "transaction_date": "",
      "value_date": "",
      "cheque_ref_no": "",
      "description": "",
      "debit": "",
      "credit": "",
      "balance": ""
    }
    // Continue for all transactions listed
  ],
  "notes": ""
}

3. For empty or missing fields, return an empty string ("").

4. Do NOT include any explanations or extra text. Only output a **valid JSON object**.

Ensure that:
- Dates are in YYYY-MM-DD format if possible.
- Debit, credit, and balance amounts are parsed as strings exactly as shown.
- Include any footnotes or important notes under the “notes” field.

Begin your extraction now.
"""


# image = r"..\images\WalmartReceipt.png"
# prompt = "What is the account number shown on this image?"

### ID

In [None]:
# UAE identity card: choose the input image and the JSON-schema extraction prompt.
# NOTE: other document cells in this notebook assign the same `image` / `prompt`
# names, so whichever of these cells ran last decides what the model sees.
image = r"/content/UAE_id.jpg"

prompt = """
You are an identity card data extraction agent.

Your task is to:

1. Extract **all key details** from the provided UAE Identity Card image.
2. Return the data in **valid, minified JSON format** with the following structure:

{
  "id_number": "",
  "full_name_english": "",
  "full_name_arabic": "",
  "nationality_english": "",
  "nationality_arabic": "",
  "gender": "",
  "date_of_birth": "",
  "expiry_date": "",
  "additional_notes": ""
}

3. For empty or missing fields, return an empty string ("").

4. Do NOT include any explanations or extra text. Only output a **valid JSON object**.

Ensure that:
- Dates are in YYYY-MM-DD format if possible.
- Both English and Arabic fields are captured accurately.
- Any additional information such as profession, place of issue, or notes is included under “additional_notes”.

Begin your extraction now.
"""


### Passport

In [None]:
# Egyptian passport (single-language fields): choose the input image and the extraction prompt.
# NOTE: other document cells in this notebook assign the same `image` / `prompt`
# names, so whichever of these cells ran last decides what the model sees.
image = r"/content/Passport.png"

prompt = """
You are a passport data extraction agent.

Your task is to:

1. Extract **all key details** from the provided Egyptian passport image.
2. Return the data in **valid, minified JSON format** with the following structure:

{
  "passport_number": "",
  "full_name_english": "",
  "full_name_arabic": "",
  "date_of_birth": "",
  "place_of_birth": "",
  "nationality": "",
  "sex": "",
  "date_of_issue": "",
  "date_of_expiry": "",
  "issuing_office": "",
  "profession": "",
  "mrz": "",
  "additional_notes": ""
}

3. For empty or missing fields, return an empty string ("").

4. Do NOT include any explanations or extra text. Only output a **valid JSON object**.

Ensure that:
- Dates are in YYYY-MM-DD format if possible.
- Both English and Arabic names are captured accurately.
- MRZ (Machine Readable Zone) is extracted as a single string under “mrz”.
- Any additional information is included under “additional_notes”.

Begin your extraction now.
"""


In [None]:
# Egyptian passport (separate English and Arabic fields): choose the input image and the extraction prompt.
# NOTE: other document cells in this notebook assign the same `image` / `prompt`
# names, so whichever of these cells ran last decides what the model sees.
image = r"/content/Passport.png"

prompt = """
You are a passport data extraction agent.

Your task is to:

1. Extract **all key details** from the provided Egyptian passport image in **both English and Arabic**.
2. Return the data in **valid, minified JSON format** with the following structure:

{
  "passport_number": "",
  "full_name_english": "",
  "full_name_arabic": "",
  "date_of_birth_english": "",
  "date_of_birth_arabic": "",
  "place_of_birth_english": "",
  "place_of_birth_arabic": "",
  "nationality_english": "",
  "nationality_arabic": "",
  "sex_english": "",
  "sex_arabic": "",
  "date_of_issue_english": "",
  "date_of_issue_arabic": "",
  "date_of_expiry_english": "",
  "date_of_expiry_arabic": "",
  "issuing_office": "",
  "profession_english": "",
  "profession_arabic": "",
  "mrz": "",
  "additional_notes": ""
}

3. For empty or missing fields, return an empty string ("").

4. Do NOT include any explanations or extra text. Only output a **valid JSON object**.

Ensure that:
- Dates are in YYYY-MM-DD format if possible.
- Arabic and English fields are mapped clearly as shown.
- MRZ (Machine Readable Zone) is extracted fully under “mrz”.
- Any additional information is included under “additional_notes”.

Begin your extraction now.
"""


### Bill

In [None]:
# Invoice / bill: choose the input image and the JSON-schema extraction prompt.
# NOTE: the earlier document cells assign the same `image` / `prompt` names;
# on a top-to-bottom run this cell is the last of them, so its values are the ones used.
image = r"/content/bill1.png"

prompt = """
You are an invoice data extraction agent.

Your task is to:

1. Extract **all key details** from the provided invoice image.
2. Return the data in **valid, minified JSON format** with the following structure:

{
  "invoice_number": "",
  "purchase_order_number": "",
  "date": "",
  "due_date": "",
  "bill_by": {
    "name": "",
    "customer_address": ""
  },
  "company": {
    "name": "",
    "address": "",
    "email": ""
  },
  "items": [
    {
      "description": "",
      "quantity": "",
      "unit_price": "",
      "total_price": ""
    }
    // Continue for all line items
  ],
  "subtotal": "",
  "tax": "",
  "total": ""
}

3. For empty or missing fields, return an empty string ("").

4. Do NOT include any explanations or extra text. Only output a **valid JSON object**.

Ensure that:
- All monetary values include currency symbols as shown.
- The items table is extracted with accurate columns.
- Customer address, company address, and emails are captured fully.

Begin your extraction now.
"""


In [None]:
from PIL import Image

def resize_image(image_path, max_dim=768, output_path=None):
    """Resize an image to fit inside a `max_dim` x `max_dim` box, save it, and return it.

    Args:
        image_path: Path of the image file to open.
        max_dim: Bound passed to `Image.thumbnail` for both width and height.
        output_path: Where to write the resized image. When omitted, the
            resized image is written back to `image_path`, overwriting the
            source file (the original behaviour). Pass a different path to
            keep the original file untouched.

    Returns:
        The resized image object (the same object that was saved).
    """
    img = Image.open(image_path)
    img.thumbnail((max_dim, max_dim), Image.LANCZOS)
    img.save(image_path if output_path is None else output_path)
    return img

# Example usage:
# NOTE: this rebinds `image` from the file path to the resized image object, so
# re-running the cell would hand an image object (not a path) to resize_image.
image = resize_image(image)
print(image.size)


(768, 548)


In [None]:
# Single-turn chat with one user message that carries the image followed by the text prompt.
messages = [
{
    "role": "user",
    "content": [
        {
            "type": "image",
            "image": image,
        },
        {"type": "text", "text": prompt},
    ],
}
]

In [None]:
# Preparation for inference
# Render the chat messages to a prompt string (tokenize=False) with the generation prompt appended.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

In [None]:
# Collect the image and video inputs referenced by `messages`.
image_inputs, video_inputs = process_vision_info(messages)


In [None]:
# Inspect the extracted image inputs.
image_inputs

[<PIL.Image.Image image mode=RGB size=756x560>]

In [None]:
# Build the model inputs from the prompt text and vision inputs as PyTorch tensors ("pt").
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)

In [None]:
# Run Python garbage collection, then release PyTorch's cached GPU memory before generating.
import torch, gc
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Move the input tensors to the GPU.
inputs = inputs.to("cuda")

# new!!
# NOTE(review): the model was already loaded with device_map="cuda"; this extra
# move looks redundant -- confirm it is still needed.
model = model.to("cuda")

In [None]:
# Generate the answer; the commented-out lines are alternative token budgets.
# NOTE(review): the bank-statement output recorded later in this notebook stops
# mid-object, which may mean the budget was exhausted -- confirm and raise it if so.
# generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids = model.generate(**inputs, max_new_tokens=1024)
# generated_ids = model.generate(**inputs, max_new_tokens=2048)

# generated_ids = model.generate(**inputs, max_new_tokens=256)

In [None]:
# For each sequence, drop the first len(input ids) tokens so only what follows the prompt remains.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

In [None]:
# Decode the trimmed token ids to text, skipping special tokens.
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

In [None]:
# Show the decoded model reply (a list with one string per input).
output_text

['```json\n{\n  "passport_number": "A37365438",\n  "full_name_english": "YOUSSEF MAHMOUD MOHAMED MAHMOUD ABDALLA",\n  "full_name_arabic": "يوسف محمود محمد عد الله",\n  "date_of_birth_english": "18/09/2001",\n  "date_of_birth_arabic": "٢٠١١/٩/١٨",\n  "place_of_birth_english": "ALEXANDRIA",\n  "place_of_birth_arabic": "الأسكندرية",\n  "nationality_english": "EGYPTIAN",\n  "nationality_arabic": "مصر",\n  "sex_english": "M",\n  "sex_arabic": "ذكر",\n  "date_of_issue_english": "16/04/2024",\n  "date_of_issue_arabic": "٢٠٢٤/٤/١٦",\n  "date_of_expiry_english": "15/04/2031",\n  "date_of_expiry_arabic": "٢٠٣١/٤/١٥",\n  "issuing_office": "جهاة إدار الجواز",\n  "profession_english": "STUDENT",\n  "profession_arabic": "طالب كلية هندسة ج طوم وتكنولوجيا",\n  "mrz": "P<EGYABDALLA<<YOUSSEF<MAHMOUD<MOHAMED<MAHMOU A373654385EGY0109187M3104150<<<<<<<<<<<<<<02",\n  "additional_notes": ""\n}\n```']

In [None]:
import json
import re
from IPython.display import JSON as ColabJSON

def display_clean_json(model_output):
    """
    Cleans LLM JSON outputs with markdown fences and displays them nicely.

    Args:
        model_output: The reply text, or a list of replies (only the first
            element is used; an empty list is treated as empty text).

    Returns:
        A `ColabJSON` display object wrapping the parsed data. If that object
        cannot be built, the data is pretty-printed and the parsed data itself
        is returned. If the text is not valid JSON, the error and the cleaned
        text are printed and None is returned.
    """
    if isinstance(model_output, list):
        # An empty batch has nothing to parse; let it reach the parse-failure path.
        model_output = model_output[0] if model_output else ""

    # Remove a leading ``` or ```json fence (plus the newline after it) and a
    # trailing ``` fence. The previous pattern escaped the backslash, so it
    # looked for a literal "\n" text and never stripped the opening fence.
    cleaned = model_output.strip()
    cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s*```$', '', cleaned).strip()

    # Parse JSON
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as e:
        print("❌ JSON parsing failed:", e)
        print("🔎 Raw output:", cleaned)
        return None

    # Display nicely in Colab, fallback to print
    try:
        return ColabJSON(parsed)
    except Exception:
        # Catch Exception rather than everything so Ctrl-C still interrupts.
        print(json.dumps(parsed, indent=2, ensure_ascii=False))
        return parsed

# Example usage
# NOTE: the result is assigned rather than left as the cell's last expression,
# so a returned display object is stored in `parsed_json` and not rendered here.
parsed_json = display_clean_json(output_text)


❌ JSON parsing failed: Expecting value: line 1 column 1 (char 0)
🔎 Raw output: ```json
{
  "passport_number": "A37365438",
  "full_name_english": "YOUSSEF MAHMOUD MOHAMED MAHMOUD ABDALLA",
  "full_name_arabic": "يوسف محمود محمد عد الله",
  "date_of_birth_english": "18/09/2001",
  "date_of_birth_arabic": "٢٠١١/٩/١٨",
  "place_of_birth_english": "ALEXANDRIA",
  "place_of_birth_arabic": "الأسكندرية",
  "nationality_english": "EGYPTIAN",
  "nationality_arabic": "مصر",
  "sex_english": "M",
  "sex_arabic": "ذكر",
  "date_of_issue_english": "16/04/2024",
  "date_of_issue_arabic": "٢٠٢٤/٤/١٦",
  "date_of_expiry_english": "15/04/2031",
  "date_of_expiry_arabic": "٢٠٣١/٤/١٥",
  "issuing_office": "جهاة إدار الجواز",
  "profession_english": "STUDENT",
  "profession_arabic": "طالب كلية هندسة ج طوم وتكنولوجيا",
  "mrz": "P<EGYABDALLA<<YOUSSEF<MAHMOUD<MOHAMED<MAHMOU A373654385EGY0109187M3104150<<<<<<<<<<<<<<02",
  "additional_notes": ""
}


In [None]:
import json
import re
from IPython.display import JSON as ColabJSON

def display_parsed_json(model_output):
    """
    Clean model output, parse as JSON, and display nicely.
    Works in Colab (interactive) and VS Code (pretty print).

    Args:
        model_output: The reply text, or a list of replies (only the first
            element is used; an empty list is treated as empty text).

    Returns:
        A `ColabJSON` display object wrapping the parsed data. If that object
        cannot be built, the data is pretty-printed and the parsed data itself
        is returned. If the text is not valid JSON, the error and the cleaned
        text are printed and None is returned.
    """
    # Extract string from list if needed
    if isinstance(model_output, list):
        # An empty batch has nothing to parse; let it reach the parse-failure path.
        model_output = model_output[0] if model_output else ""

    # Remove a leading ``` or ```json fence (plus the newline after it) and a
    # trailing ``` fence. The previous pattern escaped the backslash, so it
    # looked for a literal "\n" text and never stripped the opening fence.
    cleaned = model_output.strip()
    cleaned = re.sub(r'^```(?:json)?\s*', '', cleaned, flags=re.IGNORECASE)
    cleaned = re.sub(r'\s*```$', '', cleaned).strip()

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f"JSON parsing failed: {e}")
        print("Raw output:", cleaned)
        return None

    # Display interactively if in Colab
    try:
        return ColabJSON(parsed)
    except Exception:
        # Pretty print fallback. ensure_ascii=False keeps non-ASCII text (such
        # as Arabic field values) readable instead of \uXXXX escapes.
        print(json.dumps(parsed, indent=2, ensure_ascii=False))
        return parsed

# Example usage with your variable
# NOTE: the result is assigned rather than left as the cell's last expression,
# so a returned display object is stored in `parsed_json` and not rendered here.
parsed_json = display_parsed_json(output_text)


JSON parsing failed: Expecting value: line 1 column 1 (char 0)
Raw output: ```json
{
  "account_details": {
    "account_holder_name": "NASEER AHMAD",
    "account_number": "65574",
    "iban_number": "AE84 0240 0299 2004 6395 901",
    "currency": "AED",
    "account_type": "CURRENT ACCOUNT",
    "branch": "DUBAI WATERFRONT MARKET BRANCH",
    "statement_period_from": "24/01/2021",
    "statement_period_to": "24/01/2022"
  },
  "transactions": [
    {
      "transaction_date": "01/10/2021",
      "value_date": "01/10/2021",
      "cheque_ref_no": "029885F212741037",
      "description": "SMS CHARGES SEP-2021",
      "debit": "",
      "credit": "5.25",
      "balance": "45,414.53"
    },
    {
      "transaction_date": "01/10/2021",
      "value_date": "01/10/2021",
      "cheque_ref_no": "99700gp212745610",
      "description": "ELECTRON DEBIT CARD TRANSACTION: EPPCO SITE -36 DUBAI",
      "debit": "60.00",
      "credit": "",
      "balance": "45,429.28"
    },
    {
      "transact

## Nanonets

In [None]:
!pip install -U transformers

In [None]:
# # Use a pipeline as a high-level helper
# from transformers import pipeline

# pipe = pipeline("image-text-to-text", model="nanonets/Nanonets-OCR-s")
# messages = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG"},
#             {"type": "text", "text": "What animal is on the candy?"}
#         ]
#     },
# ]
# pipe(text=messages)

In [None]:
# Load model directly
from transformers import AutoProcessor, AutoModelForVision2Seq

# Load the Nanonets OCR processor and model.
# NOTE: this rebinds `processor` and `model`, replacing the Qwen objects created
# earlier in the notebook. No device or dtype argument is passed here.
processor = AutoProcessor.from_pretrained("nanonets/Nanonets-OCR-s")
model = AutoModelForVision2Seq.from_pretrained("nanonets/Nanonets-OCR-s")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# from PIL import Image
# from transformers import AutoTokenizer, AutoProcessor, AutoModelForImageTextToText

# model_path = "nanonets/Nanonets-OCR-s"

# model = AutoModelForImageTextToText.from_pretrained(
#     model_path,
#     torch_dtype="auto",
#     device_map="auto",
#     attn_implementation="flash_attention_2"
# )
# model.eval()

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(model_path)
# processor = AutoProcessor.from_pretrained(model_path)


# def ocr_page_with_nanonets_s(image_path, model, processor, max_new_tokens=4096):
#     prompt = """Extract the text from the above document as if you were reading it naturally. Return the tables in html format. Return the equations in LaTeX representation. If there is an image in the document and image caption is not present, add a small description of the image inside the <img></img> tag; otherwise, add the image caption inside <img></img>. Watermarks should be wrapped in brackets. Ex: <watermark>OFFICIAL COPY</watermark>. Page numbers should be wrapped in brackets. Ex: <page_number>14</page_number> or <page_number>9/22</page_number>. Prefer using ☐ and ☑ for check boxes."""
#     image = Image.open(image_path)
#     messages = [
#         {"role": "system", "content": "You are a helpful assistant."},
#         {"role": "user", "content": [
#             {"type": "image", "image": f"file://{image_path}"},
#             {"type": "text", "text": prompt},
#         ]},
#     ]
#     text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#     inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")
#     inputs = inputs.to(model.device)

#     output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
#     generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]

#     output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
#     return output_text[0]

# image_path = "/path/to/your/document.jpg"
# result = ocr_page_with_nanonets_s(image_path, model, processor, max_new_tokens=15000)
# print(result)
