## Donut OCR

In [None]:
import torch
print(torch.__version__)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import json

In [None]:
# model_name = "naver-clova-ix/donut-base"  # base pre-trained Donut
model_name = "naver-clova-ix/donut-base-finetuned-cord-v2"
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name, use_safetensors=True).to(device)
model.eval()

In [None]:
invoice_path = "../data/image_sample_invoice.jpeg"
image = Image.open(invoice_path).convert("RGB")

In [None]:
task_prompt = """
<s_invoice>
{
  "vendor": "",
  "invoice_no": "",
  "invoice_date": "",
  "tax": "",
  "total": ""
}
</s_invoice>
"""

decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids.to(device)
pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

In [None]:
with torch.no_grad():
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=512,
        num_beams=3,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id
    )

    
# pred_str = processor.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
pred_str = processor.batch_decode(outputs, skip_special_tokens=True)[0]

# Remove task tokens if present
pred_str = pred_str.replace("<s_invoice>", "").replace("</s_invoice>", "").strip()

print(pred_str)

## Paddle OCR

In [1]:
from paddleocr import PaddleOCR
import json

[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [27]:
ocr = PaddleOCR(use_textline_orientation=True, lang='en',use_doc_orientation_classify=True, use_doc_unwarping=True)

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('en_PP-OCRv5_mobile_rec', None)[0m
[32mModel files already exist. Using cached

In [29]:
result = ocr.predict("../data/image_sample_invoice.png")
result

[{'input_path': '../data/image_sample_invoice.png',
  'page_index': None,
  'doc_preprocessor_res': {'input_path': None,
   'page_index': None,
   'input_img': array([[[254, ..., 254],
           ...,
           [254, ..., 254]],
   
          ...,
   
          [[254, ..., 254],
           ...,
           [254, ..., 254]]], shape=(800, 680, 3), dtype=uint8),
   'model_settings': {'use_doc_orientation_classify': True,
    'use_doc_unwarping': True},
   'angle': 0,
   'rot_img': array([[[254, ..., 254],
           ...,
           [254, ..., 254]],
   
          ...,
   
          [[254, ..., 254],
           ...,
           [254, ..., 254]]], shape=(800, 680, 3), dtype=uint8),
   'output_img': array([[[ 44, ...,  41],
           ...,
           [153, ..., 200]],
   
          ...,
   
          [[ 70, ..., 146],
           ...,
           [ 57, ..., 111]]], shape=(800, 680, 3), dtype=uint8)},
  'dt_polys': [array([[ 65,  77],
          ...,
          [ 65, 107]], shape=(4, 2), dtype=int

In [16]:
ocr_outout = result[0]
lines = ocr_outout['rec_texts']
scores = ocr_outout['rec_scores']
# Build a list of lines with confidence > threshold (optional)
CONF_THRESHOLD = 0.7
filtered_lines = [line for line, score in zip(lines, scores) if score >= CONF_THRESHOLD]

print("Filtered Lines:")
for l in filtered_lines:
    print("-", l)

Filtered Lines:
- ABC Traders Pvt Ltd
- 123 Business St,
- Cityville, CA 90210
- info@abctraders.com
- INVOICE
- Invoice No:INV-223
- Date: 12/01/2026
- Description
- Quantity
- Unit Price
- Amount
- Office Supplies
- 5
- 300.00
- 1,500.00
- Printer Cartridge
- 2
- 425.00
- 850.00
- Subtotal:
- 2,350.00
- Tax (6%):
- 141.00
- Total:
- 2,491.00
- Payable To: ABC Traders Pvt Ltd
- Bank Account: 1234567890


In [14]:
import re

text_blob = "\n".join(filtered_lines)

def extract_invoice_fields(text):
    def find(pattern):
        m = re.search(pattern, text, re.IGNORECASE)
        return m.group(1).strip() if m else ""

    return {
        "vendor": filtered_lines[0] if filtered_lines else "",  # first line often vendor
        "invoice_no": find(r"invoice\s*no[:\s]*([A-Z0-9\-]+)"),
        "invoice_date": find(r"date[:\s]*([0-9\/\-]+)"),
        "tax": find(r"tax.*?([\d,]+\.\d{2})"),
        "total": find(r"total[:\s]*([\d,]+\.\d{2})"),
        "payable_to": find(r"payable\s*to[:\s]*(.+)"),
        "bank_account": find(r"bank\s*account[:\s]*(\d+)")
    }

fields = extract_invoice_fields(text_blob)
print(fields)


{'vendor': 'ABC Traders Pvt Ltd', 'invoice_no': 'INV-223', 'invoice_date': '12/01/2026', 'tax': '', 'total': '2,350.00', 'payable_to': 'ABC Traders Pvt Ltd', 'bank_account': '1234567890'}


## Structured Extraction

In [31]:
from paddleocr import PPStructureV3, PaddleOCR
import cv2
from PIL import Image
import os

In [32]:
engine = PPStructureV3( lang='en')

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\UVDoc`.[0m
[32mCreating model: ('PP-DocBlockLayout', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-DocBlockLayout`.[0m
[32mCreating model: ('PP-DocLayout_plus-L', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-DocLayout_plus-L`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To red

In [33]:
# 2. Run prediction on your image
img_path = "../data/image_sample_invoice.png"  # or .jpg / .pdf
output = engine.predict(img_path)

[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-LCNet_x1_0_doc_ori`.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-LCNet_x1_0_textline_ori`.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-OCRv5_server_det`.[0m
[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mModel files already exist. Using cached files. To redownload, please delete the directory manually: `C:\Users\Bilal\.paddlex\official_models\PP-OCRv5_server_rec`.[0m


In [34]:
output

[{'input_path': '../data/image_sample_invoice.png',
  'page_index': None,
  'page_count': None,
  'width': 680,
  'height': 800,
  'doc_preprocessor_res': {'input_path': None,
   'page_index': None,
   'input_img': array([[[254, ..., 254],
           ...,
           [254, ..., 254]],
   
          ...,
   
          [[254, ..., 254],
           ...,
           [254, ..., 254]]], shape=(800, 680, 3), dtype=uint8),
   'model_settings': {'use_doc_orientation_classify': True,
    'use_doc_unwarping': True},
   'angle': 0,
   'rot_img': array([[[254, ..., 254],
           ...,
           [254, ..., 254]],
   
          ...,
   
          [[254, ..., 254],
           ...,
           [254, ..., 254]]], shape=(800, 680, 3), dtype=uint8),
   'output_img': array([[[ 44, ...,  41],
           ...,
           [153, ..., 200]],
   
          ...,
   
          [[ 70, ..., 146],
           ...,
           [ 57, ..., 111]]], shape=(800, 680, 3), dtype=uint8)},
  'layout_det_res': {'input_path': None,

In [35]:
# 3. Process & print/save results
output_dir = "./invoice_output"
os.makedirs(output_dir, exist_ok=True)

markdown_list = []   # for combining multi-page if needed later

for res in output:
    # Print structured output to console (shows regions, text, tables, etc.)
    res.print()

    # Built-in save methods (replaces old draw_structure_result / save_structure_res)
    res.save_to_json(save_path=output_dir)       # JSON per result/page
    res.save_to_markdown(save_path=output_dir)   # Markdown (tables as | MD tables!)

    # If you want images/visualized boxes:
    res.save_to_img(save_path=output_dir)        # draws layout boxes on image

    # Optional: collect for multi-page Markdown concatenation (if PDF)
    md_info = res.markdown
    markdown_list.append(md_info)

# Example: If you have multiple pages/results, combine Markdown
# (Some versions have pipeline.concatenate_markdown_pages(markdown_list) â€” try if needed)
print("\nDone! Check folder:", output_dir)

[32m{'res': {'input_path': '../data/image_sample_invoice.png', 'page_index': None, 'page_count': None, 'width': 680, 'height': 800, 'model_settings': {'use_doc_preprocessor': True, 'use_seal_recognition': False, 'use_table_recognition': True, 'use_formula_recognition': True, 'use_chart_recognition': False, 'use_region_detection': True, 'format_block_content': False, 'markdown_ignore_labels': ['number', 'footnote', 'header', 'header_image', 'footer', 'footer_image', 'aside_text']}, 'parsing_res_list': [{'block_label': 'paragraph_title', 'block_content': 'ABCTradersPvtLtd ', 'block_bbox': [64, 78, 378, 105], 'block_id': 0, 'block_order': 1}, {'block_label': 'text', 'block_content': '123 Business St,Cityville,CA90210info@abctraders.com ', 'block_bbox': [59, 121, 252, 192], 'block_id': 1, 'block_order': 2}, {'block_label': 'paragraph_title', 'block_content': 'INVOICE ', 'block_bbox': [469, 201, 590, 226], 'block_id': 2, 'block_order': 3}, {'block_label': 'text', 'block_content': 'Invoice 


Done! Check folder: ./invoice_output


In [36]:
import pandas as pd
from bs4 import BeautifulSoup  # pip install beautifulsoup4 lxml if needed

for res in output:
    if hasattr(res, 'markdown') and 'html' in res.markdown:  # or check res.get('table_html')
        html = res.markdown.get('html', '')  # some versions expose HTML for tables
        if html:
            try:
                df_list = pd.read_html(html)
                for idx, df in enumerate(df_list):
                    print(f"\nTable {idx+1}:\n", df)
                    df.to_csv(os.path.join(output_dir, f"table_{idx}.csv"), index=False)
            except:
                print("No parseable HTML table found in this region.")