## Donut OCR

In [None]:
import torch
print(torch.__version__)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import json

In [None]:
# model_name = "naver-clova-ix/donut-base"  # base pre-trained Donut
model_name = "naver-clova-ix/donut-base-finetuned-cord-v2"
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name, use_safetensors=True).to(device)
model.eval()

In [None]:
invoice_path = "../data/image_sample_invoice.jpeg"
image = Image.open(invoice_path).convert("RGB")

In [None]:
task_prompt = """
<s_invoice>
{
  "vendor": "",
  "invoice_no": "",
  "invoice_date": "",
  "tax": "",
  "total": ""
}
</s_invoice>
"""

decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids.to(device)
pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

In [None]:
with torch.no_grad():
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=512,
        num_beams=3,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id
    )

    
# pred_str = processor.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
pred_str = processor.batch_decode(outputs, skip_special_tokens=True)[0]

# Remove task tokens if present
pred_str = pred_str.replace("<s_invoice>", "").replace("</s_invoice>", "").strip()

print(pred_str)

## Paddle OCR

In [1]:
from paddleocr import PaddleOCR
import json

[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [None]:
ocr = PaddleOCR(use_textline_orientation=True, lang='en',use_doc_orientation_classify=True, use_doc_unwarping=True)

In [None]:
result = ocr.predict("../data/image_sample_invoice.png")
result

In [None]:
ocr_outout = result[0]
lines = ocr_outout['rec_texts']
scores = ocr_outout['rec_scores']
# Build a list of lines with confidence > threshold (optional)
CONF_THRESHOLD = 0.7
filtered_lines = [line for line, score in zip(lines, scores) if score >= CONF_THRESHOLD]

print("Filtered Lines:")
for l in filtered_lines:
    print("-", l)

In [None]:
import re

text_blob = "\n".join(filtered_lines)

def extract_invoice_fields(text):
    def find(pattern):
        m = re.search(pattern, text, re.IGNORECASE)
        return m.group(1).strip() if m else ""

    return {
        "vendor": filtered_lines[0] if filtered_lines else "",  # first line often vendor
        "invoice_no": find(r"invoice\s*no[:\s]*([A-Z0-9\-]+)"),
        "invoice_date": find(r"date[:\s]*([0-9\/\-]+)"),
        "tax": find(r"tax.*?([\d,]+\.\d{2})"),
        "total": find(r"total[:\s]*([\d,]+\.\d{2})"),
        "payable_to": find(r"payable\s*to[:\s]*(.+)"),
        "bank_account": find(r"bank\s*account[:\s]*(\d+)")
    }

fields = extract_invoice_fields(text_blob)
print(fields)


## Structured Extraction

In [1]:
from paddleocr import PPStructureV3, PaddleOCR
import cv2
from PIL import Image
import os

[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [None]:
engine = PPStructureV3(lang='en')

In [None]:
# 2. Run prediction on your image
img_path = "../data/image_sample_invoice.png"  # or .jpg / .pdf
output = engine.predict(img_path)

In [None]:
output

In [None]:
# 3. Process & print/save results
output_dir = "./invoice_output"
os.makedirs(output_dir, exist_ok=True)

markdown_list = []   # for combining multi-page if needed later

for res in output:
    # Print structured output to console (shows regions, text, tables, etc.)
    res.print()

    # Built-in save methods (replaces old draw_structure_result / save_structure_res)
    res.save_to_json(save_path=output_dir)       # JSON per result/page
    res.save_to_markdown(save_path=output_dir)   # Markdown (tables as | MD tables!)

    # If you want images/visualized boxes:
    res.save_to_img(save_path=output_dir)        # draws layout boxes on image

    # Optional: collect for multi-page Markdown concatenation (if PDF)
    md_info = res.markdown
    markdown_list.append(md_info)

# Example: If you have multiple pages/results, combine Markdown
# (Some versions have pipeline.concatenate_markdown_pages(markdown_list) — try if needed)
print("\nDone! Check folder:", output_dir)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup  # pip install beautifulsoup4 lxml if needed

for res in output:
    if hasattr(res, 'markdown') and 'html' in res.markdown:  # or check res.get('table_html')
        html = res.markdown.get('html', '')  # some versions expose HTML for tables
        if html:
            try:
                df_list = pd.read_html(html)
                for idx, df in enumerate(df_list):
                    print(f"\nTable {idx+1}:\n", df)
                    df.to_csv(os.path.join(output_dir, f"table_{idx}.csv"), index=False)
            except:
                print("No parseable HTML table found in this region.")

## Parsing 

### Rule based

In [2]:
data['overall_ocr_res']['rec_texts']

['ABCTradersPvtLtd',
 '123 Business St,',
 'Cityville,CA90210',
 'info@abctraders.com',
 'INVOICE',
 'Invoice No:INV-223',
 'Date:12/01/2026',
 'Description',
 'Quantity',
 'Unit Price',
 'Amount',
 'Office Supplies',
 '5',
 '300.00',
 '1,500.00',
 'Printer Cartridge',
 '2',
 '425.00',
 '850.00',
 'Subtotal:',
 '2,350.00',
 'Tax(6%):',
 '141.00',
 'Total:',
 '2,491.00',
 'Payable To: ABC Traders Pvt Ltd',
 'Bank Account:1234567890']

In [1]:
import json

json_path = "../notebook/invoice_output/image_sample_invoice_res.json"
with open(json_path, "r") as f:
    data = json.load(f)

rec_texts = data['overall_ocr_res']['rec_texts']

extracted = {
    'vendor_name': None,
    'invoice_number': None,
    'invoice_date': None,
    'tax_amount': None,
    'total_amount': None
}

for line in rec_texts:
    line = line.strip()
    lower = line.lower()
    
    if 'payable to' in lower:
        extracted['vendor_name'] = line.split(':', 1)[1].strip() if ':' in line else line.replace('Payable To', '').strip()
    elif 'invoice no' in lower:
        extracted['invoice_number'] = line.split(':', 1)[1].strip() if ':' in line else None
    elif 'date:' in lower:
        extracted['invoice_date'] = line.split(':', 1)[1].strip() if ':' in line else None
    elif 'tax' in lower and '%' in lower:
        extracted['tax_amount'] = line.split(':', 1)[1].strip() if ':' in line else None
    elif 'total:' in lower:
        extracted['total_amount'] = line.split(':', 1)[1].strip() if ':' in line else None

# Fallback for vendor if "Payable To" missed
if not extracted['vendor_name']:
    for line in rec_texts:
        if 'abc traders' in line.lower():
            extracted['vendor_name'] = line.strip()
            break

print(extracted)

{'vendor_name': 'ABC Traders Pvt Ltd', 'invoice_number': 'INV-223', 'invoice_date': '12/01/2026', 'tax_amount': '', 'total_amount': ''}


In [7]:
import pandas as pd
json_path = "../notebook/invoice_output/image_sample_invoice_res.json"
with open(json_path, "r") as f:
    data = json.load(f)
def invoice_table_to_dataframe(json_data):
    table_block = next((b for b in json_data.get("parsing_res_list", []) if b["block_label"] == "table"), None)
    if not table_block or not table_block.get("block_content"):
        return pd.DataFrame()
    df = pd.read_html(table_block["block_content"])[0]
    if df.iloc[0].isna().sum() == 0:
        df.columns = df.iloc[0]
        df = df[1:].reset_index(drop=True)
    return df

df = invoice_table_to_dataframe(data)

In [8]:
df

Unnamed: 0,Description,Quantity,Unit Price,Amount
0,Office Supplies,5,300.0,1500.0
1,Printer Cartridge,2,425.0,850.0


In [1]:
import json
from PIL import Image
import torch
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

In [2]:
json_path = "../notebook/invoice_output/image_sample_invoice_res.json"
with open(json_path, "r") as f:
    data = json.load(f)

In [3]:
ocr_res = data["overall_ocr_res"]
words = ocr_res["rec_texts"]
boxes = ocr_res["rec_boxes"]  # list of [x0, y0, x1, y1]

width = data["width"]
height = data["height"] 


print(f"\n ocr_res {ocr_res}")
print(f"\n words {words}")
print(f"\n boxes {boxes}")
print(f"\n width {width}")
print(f"\n height {height}")


 ocr_res {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_preprocessor': False, 'use_textline_orientation': True}, 'dt_polys': [[[67, 78], [381, 78], [381, 106], [67, 106]], [[62, 118], [221, 120], [221, 143], [62, 141]], [[63, 146], [247, 146], [247, 169], [63, 169]], [[60, 172], [253, 175], [253, 195], [60, 192]], [[468, 200], [591, 200], [591, 229], [468, 229]], [[409, 259], [599, 257], [599, 280], [409, 282]], [[454, 287], [602, 285], [602, 307], [454, 309]], [[64, 340], [177, 340], [177, 366], [64, 366]], [[350, 338], [431, 338], [431, 364], [350, 364]], [[441, 339], [529, 337], [530, 360], [441, 362]], [[541, 339], [616, 339], [616, 358], [541, 358]], [[66, 381], [208, 381], [208, 406], [66, 406]], [[381, 377], [400, 377], [400, 402], [381, 402]], [[450, 377], [516, 377], [516, 400], [450, 400]], [[538, 377], [620, 375], [620, 398], [539, 400]], [[66, 422], [223, 422], [223, 447], [66, 447]], [[380, 418], [403, 418], [403, 444], [380, 444]], [[451, 418], [517

In [4]:
# Normalize boxes to [0, 1000] scale (LayoutLMv3 requirement)
normalized_boxes = []
for box in boxes:
    x0, y0, x1, y1 = box
    norm_box = [
        int(round(x0 / width * 1000)),
        int(round(y0 / height * 1000)),
        int(round(x1 / width * 1000)),
        int(round(y1 / height * 1000))
    ]
    normalized_boxes.append(norm_box)

In [5]:
normalized_boxes

[[99, 98, 560, 132],
 [91, 148, 325, 179],
 [93, 182, 363, 211],
 [88, 215, 372, 244],
 [688, 250, 869, 286],
 [601, 321, 881, 352],
 [668, 356, 885, 386],
 [94, 425, 260, 458],
 [515, 422, 634, 455],
 [649, 421, 779, 452],
 [796, 424, 906, 448],
 [97, 476, 306, 507],
 [560, 471, 588, 502],
 [662, 471, 759, 500],
 [791, 469, 912, 500],
 [97, 528, 328, 559],
 [559, 522, 593, 555],
 [663, 522, 760, 551],
 [815, 520, 915, 549],
 [466, 622, 606, 659],
 [751, 618, 896, 654],
 [465, 674, 612, 714],
 [788, 669, 901, 701],
 [466, 728, 562, 761],
 [746, 716, 904, 758],
 [112, 810, 550, 851],
 [118, 865, 497, 900]]

In [6]:
# Load image (same as used in OCR)
image_path = "../data/image_sample_invoice.png"
image = Image.open(image_path).convert("RGB")

In [8]:
# Load the fine-tuned model and processor
model_name = "Theivaprakasham/layoutlmv3-finetuned-invoice"
processor = LayoutLMv3Processor.from_pretrained(model_name)
model = LayoutLMv3ForTokenClassification.from_pretrained(model_name,  use_safetensors=True)

Loading weights:   0%|          | 0/216 [00:00<?, ?it/s]

LayoutLMv3ForTokenClassification LOAD REPORT from: Theivaprakasham/layoutlmv3-finetuned-invoice
Key                                | Status     |  | 
-----------------------------------+------------+--+-
layoutlmv3.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


In [9]:
# Prepare inputs (processor handles tokenization + image + boxes)
encoding = processor(
    image,
    words,
    boxes=normalized_boxes,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

In [10]:
# Inference
with torch.no_grad():
    outputs = model(**encoding)

In [11]:
# Get predictions (token-level labels)
logits = outputs.logits
predictions = logits.argmax(-1).squeeze().tolist()

In [12]:
# Get label map
id2label = model.config.id2label
labels = [id2label[pred] for pred in predictions]

In [25]:
print(labels)
print("\n",predictions)

['O', 'B-BILLER', 'B-BILLER', 'B-BILLER', 'B-BILLER', 'B-BILLER', 'I-BILLER_ADDRESS', 'I-BILLER_ADDRESS', 'B-BILLER', 'O', 'I-BILLER_ADDRESS', 'O', 'O', 'O', 'O', 'O', 'B-BILLER_POST_CODE', 'B-BILLER_POST_CODE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-DUE_DATE', 'B-DUE_DATE', 'B-DUE_DATE', 'B-DUE_DATE', 'B-DUE_DATE', 'B-DUE_DATE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GST', 'O', 'O', 'B-SUBTOTAL', 'B-SUBTOTAL', 'B-GST', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GST', 'O', 'O', 'B-GST', 'O', 'O', 'O', 'O', 'O', 'B-SUBTOTAL', 'B-SUBTOTAL', 'B-GST', 'B-GST', 'O', 'O', 'O', 'O', 'O', 'O', 'B-GST', 'O', 'O', 'O', 'O', 'B-TOTAL', 'B-TOTAL', 'B-GST', 'B-TOTAL', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-BILLER_POST_CODE', 'B-BILLER_POST_CODE', 'B-BILLER_POST_CODE', 'B-BILLER_POST_CODE', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 

In [26]:
print(id2label)

{0: 'O', 1: 'B-ABN', 2: 'B-BILLER', 3: 'B-BILLER_ADDRESS', 4: 'B-BILLER_POST_CODE', 5: 'B-DUE_DATE', 6: 'B-GST', 7: 'B-INVOICE_DATE', 8: 'B-INVOICE_NUMBER', 9: 'B-SUBTOTAL', 10: 'B-TOTAL', 11: 'I-BILLER_ADDRESS'}


In [27]:
# Stricter BIO grouping: reset on any non-matching I- or O
entities = {}
current_entity = None
current_text = []

for word, label in zip(words, labels):
    if label == 'O':
        if current_entity:
            field = current_entity[2:]  # e.g. BILLER
            if field not in entities:
                entities[field] = []
            entities[field].append(" ".join(current_text).strip())
        current_entity = None
        current_text = []
        continue

    prefix, field_name = label.split('-', 1) if '-' in label else (None, None)

    if prefix == 'B':
        # Save previous if open
        if current_entity:
            prev_field = current_entity[2:]
            if prev_field not in entities:
                entities[prev_field] = []
            entities[prev_field].append(" ".join(current_text).strip())

        current_entity = label
        current_text = [word]

    elif prefix == 'I' and current_entity and field_name == current_entity[2:]:
        current_text.append(word)

    else:
        # Unexpected → save and reset
        if current_entity:
            field = current_entity[2:]
            if field not in entities:
                entities[field] = []
            entities[field].append(" ".join(current_text).strip())
        current_entity = None
        current_text = []

# Last entity
if current_entity:
    field = current_entity[2:]
    if field not in entities:
        entities[field] = []
    entities[field].append(" ".join(current_text).strip())

# Map (take first/best occurrence, or join all)
extracted = {
    "Vendor Name": entities.get("BILLER", [""])[0] if entities.get("BILLER") else "",  # take first only
    "Invoice Number": entities.get("INVOICE_NUMBER", [""])[0] if entities.get("INVOICE_NUMBER") else "",
    "Invoice Date": entities.get("INVOICE_DATE", [""])[0] if entities.get("INVOICE_DATE") else "",
    "Tax Amount": entities.get("GST", [""])[0] if entities.get("GST") else "",
    "Total Amount": entities.get("TOTAL", [""])[0] if entities.get("TOTAL") else "",
    "Sub Total": entities.get("SUBTOTAL", [""])[0] if entities.get("SUBTOTAL") else "",
}

print("Extracted Fields:")
for k, v in extracted.items():
    print(f"- {k}: {v.strip()}")

print("\nAll Detected Entities (debug):")
for f, lst in entities.items():
    print(f"{f}: {lst}")

Extracted Fields:
- Vendor Name: 123 Business St,
- Invoice Number: 
- Invoice Date: 
- Tax Amount: 
- Total Amount: 
- Sub Total: 

All Detected Entities (debug):
BILLER: ['123 Business St,', 'Cityville,CA90210', 'info@abctraders.com', 'INVOICE', 'Invoice No:INV-223', 'Quantity']
BILLER_POST_CODE: ['2', '425.00']


## Second approach

In [16]:
import json
import torch
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

In [None]:
from transformers import LayoutLMv3TokenizerFast, LayoutLMv3ImageProcessor

tokenizer = LayoutLMv3TokenizerFast.from_pretrained(MODEL_NAME)
image_processor = LayoutLMv3ImageProcessor.from_pretrained(MODEL_NAME, apply_ocr=False)


In [19]:
MODEL_NAME = "microsoft/layoutlmv3-base-finetuned-funsd"

processor = LayoutLMv3Processor.from_pretrained(
    MODEL_NAME, apply_ocr=False, trust_remote_code=False
)
model = LayoutLMv3ForTokenClassification.from_pretrained(MODEL_NAME)
model.eval()

# processor = LayoutLMv3Processor.from_pretrained(
#     MODEL_NAME,
#     apply_ocr=False,
#     trust_remote_code=False
# )

RepositoryNotFoundError: 404 Client Error. (Request ID: Root=1-697c46f3-7b51e2342ce01be635db391c;dbe05043-38b2-4cea-8c58-8cf68e9e4669)

Repository Not Found for url: https://huggingface.co/api/models/microsoft/layoutlmv3-base-finetuned-funsd/tree/main/additional_chat_templates?recursive=false&expand=false.
Please make sure you specified the correct `repo_id` and `repo_type`.
If you are trying to access a private or gated repo, make sure you are authenticated. For more details, see https://huggingface.co/docs/huggingface_hub/authentication

In [None]:




id2label = model.config.id2label


# -----------------------------
# 2. CONVERT PP-STRUCTURE JSON
# -----------------------------
def ppstructure_to_layoutlm(pp_json, image_size):
    """
    pp_json: PP-StructureV3 JSON output
    image_size: (width, height)
    """
    words = []
    boxes = []

    img_w, img_h = image_size

    for block in pp_json:
        if block.get("type") in ["text", "table"]:
            for line in block.get("res", []):
                text = line["text"].strip()
                if not text:
                    continue

                x1, y1, x2, y2 = line["bbox"]

                # normalize to 0–1000 (LayoutLM requirement)
                box = [
                    int(1000 * x1 / img_w),
                    int(1000 * y1 / img_h),
                    int(1000 * x2 / img_w),
                    int(1000 * y2 / img_h),
                ]

                words.append(text)
                boxes.append(box)

    return words, boxes


# -----------------------------
# 3. RUN INFERENCE
# -----------------------------
def extract_fields(pp_json, image, image_size):
    words, boxes = ppstructure_to_layoutlm(pp_json, image_size)

    encoding = processor(
        image,
        words,
        boxes=boxes,
        return_tensors="pt",
        truncation=True,
        padding="max_length"
    )

    with torch.no_grad():
        outputs = model(**encoding)

    predictions = outputs.logits.argmax(-1).squeeze().tolist()
    tokens = processor.tokenizer.convert_ids_to_tokens(
        encoding["input_ids"].squeeze()
    )

    results = []
    for token, pred in zip(tokens, predictions):
        label = id2label[pred]
        if label != "O" and not token.startswith("##"):
            results.append((token, label))

    return results


# -----------------------------
# 4. USAGE
# -----------------------------
"""
pp_structure_json = json.load(open("ppstructure_output.json"))
image = PIL.Image.open("invoice.png")
image_size = image.size  # (width, height)
"""

# result = extract_fields(pp_structure_json, image, image_size)
# print(result)

"""
OUTPUT EXAMPLE:
[
  ('Invoice', 'B-QUESTION'),
  ('INV-12345', 'B-ANSWER'),
  ('Total', 'B-QUESTION'),
  ('$1234.50', 'B-ANSWER')
]
"""
