## Paddle OCR

In [1]:
from paddleocr import PaddleOCR
import json

[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [None]:
ocr = PaddleOCR(use_textline_orientation=True, lang='en',use_doc_orientation_classify=True, use_doc_unwarping=True)

In [None]:
result = ocr.predict("../data/image_sample_invoice.png")
result

In [None]:
ocr_outout = result[0]
lines = ocr_outout['rec_texts']
scores = ocr_outout['rec_scores']
# Build a list of lines with confidence > threshold (optional)
CONF_THRESHOLD = 0.7
filtered_lines = [line for line, score in zip(lines, scores) if score >= CONF_THRESHOLD]

print("Filtered Lines:")
for l in filtered_lines:
    print("-", l)

In [None]:
import re

text_blob = "\n".join(filtered_lines)

def extract_invoice_fields(text):
    def find(pattern):
        m = re.search(pattern, text, re.IGNORECASE)
        return m.group(1).strip() if m else ""

    return {
        "vendor": filtered_lines[0] if filtered_lines else "",  # first line often vendor
        "invoice_no": find(r"invoice\s*no[:\s]*([A-Z0-9\-]+)"),
        "invoice_date": find(r"date[:\s]*([0-9\/\-]+)"),
        "tax": find(r"tax.*?([\d,]+\.\d{2})"),
        "total": find(r"total[:\s]*([\d,]+\.\d{2})"),
        "payable_to": find(r"payable\s*to[:\s]*(.+)"),
        "bank_account": find(r"bank\s*account[:\s]*(\d+)")
    }

fields = extract_invoice_fields(text_blob)
print(fields)


## Structured Extraction

In [1]:
from paddleocr import PPStructureV3, PaddleOCR
import cv2
from PIL import Image
import os

[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [None]:
engine = PPStructureV3(lang='en')

In [None]:
img_path = "../data/image_sample_invoice.png"
output = engine.predict(img_path)

In [None]:
output

In [None]:
output_dir = "./invoice_output"
os.makedirs(output_dir, exist_ok=True)

markdown_list = []

for res in output:
    res.print()
    res.save_to_json(save_path=output_dir)
    res.save_to_markdown(save_path=output_dir)
    res.save_to_img(save_path=output_dir)
    md_info = res.markdown
    markdown_list.append(md_info)
print("\nDone! Check folder:", output_dir)

## Parsing 

### Rule based

In [2]:
data['overall_ocr_res']['rec_texts']

['ABCTradersPvtLtd',
 '123 Business St,',
 'Cityville,CA90210',
 'info@abctraders.com',
 'INVOICE',
 'Invoice No:INV-223',
 'Date:12/01/2026',
 'Description',
 'Quantity',
 'Unit Price',
 'Amount',
 'Office Supplies',
 '5',
 '300.00',
 '1,500.00',
 'Printer Cartridge',
 '2',
 '425.00',
 '850.00',
 'Subtotal:',
 '2,350.00',
 'Tax(6%):',
 '141.00',
 'Total:',
 '2,491.00',
 'Payable To: ABC Traders Pvt Ltd',
 'Bank Account:1234567890']

In [1]:
import json

json_path = "../notebook/invoice_output/image_sample_invoice_res.json"
with open(json_path, "r") as f:
    data = json.load(f)

rec_texts = data['overall_ocr_res']['rec_texts']

extracted = {
    'vendor_name': None,
    'invoice_number': None,
    'invoice_date': None,
    'tax_amount': None,
    'total_amount': None
}

for line in rec_texts:
    line = line.strip()
    lower = line.lower()
    
    if 'payable to' in lower:
        extracted['vendor_name'] = line.split(':', 1)[1].strip() if ':' in line else line.replace('Payable To', '').strip()
    elif 'invoice no' in lower:
        extracted['invoice_number'] = line.split(':', 1)[1].strip() if ':' in line else None
    elif 'date:' in lower:
        extracted['invoice_date'] = line.split(':', 1)[1].strip() if ':' in line else None
    elif 'tax' in lower and '%' in lower:
        extracted['tax_amount'] = line.split(':', 1)[1].strip() if ':' in line else None
    elif 'total:' in lower:
        extracted['total_amount'] = line.split(':', 1)[1].strip() if ':' in line else None

# Fallback for vendor if "Payable To" missed
if not extracted['vendor_name']:
    for line in rec_texts:
        if 'abc traders' in line.lower():
            extracted['vendor_name'] = line.strip()
            break

print(extracted)

{'vendor_name': 'ABC Traders Pvt Ltd', 'invoice_number': 'INV-223', 'invoice_date': '12/01/2026', 'tax_amount': '', 'total_amount': ''}


In [7]:
import pandas as pd
json_path = "../notebook/invoice_output/image_sample_invoice_res.json"
with open(json_path, "r") as f:
    data = json.load(f)
def invoice_table_to_dataframe(json_data):
    table_block = next((b for b in json_data.get("parsing_res_list", []) if b["block_label"] == "table"), None)
    if not table_block or not table_block.get("block_content"):
        return pd.DataFrame()
    df = pd.read_html(table_block["block_content"])[0]
    if df.iloc[0].isna().sum() == 0:
        df.columns = df.iloc[0]
        df = df[1:].reset_index(drop=True)
    return df

df = invoice_table_to_dataframe(data)

In [8]:
df

Unnamed: 0,Description,Quantity,Unit Price,Amount
0,Office Supplies,5,300.0,1500.0
1,Printer Cartridge,2,425.0,850.0
