In [None]:
import torch
print(torch.__version__)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

In [None]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import json

In [None]:
# model_name = "naver-clova-ix/donut-base"  # base pre-trained Donut
model_name = "naver-clova-ix/donut-base-finetuned-cord-v2"
processor = DonutProcessor.from_pretrained(model_name)
model = VisionEncoderDecoderModel.from_pretrained(model_name, use_safetensors=True).to(device)
model.eval()

In [None]:
invoice_path = "../data/image_sample_invoice.jpeg"
image = Image.open(invoice_path).convert("RGB")

In [None]:
task_prompt = """
<s_invoice>
{
  "vendor": "",
  "invoice_no": "",
  "invoice_date": "",
  "tax": "",
  "total": ""
}
</s_invoice>
"""

decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids.to(device)
pixel_values = processor(image, return_tensors="pt").pixel_values.to(device)

In [None]:
with torch.no_grad():
    outputs = model.generate(
        pixel_values,
        decoder_input_ids=decoder_input_ids,
        max_length=512,
        num_beams=3,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id
    )

    
# pred_str = processor.batch_decode(outputs, skip_special_tokens=True)[0]

In [None]:
pred_str = processor.batch_decode(outputs, skip_special_tokens=True)[0]

# Remove task tokens if present
pred_str = pred_str.replace("<s_invoice>", "").replace("</s_invoice>", "").strip()

print(pred_str)

## Paddle OCR

In [2]:
from paddleocr import PaddleOCR
import json

[33mChecking connectivity to the model hosters, this may take a while. To bypass this check, set `PADDLE_PDX_DISABLE_MODEL_SOURCE_CHECK` to `True`.[0m


In [3]:
ocr = PaddleOCR(
    use_angle_cls=True,
    use_gpu=False,
    lang="en"
)

  ocr = PaddleOCR(


ValueError: Unknown argument: use_gpu

In [None]:
result = ocr.ocr("../data/image_sample_invoice.jpeg", cls=True)
result

In [None]:
# Flatten OCR output
lines = []
for page in result:
    for line in page:
        text = line[1][0]
        confidence = line[1][1]
        bbox = line[0]
        lines.append({
            "text": text,
            "confidence": confidence,
            "bbox": bbox
        })

print(json.dumps(lines, indent=2))


In [None]:
from paddleocr import PaddleOCR

# paddleocr_bp = Blueprint('paddleocr', __name__)

def process_paddleocr():
    ocr = PaddleOCR(
        det_model_dir='paddle_models/en_PP-OCRv3_det_infer',
        rec_model_dir='paddle_models/en_PP-OCRv3_rec_infer',
        cls_model_dir='paddle_models/ch_ppocr_mobile_v2.0_cls_infer',
        use_angle_cls=True,
        lang='en'
    )
    img = load_image()
    ocr_method = 'PaddleOCR'

    start_time_recognition = time.time()
    result = ocr.ocr("../data/image_sample_invoice.jpeg", cls=True)
    recognition_time = time.time() - start_time_recognition

    average_confidence, text = process_paddleocr_text(result)

    start_time_parsing = time.time()
    parsed_data = parse_text(text)
    parsing_time = time.time() - start_time_parsing

    response = {
        'text': text,
        'parsed_data': parsed_data,
        'time': {
            'recognition': recognition_time,
            'parsing': parsing_time,
        },
        'average_confidence': average_confidence * 100
    }

    if check_if_invoice(parsed_data):
        pdf_file, image_file = get_files_from_request()
        invoice_id = add_invoice_to_db(parsed_data, text, pdf_file, image_file,
                                       average_confidence * 100, recognition_time, parsing_time, ocr_method)
        response['invoice_id'] = invoice_id

    return jsonify(response)

process_paddleocr()