In [1]:
import cv2
import pandas
import os

### Document parsing

In [2]:
import re
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch

processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
print(device)



cuda


In [3]:
images = os.listdir("train_subset_images")

for image in images:
    image_path = os.path.join("train_subset_images", image)
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
image = cv2.imread(os.path.join("train_subset_images", '41XoZSwXJ8L.jpg'))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

print(type(image))
print(image_path)

<class 'numpy.ndarray'>
train_subset_images/51V9-NgHwFL.jpg


In [4]:
# prepare decoder inputs
task_prompt = "<s_cord-v2>"
decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

pixel_values = processor(image, return_tensors="pt").pixel_values

outputs = model.generate(
    pixel_values.to(device),
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=model.decoder.config.max_position_embeddings,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
)

sequence = processor.batch_decode(outputs.sequences)[0]
sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
print(processor.token2json(sequence))

{'menu': {'nm': 'Chain length: 50+5cm', 'unitprice': '23.5mm', 'cnt': '2', 'price': '26.5mm'}, 'sub_total': {'subtotal_price': '50+5cm'}, 'total': {'total_price': '23.5mm', 'cashprice': '15g'}}


### Document VQA

In [5]:
import re

from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch

processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")
model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa")

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

VisionEncoderDecoderModel(
  (encoder): DonutSwinModel(
    (embeddings): DonutSwinEmbeddings(
      (patch_embeddings): DonutSwinPatchEmbeddings(
        (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DonutSwinEncoder(
      (layers): ModuleList(
        (0): DonutSwinStage(
          (blocks): ModuleList(
            (0-1): 2 x DonutSwinLayer(
              (layernorm_before): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
              (attention): DonutSwinAttention(
                (self): DonutSwinSelfAttention(
                  (query): Linear(in_features=128, out_features=128, bias=True)
                  (key): Linear(in_features=128, out_features=128, bias=True)
                  (value): Linear(in_features=128, out_features=128, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
           

In [6]:
images = os.listdir("train_subset_images")

for image in images:
    image_path = os.path.join("train_subset_images", image)
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    
image = cv2.imread(os.path.join("train_subset_images", '41XoZSwXJ8L.jpg'))
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

print(type(image))
print(image_path)

<class 'numpy.ndarray'>
train_subset_images/51V9-NgHwFL.jpg


In [7]:
# prepare decoder inputs
task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
# question = "What is the weight of the item (units: g, kg, lbs)?"
question = "What is weight?"
prompt = task_prompt.replace("{user_input}", question)
decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids

pixel_values = processor(image, return_tensors="pt").pixel_values

outputs = model.generate(
    pixel_values.to(device),
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=model.decoder.config.max_position_embeddings,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
)

sequence = processor.batch_decode(outputs.sequences)[0]
sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
print(processor.token2json(sequence))

{'question': 'What is weight?', 'answer': '15g'}


In [8]:
processor.tokenizer.decode(outputs.sequences[0])

# split numerical and alphabetic values
import re

def split_numerical_alphabetic(text):
    # This function splits the text into numerical and alphabetic parts
    # using regular expressions.
    # It returns a tuple where the first element is the numerical part and the second is the alphabetic part.
    # If there are no numerical or alphabetic parts, the corresponding tuple element will be an empty string.
    # Regular expression to match numerical and alphabetic parts
    num_pattern = r'\d+(\.\d+)?|\d+'
    alpha_pattern = r'[a-zA-Z]+'
    
    # Find all matches for numerical and alphabetic parts
    num_matches = re.findall(num_pattern, text)
    alpha_matches = re.findall(alpha_pattern, text)
    return {
        "numerical": num_matches,
        "alphabetic": alpha_matches
    }
    # Regular expression to match numerical and alphabetic parts
    
print(split_numerical_alphabetic("232 g")['numerical'])
print(split_numerical_alphabetic("232 g")['alphabetic'])

['']
['g']
