# Using a pretrained Donut model for Document Visual Question Answering (DocVQA)


This notebook requires 🤗 Transformers and SentencePiece.

#!pip install transformers[sentencepiece]

## Load libraries, model, and processor

In [2]:
import pandas as pd
from pathlib import Path
from PIL import Image

In [3]:
from transformers import DonutProcessor, VisionEncoderDecoderModel

# The processor and the model are loaded from the same pretrained checkpoint
# identifier, so it is spelled out once and reused for both calls.
DONUT_CHECKPOINT = "naver-clova-ix/donut-base-finetuned-docvqa"

processor = DonutProcessor.from_pretrained(DONUT_CHECKPOINT)
model = VisionEncoderDecoderModel.from_pretrained(DONUT_CHECKPOINT)

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [5]:
# Folder to read input files from, given relative to the current working directory.
image_folder = Path("./Images")

In [None]:
# Index every entry of `image_folder` by a running integer id.
# NOTE(review): the id -> path layout is an assumption about what this
# unfinished cell was meant to build — confirm before relying on it.
dict_file_menu = {
    i: file
    # sorted() makes the ids reproducible; iterdir() yields entries in arbitrary order.
    for i, file in enumerate(sorted(image_folder.iterdir()))
}

## Prepare using processor

We prepare the image for the model using `DonutProcessor`.

In [5]:
# `image` is not defined by any earlier cell, so it is loaded here; without
# this the cell raises NameError after Restart Kernel -> Run All.
# NOTE(review): this takes the first file (sorted by path) in `image_folder` —
# point it at the document you actually want to query.
image_files = sorted(path for path in image_folder.iterdir() if path.is_file())
if not image_files:
    raise FileNotFoundError(f"No files found in {image_folder}")

# Normalise to 3-channel RGB so grayscale or RGBA files are handled uniformly.
image = Image.open(image_files[0]).convert("RGB")

# The processor output exposes the model input tensor as `.pixel_values`;
# return_tensors="pt" requests PyTorch tensors.
pixel_values = processor(image, return_tensors="pt").pixel_values
print(pixel_values.shape)

torch.Size([1, 3, 2560, 1920])


## Generate

Finally, we let the model autoregressively generate the answer to the question.

In [6]:
import torch

# Use the GPU when one is available and move the model there up front.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Build the decoder prompt: the DocVQA task template with the question filled in.
task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
question = "When is the coffee break?"
prompt = task_prompt.replace("{user_input}", question)

# The prompt already carries its own task tokens, hence add_special_tokens=False.
encoded_prompt = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt")
decoder_input_ids = encoded_prompt["input_ids"]

# Generation settings, gathered in one place so they are easy to scan and tweak.
generation_kwargs = dict(
    decoder_input_ids=decoder_input_ids.to(device),
    max_length=model.decoder.config.max_position_embeddings,
    early_stopping=True,
    pad_token_id=processor.tokenizer.pad_token_id,
    eos_token_id=processor.tokenizer.eos_token_id,
    use_cache=True,
    num_beams=1,
    # forbid the unknown token from being generated
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    # return an output object (with `.sequences`) rather than a bare tensor
    return_dict_in_generate=True,
    output_scores=True,
)

outputs = model.generate(pixel_values.to(device), **generation_kwargs)

In [7]:
import re

# Turn the generated token ids back into text and keep the first sequence of the batch.
decoded = processor.batch_decode(outputs.sequences)[0]

# Drop the end-of-sequence token first, then padding.
for special_token in (processor.tokenizer.eos_token, processor.tokenizer.pad_token):
    decoded = decoded.replace(special_token, "")

# count=1: only the first <...> tag (the task start token) is removed;
# the remaining tags are kept for the JSON conversion step.
seq = re.sub(r"<.*?>", "", decoded, count=1).strip()
print(seq)

<s_question> When is the coffee break?</s_question><s_answer> 11-14 to 11:39 a.m.</s_answer>


## Convert to JSON

We can convert the generated sequence to JSON if required:

In [8]:
# Convert the tagged sequence into a dict; left as the cell's last expression
# so the notebook displays the result.
processor.token2json(seq)

{'answer': '11-14 to 11:39 a.m.', 'question': 'When is the coffee break?'}