In [12]:
from pathlib import Path
from PIL import Image
from transformers import AutoProcessor, VisionEncoderDecoderModel, StoppingCriteria, StoppingCriteriaList
from collections import defaultdict
import fitz
import io
import torch


# Load the processor and the vision encoder-decoder model from the
# "facebook/nougat-small" checkpoint.
processor = AutoProcessor.from_pretrained("facebook/nougat-small")
model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-small")

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


image_path = Path("/bachelor/nougat_ori/nougat/0.png")
pdf_output_path = Path("/bachelor/nougat_ori/nougat/converted.pdf")


# Wrap the input image in a single-page PDF so it can be rasterized again
# further down at a controlled DPI.
pdf_bytes = io.BytesIO()
# The context manager closes the underlying image file once the PDF has been
# rendered into memory (the original left the handle open).
with Image.open(image_path) as image:
    # PNGs frequently carry an alpha channel (RGBA / LA). Not every Pillow
    # release can write those modes to PDF, so anything outside the modes the
    # PDF writer has long supported is converted to RGB first.
    # NOTE(review): exact mode support depends on the installed Pillow version.
    if image.mode in ("1", "L", "P", "RGB", "CMYK"):
        pdf_source = image
    else:
        pdf_source = image.convert("RGB")
    pdf_source.save(pdf_bytes, format="PDF")

pdf_output_path.write_bytes(pdf_bytes.getvalue())

print(f"Image successfully converted to PDF: {pdf_output_path}")


def rasterize_paper(
    pdf: Path,
    dpi: int = 96,
    return_pil: bool = True,
    pages: "list[int] | None" = None,
) -> "list[io.BytesIO]":
    """Render pages of a PDF to PNG and return them as in-memory buffers.

    Args:
        pdf: Path of the PDF document to open.
        dpi: Resolution passed to the page renderer.
        return_pil: When true, each rendered page is appended to the result.
            When false the pages are still rendered but nothing is collected,
            so the returned list is empty.
        pages: Page indices to render. ``None`` renders every page of the
            document.

    Returns:
        A list of ``io.BytesIO`` objects, each holding the PNG bytes of one
        page (open them with ``PIL.Image.open``). The list is empty, or holds
        only the pages rendered before the failure, if an error occurs.

    Errors are reported with ``print`` rather than raised, so callers must
    check for an empty result.
    """
    pillow_images = []
    try:
        # The context manager guarantees the document is closed even when a
        # page fails to render; the original never closed it.
        with fitz.open(pdf) as document:
            page_indices = range(len(document)) if pages is None else pages
            for i in page_indices:
                page_bytes = document[i].get_pixmap(dpi=dpi).pil_tobytes(format="PNG")
                if return_pil:
                    pillow_images.append(io.BytesIO(page_bytes))
    except Exception as e:
        # Deliberately best-effort: report the problem and hand back whatever
        # was collected so far.
        print(f"Error rasterizing: {e}")
    return pillow_images


# Rasterize the generated PDF and take its first page as the model input.
images = rasterize_paper(pdf=pdf_output_path, return_pil=True)
if not images:
    # rasterize_paper reports errors by printing and returning an empty list;
    # fail here with a clear message instead of an IndexError on images[0].
    raise RuntimeError(f"No pages could be rasterized from {pdf_output_path}")
image = Image.open(images[0])


# Preprocess the page image into the tensor the model expects.
pixel_values = processor(images=image, return_tensors="pt").pixel_values

outputs = model.generate(
    pixel_values.to(device),
    min_length=1,
    max_length=3584,
    # Forbid the tokenizer's unknown token in the generated sequence.
    bad_words_ids=[[processor.tokenizer.unk_token_id]],
    return_dict_in_generate=True,
    output_scores=True
)

# outputs[0] holds the generated token id sequences; decode the first one.
generated = processor.batch_decode(outputs[0], skip_special_tokens=True)[0]
print("\n----- Raw Output -----\n")
print(generated)

# NOTE(review): the original called generated.replace("\\", "\\") here, which
# replaces a single backslash with a single backslash (a no-op). It has been
# removed; if backslash normalization was intended, it needs to be specified.
generated = f"$$\n{generated.strip()}\n$$"

print("\n----- Processed Output -----\n")
print(generated)


output_text_path = Path("outputs/extracted_text.md")
try:
    # Create the target directory first so a fresh checkout does not fail
    # just because "outputs/" does not exist yet.
    output_text_path.parent.mkdir(parents=True, exist_ok=True)
    with open(output_text_path, "w", encoding="utf-8") as file:
        file.write("# Extracted Text from Image\n\n")
        file.write("\n")
        file.write(generated)
        file.write("\n")
except OSError as e:
    print(f"Error saving file: {e}")
else:
    print(f"\nText successfully saved in {output_text_path}")


Config of the encoder: <class 'transformers.models.donut.modeling_donut_swin.DonutSwinModel'> is overwritten by shared encoder config: DonutSwinConfig {
  "attention_probs_dropout_prob": 0.0,
  "depths": [
    2,
    2,
    14,
    2
  ],
  "drop_path_rate": 0.1,
  "embed_dim": 128,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "image_size": [
    896,
    672
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "mlp_ratio": 4.0,
  "model_type": "donut-swin",
  "num_channels": 3,
  "num_heads": [
    4,
    8,
    16,
    32
  ],
  "num_layers": 4,
  "patch_size": 4,
  "qkv_bias": true,
  "transformers_version": "4.46.3",
  "use_absolute_embeddings": false,
  "window_size": 7
}

Config of the decoder: <class 'transformers.models.mbart.modeling_mbart.MBartForCausalLM'> is overwritten by shared decoder config: MBartConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "add_final_layer_norm": true

Processing PDF: ..\outputs\pdfs\10_em_81.pdf
Page 1 Text: ...
Rasterizing PDF to images...
Performing OCR on page 1 of 10_em_81.pdf...
OCR Text (Page 1): ## References

* [1] A. A. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K...
Processing PDF: ..\outputs\pdfs\10_em_86.pdf
Page 1 Text: ...
Rasterizing PDF to images...
Performing OCR on page 1 of 10_em_86.pdf...
OCR Text (Page 1): ## References

* [1] A. A. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K...
Processing PDF: ..\outputs\pdfs\10_em_87.pdf
Page 1 Text: ...
Rasterizing PDF to images...
Performing OCR on page 1 of 10_em_87.pdf...
OCR Text (Page 1): ....
Processing PDF: ..\outputs\pdfs\10_em_89.pdf
Page 1 Text: ...
Rasterizing PDF to images...
Performing OCR on page 1 of 10_em_89.pdf...
OCR Text (Page 1): ## References

* [1] A. A. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K. K...
Processing PDF: ..\outputs\pdfs\11_em_91.pdf
Page 1 Text: ...
Raste