Step 1 — Load Dataset (Invoices Dataset)

In [1]:
from datasets import load_dataset

# Download the invoices dataset from the Hugging Face Hub; the bare name on the
# last line displays the returned DatasetDict.
DATASET_ID = "katanaml-org/invoices-donut-data-v1"
dataset = load_dataset(DATASET_ID)
dataset

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 425
    })
    validation: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 50
    })
    test: Dataset({
        features: ['image', 'ground_truth'],
        num_rows: 26
    })
})

Step 2 — Load Pre-trained DONUT Model and Processor

DONUT is a Transformer-based model designed to read an image directly and generate structured text output — such as invoice JSON — without a separate OCR step.

It has two main parts:

<b>1. Processor:</b>
<ul>
<li>Handles input preparation — turns images into pixel tensors the model understands.</li>
<li>Handles output preparation — tokenizes JSON strings into token IDs and vice versa.</li>
</ul>


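<b>2. Model:</b>
<ul>
<li>A <code>VisionEncoderDecoderModel</code>.</li>
<li>The encoder (a Swin Transformer, shown as <code>DonutSwinModel</code> in the printed architecture) turns the image into feature embeddings.</li>
<li>The decoder generates the JSON tokens.</li>
</ul>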

In [3]:
from transformers import DonutProcessor, VisionEncoderDecoderModel
import torch

# A single checkpoint supplies both pieces: the processor (image preprocessing
# + tokenizer) and the model (vision encoder + text decoder).
CHECKPOINT = "naver-clova-ix/donut-base"
processor = DonutProcessor.from_pretrained(CHECKPOINT, use_fast=True)
model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)

# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

print(f"Using device: {device}")


Using device: cuda


# Architecture

Trainable Parameters

In [4]:
def count_parameters(model):
    """Print the total, trainable, and frozen parameter counts of ``model``.

    Args:
        model: Any object exposing ``parameters()`` that yields tensors with
            ``numel()`` and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        None. The three counts are written to stdout with thousands separators.
    """
    n_total = 0
    n_trainable = 0
    # Single pass: every parameter counts toward the total, and only those
    # still receiving gradients count as trainable.
    for param in model.parameters():
        size = param.numel()
        n_total += size
        if param.requires_grad:
            n_trainable += size
    n_frozen = n_total - n_trainable

    print(f"Total Parameters: {n_total:,}")
    print(f"Trainable Parameters: {n_trainable:,}")
    print(f"Frozen Parameters: {n_frozen:,}")

# Report parameter counts for the whole encoder-decoder model
count_parameters(model)

Total Parameters: 201,852,024
Trainable Parameters: 201,852,024
Frozen Parameters: 0


In [5]:
# Report parameter counts for the vision encoder only
count_parameters(model.encoder)

Total Parameters: 74,180,728
Trainable Parameters: 74,180,728
Frozen Parameters: 0


In [6]:
# Report parameter counts for the text decoder only
count_parameters(model.decoder)

Total Parameters: 127,671,296
Trainable Parameters: 127,671,296
Frozen Parameters: 0


In [7]:
# Dump the full nested module tree of the encoder-decoder model
print(model)

VisionEncoderDecoderModel(
  (encoder): DonutSwinModel(
    (embeddings): DonutSwinEmbeddings(
      (patch_embeddings): DonutSwinPatchEmbeddings(
        (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
      )
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): DonutSwinEncoder(
      (layers): ModuleList(
        (0): DonutSwinStage(
          (blocks): ModuleList(
            (0): DonutSwinLayer(
              (layernorm_before): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
              (attention): DonutSwinAttention(
                (self): DonutSwinSelfAttention(
                  (query): Linear(in_features=128, out_features=128, bias=True)
                  (key): Linear(in_features=128, out_features=128, bias=True)
                  (value): Linear(in_features=128, out_features=128, bias=True)
                  (dropout): Dropout(p=0.0, inplace=False)
                )

In [8]:
print(model.encoder)  # Vision encoder (a Swin Transformer: prints as DonutSwinModel, not a plain ViT)
# print(model.decoder)  # Decoder (Transformer decoder)

DonutSwinModel(
  (embeddings): DonutSwinEmbeddings(
    (patch_embeddings): DonutSwinPatchEmbeddings(
      (projection): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
    )
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): DonutSwinEncoder(
    (layers): ModuleList(
      (0): DonutSwinStage(
        (blocks): ModuleList(
          (0): DonutSwinLayer(
            (layernorm_before): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
            (attention): DonutSwinAttention(
              (self): DonutSwinSelfAttention(
                (query): Linear(in_features=128, out_features=128, bias=True)
                (key): Linear(in_features=128, out_features=128, bias=True)
                (value): Linear(in_features=128, out_features=128, bias=True)
                (dropout): Dropout(p=0.0, inplace=False)
              )
              (output): DonutSwinSelfOutput(
                (dense): Linear(in

In [9]:
# List the dotted name of every submodule in the encoder (module objects unused)
for qualified_name, _ in model.encoder.named_modules():
    print(qualified_name)



embeddings
embeddings.patch_embeddings
embeddings.patch_embeddings.projection
embeddings.norm
embeddings.dropout
encoder
encoder.layers
encoder.layers.0
encoder.layers.0.blocks
encoder.layers.0.blocks.0
encoder.layers.0.blocks.0.layernorm_before
encoder.layers.0.blocks.0.attention
encoder.layers.0.blocks.0.attention.self
encoder.layers.0.blocks.0.attention.self.query
encoder.layers.0.blocks.0.attention.self.key
encoder.layers.0.blocks.0.attention.self.value
encoder.layers.0.blocks.0.attention.self.dropout
encoder.layers.0.blocks.0.attention.output
encoder.layers.0.blocks.0.attention.output.dense
encoder.layers.0.blocks.0.attention.output.dropout
encoder.layers.0.blocks.0.drop_path
encoder.layers.0.blocks.0.layernorm_after
encoder.layers.0.blocks.0.intermediate
encoder.layers.0.blocks.0.intermediate.dense
encoder.layers.0.blocks.0.intermediate.intermediate_act_fn
encoder.layers.0.blocks.0.output
encoder.layers.0.blocks.0.output.dense
encoder.layers.0.blocks.0.output.dropout
encoder.lay

In [10]:
# List the dotted name of every submodule in the decoder (module objects unused)
for qualified_name, _ in model.decoder.named_modules():
    print(qualified_name)


model
model.decoder
model.decoder.embed_tokens
model.decoder.embed_positions
model.decoder.layers
model.decoder.layers.0
model.decoder.layers.0.self_attn
model.decoder.layers.0.self_attn.k_proj
model.decoder.layers.0.self_attn.v_proj
model.decoder.layers.0.self_attn.q_proj
model.decoder.layers.0.self_attn.out_proj
model.decoder.layers.0.activation_fn
model.decoder.layers.0.self_attn_layer_norm
model.decoder.layers.0.encoder_attn
model.decoder.layers.0.encoder_attn.k_proj
model.decoder.layers.0.encoder_attn.v_proj
model.decoder.layers.0.encoder_attn.q_proj
model.decoder.layers.0.encoder_attn.out_proj
model.decoder.layers.0.encoder_attn_layer_norm
model.decoder.layers.0.fc1
model.decoder.layers.0.fc2
model.decoder.layers.0.final_layer_norm
model.decoder.layers.1
model.decoder.layers.1.self_attn
model.decoder.layers.1.self_attn.k_proj
model.decoder.layers.1.self_attn.v_proj
model.decoder.layers.1.self_attn.q_proj
model.decoder.layers.1.self_attn.out_proj
model.decoder.layers.1.activation

In [11]:
# Access encoder layers (each entry prints as a DonutSwinStage in the model dump above)
encoder_layers = model.encoder.encoder.layers

# Count number of layers (i.e. top-level stages, not individual blocks)
num_layers = len(encoder_layers)
print(f"Number of encoder layers: {num_layers}")


Number of encoder layers: 4


In [15]:
# Each encoder stage holds its own list of blocks; report the count per stage
for stage_idx, stage in enumerate(encoder_layers):
    n_blocks = len(stage.blocks)
    print(f"Layer {stage_idx} has {n_blocks} blocks")


Layer 0 has 2 blocks
Layer 1 has 2 blocks
Layer 2 has 14 blocks
Layer 3 has 2 blocks


In [16]:
# Access decoder layers (the path mirrors the "model.decoder.layers" names
# listed by named_modules() on model.decoder above)
decoder_layers = model.decoder.model.decoder.layers

# Count number of layers
num_decoder_layers = len(decoder_layers)
print(f"Number of decoder layers: {num_decoder_layers}")


Number of decoder layers: 4


In [17]:
# Count only the direct children of each decoder layer (not nested descendants)
for layer_idx, decoder_layer in enumerate(decoder_layers):
    n_children = sum(1 for _ in decoder_layer.children())
    print(f"Decoder Layer {layer_idx} has {n_children} submodules.")

Decoder Layer 0 has 8 submodules.
Decoder Layer 1 has 8 submodules.
Decoder Layer 2 has 8 submodules.
Decoder Layer 3 has 8 submodules.


In [None]:
# Grab one raw training sample to inspect before preprocessing.
# NOTE(review): the original cell was an unfinished `example = ` statement
# (a SyntaxError on Run All); taking the first train sample is an assumed intent.
example = dataset["train"][0]

Step 3 — Preprocess Dataset for DONUT Fine-Tuning

<ul>
<li>The model expects pixel tensors (numerical arrays) instead of PIL images.</li>
<li>The model expects token IDs (numerical sequences) instead of raw JSON strings.</li>
<li>So we must convert both inputs and outputs into numbers for the model to learn.</li>

</ul>

In [18]:
def preprocess(example):
    """Convert one raw sample into the tensors the model consumes.

    Relies on the module-level ``processor``.

    Args:
        example: Mapping with an ``"image"`` entry and a ``"ground_truth"``
            JSON-string entry.

    Returns:
        dict with ``"pixel_values"`` (first item of the processed image batch)
        and ``"labels"`` (first row of the tokenized ground-truth ids).
    """
    # Image side: the processor returns a batch; keep its only element
    image_batch = processor(example["image"], return_tensors="pt")

    # Text side: tokenize the JSON string as-is, with no special tokens added
    token_batch = processor.tokenizer(
        example["ground_truth"],
        add_special_tokens=False,
        return_tensors="pt",
    )

    return {
        "pixel_values": image_batch.pixel_values[0],
        "labels": token_batch.input_ids[0],
    }


In [None]:
# processed_dataset = dataset.map(preprocess, remove_columns=["image", "ground_truth"])

Map:   0%|          | 0/425 [00:00<?, ? examples/s]

Map: 100%|██████████| 425/425 [03:59<00:00,  1.78 examples/s]


ArrowMemoryError: realloc of size 8053063680 failed

# The `.map()` call above failed with an `ArrowMemoryError` because it tries to hold every preprocessed tensor in RAM at once. Instead, let's define a custom PyTorch `Dataset` class that preprocesses each sample on demand, and use a `DataLoader` to feed the model small batches.

<ul>
<li>.map() applies the preprocess function on every sample.</li>
<li>Removes original columns image and ground_truth because we replace them with numeric tensors.</li>
<li>Now the dataset has: pixel_values: the image as a tensor, labels: token IDs of the JSON</li>
</ul>

In [19]:
from torch.utils.data import Dataset

class InvoiceDataset(Dataset):
    """Map-style dataset that preprocesses one invoice sample per access.

    Preprocessing runs inside ``__getitem__``, so only the samples that are
    actually requested are converted to tensors.

    Args:
        hf_dataset: Indexable collection whose items expose an ``"image"``
            entry and a ``"ground_truth"`` (JSON string) entry.
        processor: Callable image processor that also exposes a ``tokenizer``
            attribute (a ``DonutProcessor`` in this notebook).
    """

    def __init__(self, hf_dataset, processor):
        self.dataset = hf_dataset
        self.processor = processor

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``{"pixel_values", "labels"}`` tensors for sample ``idx``.

        ``pixel_values`` is the processed image without its batch axis;
        ``labels`` is a 1-D tensor of token ids for the ground-truth string,
        terminated by the tokenizer's EOS token when it defines one.
        """
        example = self.dataset[idx]
        tokenizer = self.processor.tokenizer

        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # collapse any other size-1 axis (e.g. a one-token label would become
        # a 0-d tensor, which pad_sequence cannot handle).
        pixel_values = self.processor(
            example["image"], return_tensors="pt"
        ).pixel_values.squeeze(0)

        # End the target with EOS so the decoder learns where to stop;
        # without it, generation runs on until max_length.
        eos_token = getattr(tokenizer, "eos_token", None) or ""
        labels = tokenizer(
            example["ground_truth"] + eos_token,
            add_special_tokens=False,
            return_tensors="pt",
        ).input_ids.squeeze(0)

        return {"pixel_values": pixel_values, "labels": labels}


In [20]:
# Wrap the raw train/validation splits; samples are preprocessed one at a time
# when indexed rather than all up front
train_dataset = InvoiceDataset(dataset["train"], processor)
val_dataset = InvoiceDataset(dataset["validation"], processor)


Step 4 — Set Up DataLoader & Training Loop


<ul>
<li>DataLoader loads the data in small batches, processes them, and feeds them to the model.</li>
<li>Batching allows the GPU to process multiple samples at once, making training much faster.</li>
<li>We'll set:
<ul>
<li>Batch size (number of samples processed per step).</li>
<li>Shuffling for training data (helps the model generalize better).</li>
<li>Collate function to pad sequences in the batch (since JSON lengths vary).</li>
</ul>
</li>
</ul>

4.a) Create Collate Function (for Padding Labels):<br>
<ul>
<li>Different invoices have different JSON lengths, so we need to pad the labels to a common length within each batch.</li>
</ul>

In [21]:
def collate_fn(batch):
    """Combine per-sample dicts into one batch of tensors.

    Args:
        batch: Sequence of dicts, each with a ``"pixel_values"`` tensor (all of
            the same shape) and a ``"labels"`` tensor of token ids (variable
            length).

    Returns:
        dict with ``"pixel_values"`` stacked along a new leading axis and
        ``"labels"`` right-padded to the longest sequence in the batch.
    """
    pixel_values = torch.stack([item["pixel_values"] for item in batch])

    # reshape(-1) is a no-op for 1-D labels and lifts a 0-d label (one token
    # after an over-eager squeeze) to 1-D so pad_sequence accepts it.
    label_seqs = [item["labels"].reshape(-1) for item in batch]

    # Pad with -100, the index cross-entropy ignores, so padded positions do
    # not contribute to the loss. Padding with the tokenizer's pad id would
    # train the model to predict padding. The encoder-decoder wrapper swaps
    # -100 back to config.pad_token_id when building decoder inputs.
    labels = torch.nn.utils.rnn.pad_sequence(
        label_seqs, batch_first=True, padding_value=-100
    )
    return {"pixel_values": pixel_values, "labels": labels}


4.b) Create Dataloaders:<br>

In [22]:
from torch.utils.data import DataLoader

# Samples per step; tune to the available GPU memory (start with 1 or 2 if VRAM is low)
BATCH_SIZE = 2
loader_kwargs = {"batch_size": BATCH_SIZE, "collate_fn": collate_fn}

# Only the training split is shuffled; validation keeps its original order
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)


In [23]:
# Sanity check: pull one batch and print the image tensor's shape
# (presumably batch, channels, height, width — confirm against the processor config)
batch = next(iter(train_dataloader))
print(batch["pixel_values"].shape)


torch.Size([2, 3, 2560, 1920])


 Step 5 — Training Loop for Fine-Tuning DONUT

In training, the model learns to minimize the difference between:
<ul>
<li>Predicted output (JSON)</li>
<li>Ground truth (actual JSON)</li>
</ul>

We use:
<ul>
<li>Loss function → measures this difference (cross-entropy loss, which the model computes internally when labels are passed).</li>
<li>Optimizer → adjusts the model weights to minimize the loss (here, we use AdamW).</li>
</ul>

Training steps:
<ul>
<li>Loop through the training data (one pass over all batches = 1 epoch).</li>
<li>For each batch:
<ul>
<li>Move data to GPU.</li>
<li>Forward pass: the model predicts the output.</li>
<li>Calculate the loss.</li>
<li>Backward pass: compute gradients.</li>
<li>Optimizer updates the weights.</li>
</ul>
</li>
<li>Optionally evaluate on the validation set.</li>
</ul>


In [None]:
from torch.optim import AdamW



# Set the decoder start token and the padding token on the model config, both
# taken from the processor's tokenizer (original note: this resolved an earlier error).
start_token_id = processor.tokenizer.convert_tokens_to_ids(["<s>"])[0]
model.config.decoder_start_token_id = start_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id

# AdamW over every model parameter
optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3  # Number of full passes over the training set; adjust as needed
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    train_losses = []

    for batch in train_dataloader:
        # Put the batch on the same device as the model
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        # Passing labels makes the model compute its loss internally
        outputs = model(pixel_values=pixel_values, labels=labels)
        loss = outputs.loss

        # Clear old gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())

    # Mean of the per-batch losses for this epoch
    avg_loss = sum(train_losses) / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_loss:.4f}")

    # ---- Validation pass: eval mode, no gradient tracking ----
    model.eval()
    val_losses = []
    with torch.no_grad():
        for batch in val_dataloader:
            pixel_values = batch["pixel_values"].to(device)
            labels = batch["labels"].to(device)
            outputs = model(pixel_values=pixel_values, labels=labels)
            val_losses.append(outputs.loss.item())
    avg_val_loss = sum(val_losses) / len(val_dataloader)
    print(f"Validation Loss: {avg_val_loss:.4f}")


OutOfMemoryError: CUDA out of memory. Tried to allocate 300.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 2.64 GiB is allocated by PyTorch, and 17.48 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

Inference


In [None]:
# Pick one invoice from the test split (change the index to try other samples)
sample = dataset["test"][0]
image, ground_truth = sample["image"], sample["ground_truth"]

# Show the image and print the reference JSON for comparison with the prediction
image.show()
print("Ground Truth JSON:", ground_truth)

None
Ground Truth JSON: {"gt_parse": {"header": {"invoice_no": "97159829", "invoice_date": "09/18/2015", "seller": "Bradley-Andrade 9879 Elizabeth Common Lake Jonathan, RI 12335", "client": "Castro PLC Unit 9678 Box 9664 DPO AP 69387", "seller_tax_id": "985-73-8194", "client_tax_id": "994-72-1270", "iban": "GB81LZWO32519172531418"}, "items": [{"item_desc": "12\" Marble Lapis Inlay Chess Table Top With 2\" Pieces & 15\" Wooden Stand W537", "item_qty": "2,00", "item_net_price": "444,60", "item_net_worth": "889,20", "item_vat": "10%", "item_gross_worth": "978,12"}], "summary": {"total_net_worth": "$ 889,20", "total_vat": "$ 88,92", "total_gross_worth": "$ 978,12"}}}


In [6]:
# Turn the image into a pixel tensor and move it to the model's device
encoding = processor(image, return_tensors="pt")
pixel_values = encoding.pixel_values.to(device)

# Generate up to 512 tokens (greedy decoding, per the original note)
outputs = model.generate(
    pixel_values,
    max_length=512,
    return_dict_in_generate=True,
)

# Decode the generated ids to text, dropping special tokens; keep the only item
decoded_batch = processor.batch_decode(outputs.sequences, skip_special_tokens=True)
predicted_tokens = decoded_batch[0]

print("Predicted JSON:\n", predicted_tokens)


OutOfMemoryError: CUDA out of memory. Tried to allocate 470.00 MiB. GPU 0 has a total capacity of 4.00 GiB of which 0 bytes is free. Of the allocated memory 2.56 GiB is allocated by PyTorch, and 492.14 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Clean up model prediction: parse the decoded text with the processor's
# token2json helper, then print the result
predicted_json = processor.token2json(predicted_tokens)
print("Parsed Prediction:\n", predicted_json)