In [1]:
!pip install transformers datasets seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=de81ce2f4ca4ade2ac67acf50f23c929183b13ec9849bf9dc0d900304c465399
  Stored in directory: /root/.cache/pip/wheels/5f/b8/73/0b2c1a76b701a677653dd79ece07cfabd7457989dbfbdcd8d7
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [27]:
import json
from PIL import Image
import os

def load_funsd_split(split="training"):
    data = []
    
    base = "/kaggle/input/datasets/aravindram11/funsdform-understanding-noisy-scanned-documents/dataset"
    
    ann_dir = f"{base}/{split}_data/annotations"
    img_dir = f"{base}/{split}_data/images"
    
    for file in os.listdir(ann_dir):
        if not file.endswith(".json"):
            continue
        
        with open(os.path.join(ann_dir, file), "r") as f:
            ann = json.load(f)
        
        image_path = os.path.join(img_dir, file.replace(".json", ".png"))
        image = Image.open(image_path).convert("RGB")
        
        tokens = []
        bboxes = []
        ner_tags = []
        
        for item in ann["form"]:
            label = item["label"].upper()
            words = item["words"]
            
            for idx, word in enumerate(words):
                tokens.append(word["text"])
                bboxes.append(word["box"])
                
                if label == "OTHER":
                    ner_tags.append("O")
                else:
                    prefix = "B-" if idx == 0 else "I-"
                    ner_tags.append(prefix + label)
        
        data.append({
            "id": file,
            "tokens": tokens,
            "bboxes": bboxes,
            "ner_tags": ner_tags,
            "image": image
        })
    
    return data

In [28]:
train_data = load_funsd_split("training")
test_data = load_funsd_split("testing")

print("Train size:", len(train_data))
print("Test size:", len(test_data))

Train size: 149
Test size: 50


In [31]:
label_list = list(set(tag for doc in train_data for tag in doc["ner_tags"]))
label_list = sorted(label_list)

label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

print(label_list)

['B-ANSWER', 'B-HEADER', 'B-QUESTION', 'I-ANSWER', 'I-HEADER', 'I-QUESTION', 'O']


In [32]:
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification

from transformers import LayoutLMv3ImageProcessor, LayoutLMv3TokenizerFast, LayoutLMv3Processor

image_processor = LayoutLMv3ImageProcessor.from_pretrained(
    "microsoft/layoutlmv3-base",
    apply_ocr=False  # üî• VERY IMPORTANT
)

tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")

processor = LayoutLMv3Processor(image_processor, tokenizer)

model = LayoutLMv3ForTokenClassification.from_pretrained(
    "microsoft/layoutlmv3-base",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

Loading weights:   0%|          | 0/212 [00:00<?, ?it/s]

[1mLayoutLMv3ForTokenClassification LOAD REPORT[0m from: microsoft/layoutlmv3-base
Key                                | Status     | 
-----------------------------------+------------+-
layoutlmv3.embeddings.position_ids | UNEXPECTED | 
classifier.weight                  | MISSING    | 
classifier.bias                    | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


In [33]:
from datasets import Dataset

train_dataset = Dataset.from_list(train_data)
test_dataset = Dataset.from_list(test_data)

def encode(example):
    encoding = processor(
        example["image"],
        example["tokens"],
        boxes=example["bboxes"],
        word_labels=[label2id[label] for label in example["ner_tags"]],
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # üî• Remove batch dimension from each tensor
    encoding = {k: v.squeeze(0) for k, v in encoding.items()}

    return encoding

train_dataset = train_dataset.map(encode, batched=False)
test_dataset = test_dataset.map(encode, batched=False)

Map:   0%|          | 0/149 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [16]:
import transformers
print(transformers.__version__)

5.2.0


In [35]:
from seqeval.metrics import classification_report, f1_score

def compute_metrics(p):
    predictions, labels = p
    predictions = predictions.argmax(axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(pred, lab) if l != -100]
        for pred, lab in zip(predictions, labels)
    ]

    return {
        "f1": f1_score(true_labels, true_predictions)
    }

In [37]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./layoutlmv3-funsd",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="no",
    logging_steps=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    max_grad_norm=1.0
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()
metrics = trainer.evaluate()
print(metrics)

warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


Epoch,Training Loss,Validation Loss,F1
1,1.987033,1.619372,0.634624
2,1.349043,1.331499,0.720055
3,0.987176,1.205467,0.757371
4,0.840203,1.21525,0.782911
5,0.480419,1.313016,0.785679
6,0.62,1.376371,0.793069
7,0.358026,1.344589,0.793635
8,0.238793,1.388594,0.805963
9,0.218715,1.464901,0.809852
10,0.167741,1.457022,0.80713


{'eval_loss': 1.4570218324661255, 'eval_f1': 0.8071304778410497, 'eval_runtime': 6.9857, 'eval_samples_per_second': 7.157, 'eval_steps_per_second': 1.861, 'epoch': 10.0}


In [21]:
!pip install seqeval



In [39]:
import torch
import numpy as np

def extract_structured_json(example):
    model.eval()
    
    # Encode single example
    encoding = processor(
        example["image"],
        example["tokens"],
        boxes=example["bboxes"],
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    # Move to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    encoding = {k: v.to(device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2).cpu().numpy()[0]

    tokens = example["tokens"]

    structured_output = {}

    current_entity = None
    current_tokens = []

    for token, pred_id in zip(tokens, predictions[:len(tokens)]):
        label = id2label[pred_id]

        if label == "O":
            if current_entity:
                text = " ".join(current_tokens)
                structured_output.setdefault(current_entity, []).append(text)
                current_entity = None
                current_tokens = []
            continue

        if label.startswith("B-"):
            if current_entity:
                text = " ".join(current_tokens)
                structured_output.setdefault(current_entity, []).append(text)

            current_entity = label[2:]
            current_tokens = [token]

        elif label.startswith("I-") and current_entity == label[2:]:
            current_tokens.append(token)

        else:
            if current_entity:
                text = " ".join(current_tokens)
                structured_output.setdefault(current_entity, []).append(text)
            current_entity = None
            current_tokens = []

    # Catch last entity
    if current_entity and current_tokens:
        text = " ".join(current_tokens)
        structured_output.setdefault(current_entity, []).append(text)

    return {
        "document_id": example["id"],
        "extracted_fields": structured_output
    }

In [40]:
sample = test_data[0]

output_json = extract_structured_json(sample)

import json
print(json.dumps(output_json, indent=2))

{
  "document_id": "83823750.json",
  "extracted_fields": {
    "QUESTION": [
      "To",
      "2",
      "Sender",
      "Reference",
      "Message:",
      "83823750",
      "Confidentiality Note:",
      "212",
      "-450 -5578",
      "17560 -188",
      "and may contain information that is privileged, confidential or",
      "disclosure  distribution or",
      "copying of this facsimile or the",
      "the intended recipient, or an"
    ],
    "ANSWER": [
      "Company",
      "Charles Duggan Sender Voice Number Main Fax Operator",
      "Voice Number 212 -450-",
      "4785 Sender Fax Number",
      "This facsimile  intended only  the person",
      "or entity to which it is addressed",
      "otherwise protected from",
      "information herein by anyone other than",
      "employee or agent responsible for delivering",
      "the message to the"
    ],
    "HEADER": [
      "Robert H. Shaw, Esq. November 11, 1997 Lorillard Tobacco"
    ]
  }
}
