# LayoutLM Dry-Run auf FUNSD

Ziel des Dry-Runs: **Pipeline-Test, keine Bestleistung**. Dieser Ablauf überprüft einmal den vollständigen Weg von Laden → Vorverarbeitung → Training → Evaluation auf CPU.

In [1]:
import os
import json
import random
from datetime import datetime

import numpy as np
import torch
from datasets import load_dataset, Dataset

from transformers import (
    LayoutLMTokenizerFast,
    LayoutLMForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

# Entity-level (NER-span) Metrics
from seqeval.metrics import precision_score, recall_score, f1_score

# --- Notebook stability: switch tokenizers parallelism off (avoids deadlock warnings) ---
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Reproducibility: feed the same seed to every RNG used here ---
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

# --- Dry-run configuration (deliberately tiny) ---
train_subset, eval_subset = 50, 20
epochs = 1
batch_size = 1

# --- Dataset / model identifiers ---
dataset_name = "nielsr/funsd"
model_name = "microsoft/layoutlm-base-uncased"

# --- Run logging / output: one timestamped directory per run ---
run_id = f"{datetime.now():%Y%m%d_%H%M%S}"
output_dir = os.path.join("results", "funsd", "layoutlm", run_id)
os.makedirs(output_dir, exist_ok=True)

# Collected settings; written to run_config.json at the end of the notebook.
# The splits in use are added further down, once they are chosen explicitly.
run_config = dict(
    dataset=dataset_name,
    model=model_name,
    train_subset=train_subset,
    eval_subset=eval_subset,
    epochs=epochs,
    batch_size=batch_size,
    seed=seed,
    run_id=run_id,
    output_dir=output_dir,
)


## 1) FUNSD laden, Felder prüfen, Label-Liste erstellen

In [4]:
# Load the dataset and show which splits it provides
funsd = load_dataset(dataset_name)
print(funsd)

# Split choice: this dataset only provides "train" and "test" (no validation
# split), so "test" is the evaluation split for the dry run and for final reporting.
train_split, eval_split = "train", "test"
run_config.update(train_split=train_split, eval_split=eval_split)

# Take the first N examples of each split, capped at the split's size
train_full = funsd[train_split]
eval_full = funsd[eval_split]
train_raw = train_full.select(range(min(len(train_full), train_subset)))
eval_raw = eval_full.select(range(min(len(eval_full), eval_subset)))

# Depending on the dataset version the word list is stored as "words" or "tokens"
sample = train_raw[0]
token_field = "tokens" if "words" not in sample else "words"
print("Using token field:", token_field)

# BIO tag names as strings; the position in the list is the label id
label_list = train_full.features["ner_tags"].feature.names
num_labels = len(label_list)

print("Number of labels:", num_labels)
print("First labels:", label_list[:10])


DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image'],
        num_rows: 149
    })
    test: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image'],
        num_rows: 50
    })
})
Using token field: words
Number of labels: 7
First labels: ['O', 'B-HEADER', 'I-HEADER', 'B-QUESTION', 'I-QUESTION', 'B-ANSWER', 'I-ANSWER']


## 2) Tokenizer/Model laden + Label-Mapping sauber setzen

In [5]:
# Label mappings in both directions, following the order of label_list
id2label = dict(enumerate(label_list))
label2id = {tag: idx for idx, tag in id2label.items()}

tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)

# Token-classification model with one output class per BIO tag
model = LayoutLMForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Store the mappings on the config so logs and later analysis show tag names
model.config.id2label = id2label
model.config.label2id = label2id


Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3) Encoding-Funktion (WICHTIG: korrektes Subtoken-Alignment)

In [6]:
def encode_funsd_examples(raw_ds, tokenizer, token_field):
    """
    Convert raw FUNSD examples into token-level LayoutLM inputs.

    Args:
        raw_ds: iterable of examples; each must provide ``token_field``,
            "bboxes" and "ner_tags", all indexed per word.
        tokenizer: tokenizer called with ``is_split_into_words=True``; its
            result must offer ``word_ids()``.
        token_field: name of the example field that holds the word list.

    Returns:
        A ``Dataset`` built from one dict per example. Each dict holds the
        tokenizer outputs plus:
        - "bbox": one box per token (the word's box; [0, 0, 0, 0] for
          special tokens)
        - "labels": the word label on the first sub-token of each word,
          -100 on every other token
    """
    rows = []

    for example in raw_ds:
        word_boxes = example["bboxes"]
        word_labels = example["ner_tags"]

        # Word list -> sub-tokens; padding is left to the data collator
        encoding = tokenizer(
            example[token_field],
            truncation=True,
            padding=False,
            is_split_into_words=True,
        )

        # For every token: index of its source word, or None for special tokens
        token_word_ids = encoding.word_ids()
        # Same sequence shifted right by one, so each token sees its predecessor
        preceding_word_ids = [None] + token_word_ids[:-1]

        token_boxes = []
        token_labels = []
        for current, preceding in zip(token_word_ids, preceding_word_ids):
            if current is None:
                # Special token: dummy box, excluded from the loss
                token_labels.append(-100)
                token_boxes.append([0, 0, 0, 0])
                continue

            # Every sub-token inherits the box of its word
            token_boxes.append(word_boxes[current])
            # Only the first sub-token of a word carries the label
            token_labels.append(word_labels[current] if current != preceding else -100)

        row = dict(encoding.items())
        row["bbox"] = token_boxes      # key name expected by LayoutLM
        row["labels"] = token_labels   # key name expected by the Trainer
        rows.append(row)

    return Dataset.from_list(rows)


## 4) Datasets erzeugen + Torch-Format setzen

In [7]:
# Encode both raw subsets with the same tokenizer and word field
train_dataset, eval_dataset = (
    encode_funsd_examples(split, tokenizer, token_field)
    for split in (train_raw, eval_raw)
)

# Have both datasets return torch tensors, which the Trainer works well with
train_dataset.set_format(type="torch")
eval_dataset.set_format(type="torch")

print(train_dataset, eval_dataset, sep="\n")


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels'],
    num_rows: 50
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels'],
    num_rows: 20
})


## 5) seqeval compute_metrics (entity-level)

In [8]:
def compute_metrics(eval_pred):
    """
    Entity-level precision, recall and F1 via seqeval.

    Args:
        eval_pred: pair ``(logits, labels)``; the class axis of ``logits``
            is the last one.

    Returns:
        Dict with the keys "precision", "recall" and "f1".

    Positions whose gold label is -100 are skipped. The remaining label ids
    and predicted ids are mapped to BIO tag strings through ``label_list``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    gold_tags = []
    predicted_tags = []
    for predicted_row, gold_row in zip(predicted_ids, labels):
        # Keep only the positions that carry a real label
        kept = [
            (int(gold), int(pred))
            for pred, gold in zip(predicted_row, gold_row)
            if gold != -100
        ]
        gold_tags.append([label_list[gold] for gold, _ in kept])
        predicted_tags.append([label_list[pred] for _, pred in kept])

    scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {name: scorer(gold_tags, predicted_tags) for name, scorer in scorers}


## 6) TrainingArguments (CPU-only, notebook-stabil) + Trainer + Train/Eval

In [None]:
# Training setup for the dry run: one pass over the small subset on CPU,
# evaluating once per epoch and saving no checkpoints.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` and the Trainer's `tokenizer`
# argument to `processing_class` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    evaluation_strategy="epoch",
    save_strategy="no",

    logging_strategy="steps",
    logging_steps=10,

    seed=seed,
    report_to="none",

    # Notebook/macOS stability
    dataloader_num_workers=0,
    dataloader_pin_memory=False,

    # Force CPU execution (used instead of no_cuda=True)
    use_cpu=True,
)

# Dynamic padding: pad to the longest sequence in the batch (much faster than max_length on CPU)
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train first, then run an explicit evaluation; its result dict is what gets
# written to metrics.json in the final cell.
train_result = trainer.train()
eval_metrics = trainer.evaluate()

print("Eval metrics:", eval_metrics)
print("Train result:", train_result)


  0%|          | 0/50 [00:00<?, ?it/s]

{'loss': 1.8238, 'grad_norm': 3.1697402000427246, 'learning_rate': 4e-05, 'epoch': 0.2}
{'loss': 1.8142, 'grad_norm': 7.966146945953369, 'learning_rate': 3e-05, 'epoch': 0.4}


## 7) Artefakte speichern (metrics.json + run_config.json)

In [None]:
# Extend the run configuration with the label information and the device
run_config.update(
    num_labels=num_labels,
    label_list_preview=label_list[:10],
    device="cpu",
)

# Both artefacts are written into the run's output directory
config_path = os.path.join(output_dir, "run_config.json")
metrics_path = os.path.join(output_dir, "metrics.json")

with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(run_config, config_file, indent=2)

with open(metrics_path, "w", encoding="utf-8") as metrics_file:
    json.dump(eval_metrics, metrics_file, indent=2)

print(f"Saved run_config.json and metrics.json to: {output_dir}")
