# Final Experiment Run – LayoutLM on FUNSD

## Purpose
This notebook executes the **final training and evaluation run** of the LayoutLM model on the **full FUNSD dataset**.
The underlying preprocessing pipeline, model configuration, and evaluation procedure were previously validated
in a separate **dry-run notebook** using reduced dataset subsets.

## Experimental Setup
- **Model:** LayoutLM (microsoft/layoutlm-base-uncased)
- **Dataset:** FUNSD
- **Task:** Token Classification (entity-level evaluation using BIO tags)
- **Training Mode:** Fine-tuning (CPU-only)
- **Evaluation:** Entity-level Precision / Recall / F1 (seqeval)
- **Seed:** 42

## Notes
- This notebook differs from the dry-run **only in runtime parameters** (dataset size, number of epochs).
- No changes were made to the model architecture, preprocessing, or evaluation logic.
- All results and configurations are stored in a dedicated output directory to ensure reproducibility.


In [1]:
import os
import json
import random
from datetime import datetime

import numpy as np
import torch
from datasets import load_dataset, Dataset

from transformers import (
    LayoutLMTokenizerFast,
    LayoutLMForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)

# Entity-level (NER-span) Metrics
from seqeval.metrics import precision_score, recall_score, f1_score

# --- Notebook stability ---
# Switch off the tokenizers library's parallelism through its environment flag.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# --- Reproducibility ---
# Seed Python's, NumPy's and PyTorch's random number generators with the same value.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# =========================
# Full-run configuration
# =========================

epochs = 3          # recommended for the final run (alternatively 2 when short on time)
batch_size = 1      # CPU-only, 8 GB RAM

# --- Dataset / Model ---
dataset_name = "nielsr/funsd"
model_name = "microsoft/layoutlm-base-uncased"

# --- Run Logging / Output ---
# Each run writes into its own timestamped directory (local time, second resolution).
run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = os.path.join("results", "funsd", "layoutlm", "fullrun", run_id)
os.makedirs(output_dir, exist_ok=True)

# --- Run configuration (kept for reproducibility; written to run_config.json later) ---
run_config = {
    "dataset": dataset_name,
    "model": model_name,
    "train_split": "train",
    "eval_split": "test",      # split used for the final report
    "epochs": epochs,
    "batch_size": batch_size,
    "seed": seed,
    "device": "cpu",
    "run_id": run_id,
    "output_dir": output_dir,
}


## 1) Load FUNSD, check fields, build the label list

In [2]:
# --- Load dataset ---
funsd = load_dataset(dataset_name)
print(funsd)

# --- Full Run: use full splits (no subsetting) ---
train_split = "train"
eval_split = "test"  # final reporting split; NOTE: the DatasetDict printed above lists only "train" and "test" (no "validation")

# Record the splits actually used so the stored run configuration matches this cell
run_config["train_split"] = train_split
run_config["eval_split"] = eval_split

# Full run = take the entire split (no .select(range(...)))
train_raw = funsd[train_split]
eval_raw = funsd[eval_split]

# FUNSD can use different token field names depending on dataset version;
# probe the first training example and fall back to "tokens" when "words" is absent
sample = train_raw[0]
token_field = "words" if "words" in sample else "tokens"
print("Using token field:", token_field)

# Label set (BIO tags) as strings, read from the dataset's "ner_tags" feature
label_list = funsd[train_split].features["ner_tags"].feature.names
num_labels = len(label_list)

print("Number of labels:", num_labels)
print("First labels:", label_list[:10])


DatasetDict({
    train: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image'],
        num_rows: 149
    })
    test: Dataset({
        features: ['id', 'words', 'bboxes', 'ner_tags', 'image'],
        num_rows: 50
    })
})
Using token field: words
Number of labels: 7
First labels: ['O', 'B-HEADER', 'I-HEADER', 'B-QUESTION', 'I-QUESTION', 'B-ANSWER', 'I-ANSWER']


## 2) Load tokenizer and model, set a clean label mapping

In [3]:
# Tokenizer and token-classification model are loaded from the same checkpoint name.
tokenizer = LayoutLMTokenizerFast.from_pretrained(model_name)
model = LayoutLMForTokenClassification.from_pretrained(model_name, num_labels=num_labels)

# Attach readable label names to the model config so logs and later analysis
# show the BIO tag strings instead of bare class indices.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}
model.config.id2label = id2label
model.config.label2id = label2id


Some weights of LayoutLMForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlm-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3) Encoding function (IMPORTANT: correct subtoken alignment)

In [4]:
def encode_funsd_examples(raw_ds, tokenizer, token_field, max_length=None):
    """
    Turn raw FUNSD examples into token-level model inputs for LayoutLM.

    Each example is read through the keys ``token_field``, ``"bboxes"`` and
    ``"ner_tags"`` and tokenized as pre-split words without padding (padding
    is left to the data collator).

    Per token the result holds:
    - the tokenizer outputs (e.g. input_ids, attention_mask)
    - ``bbox``: the box of the word the token came from, coordinates cast to
      int and clamped to [0, 1000]; special tokens get [0, 0, 0, 0]
    - ``labels``: the word's tag on its FIRST subtoken, -100 on every further
      subtoken and on special tokens, so entity-level (seqeval) evaluation
      counts each word exactly once

    Args:
        raw_ds: iterable of example dicts.
        tokenizer: callable tokenizer whose result offers ``word_ids()``.
        token_field: key under which an example stores its word list.
        max_length: forwarded to the tokenizer. ``truncation=True`` is always
            passed, so with ``None`` the tokenizer chooses the cut-off itself
            (presumably the model maximum, typically 512 — TODO confirm).

    Returns:
        A ``Dataset`` built from one dict per example.
    """

    def _clip(coord):
        # Cast to int after limiting the coordinate to the 0..1000 range.
        return int(max(0, min(1000, coord)))

    records = []

    for sample in raw_ds:
        sample_words = sample[token_field]
        boxes_per_word = sample["bboxes"]
        tags_per_word = sample["ner_tags"]

        enc = tokenizer(
            sample_words,
            truncation=True,
            padding=False,
            is_split_into_words=True,
            max_length=max_length,
        )

        token_labels = []
        token_boxes = []
        last_wid = None

        for wid in enc.word_ids():
            if wid is None:
                # Special tokens (e.g. [CLS], [SEP]) carry no word.
                token_labels.append(-100)
                token_boxes.append([0, 0, 0, 0])
            else:
                x0, y0, x1, y1 = boxes_per_word[wid]
                token_boxes.append([_clip(x0), _clip(y0), _clip(x1), _clip(y1)])
                # A change of word id marks the first subtoken of a new word.
                token_labels.append(tags_per_word[wid] if wid != last_wid else -100)
            last_wid = wid

        record = dict(enc.items())
        record["bbox"] = token_boxes
        record["labels"] = token_labels
        records.append(record)

    return Dataset.from_list(records)

## 4) Build the datasets and set the torch format

In [5]:
# Encode both splits; no max_length is passed, so the function's default (None) applies.
train_dataset = encode_funsd_examples(train_raw, tokenizer, token_field)
eval_dataset  = encode_funsd_examples(eval_raw, tokenizer, token_field)

# The Trainer works well with torch tensors
train_dataset.set_format("torch")
eval_dataset.set_format("torch")

print(train_dataset)
print(eval_dataset)


Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels'],
    num_rows: 149
})
Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels'],
    num_rows: 50
})


## 5) seqeval compute_metrics (entity-level)

In [6]:
def compute_metrics(eval_pred):
    """
    Entity-level evaluation with seqeval.

    Takes a ``(logits, labels)`` pair, picks the arg-max class per token,
    drops every position whose gold label is -100 and maps the remaining
    ids to their BIO tag strings via the module-level ``label_list``.

    Returns:
        dict with the keys "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    gold_tags = []
    pred_tags = []
    for pred_row, gold_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real label.
        kept = [(gold, pred) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        gold_tags.append([label_list[int(gold)] for gold, _ in kept])
        pred_tags.append([label_list[int(pred)] for _, pred in kept])

    scores = {}
    scores["precision"] = precision_score(gold_tags, pred_tags)
    scores["recall"] = recall_score(gold_tags, pred_tags)
    scores["f1"] = f1_score(gold_tags, pred_tags)
    return scores


In [9]:
# Sanity check: number of raw documents in the training and evaluation splits.
len(train_raw), len(eval_raw)

(149, 50)

## 6) TrainingArguments (CPU-only, notebook-stable) + Trainer + train/eval

In [7]:
# =========================
# Training & Evaluation
# =========================

training_args = TrainingArguments(
    output_dir=output_dir,

    # --- Full Run Parameters ---
    num_train_epochs=epochs,                # e.g. 1 (light) or 3 (final)
    per_device_train_batch_size=batch_size, # recommended: 1
    per_device_eval_batch_size=batch_size,

    # --- Optimization (spelled out for reproducibility) ---
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,

    # --- Evaluation & Saving ---
    evaluation_strategy="epoch",
    save_strategy="no",          # no checkpoints; metrics/configs are written by later cells

    # --- Logging (CPU-/notebook-friendly) ---
    logging_strategy="steps",
    logging_steps=50,            # less overhead than logging every 10 steps
    disable_tqdm=False,          # keep the progress bar visible (fine for an overnight run)

    # --- Reproducibility ---
    seed=seed,
    report_to="none",

    # --- Notebook / macOS Stability ---
    dataloader_num_workers=0,    # avoids fork-related problems
    dataloader_pin_memory=False,

    # --- Device ---
    use_cpu=True,                # explicitly CPU (instead of no_cuda=True)
)

# --- Dynamic padding (key for CPU performance) ---
data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,  # pad to longest sequence in batch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# --- Train ---
train_result = trainer.train()

# --- Final Evaluation ---
# NOTE: with per-epoch evaluation the last epoch was already evaluated on the same split;
# this extra pass exists to capture the metrics dict that later cells write to disk.
eval_metrics = trainer.evaluate()

print("Eval metrics:", eval_metrics)
print("Train result:", train_result)


  0%|          | 0/447 [00:00<?, ?it/s]

{'loss': 1.7459, 'grad_norm': 7.434628963470459, 'learning_rate': 4.9378109452736324e-05, 'epoch': 0.34}
{'loss': 1.0034, 'grad_norm': 8.642791748046875, 'learning_rate': 4.3159203980099506e-05, 'epoch': 0.67}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 0.6608246564865112, 'eval_precision': 0.6719404374127501, 'eval_recall': 0.7245358755644756, 'eval_f1': 0.6972477064220183, 'eval_runtime': 20.2777, 'eval_samples_per_second': 2.466, 'eval_steps_per_second': 2.466, 'epoch': 1.0}
{'loss': 0.8809, 'grad_norm': 5.578529357910156, 'learning_rate': 3.694029850746269e-05, 'epoch': 1.01}
{'loss': 0.5805, 'grad_norm': 8.722719192504883, 'learning_rate': 3.0721393034825876e-05, 'epoch': 1.34}
{'loss': 0.5853, 'grad_norm': 4.068142890930176, 'learning_rate': 2.4502487562189054e-05, 'epoch': 1.68}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 0.6193162798881531, 'eval_precision': 0.7001375515818432, 'eval_recall': 0.7661816357250376, 'eval_f1': 0.7316722568279826, 'eval_runtime': 20.2539, 'eval_samples_per_second': 2.469, 'eval_steps_per_second': 2.469, 'epoch': 2.0}
{'loss': 0.6393, 'grad_norm': 5.557031631469727, 'learning_rate': 1.828358208955224e-05, 'epoch': 2.01}
{'loss': 0.4071, 'grad_norm': 2.3904988765716553, 'learning_rate': 1.2064676616915425e-05, 'epoch': 2.35}
{'loss': 0.374, 'grad_norm': 7.742086410522461, 'learning_rate': 5.845771144278607e-06, 'epoch': 2.68}


  0%|          | 0/50 [00:00<?, ?it/s]

{'eval_loss': 0.6192845106124878, 'eval_precision': 0.7162855809612693, 'eval_recall': 0.77019568489714, 'eval_f1': 0.7422630560928433, 'eval_runtime': 20.4779, 'eval_samples_per_second': 2.442, 'eval_steps_per_second': 2.442, 'epoch': 3.0}
{'train_runtime': 928.9561, 'train_samples_per_second': 0.481, 'train_steps_per_second': 0.481, 'train_loss': 0.7303057484978798, 'epoch': 3.0}


  0%|          | 0/50 [00:00<?, ?it/s]

Eval metrics: {'eval_loss': 0.6192845106124878, 'eval_precision': 0.7162855809612693, 'eval_recall': 0.77019568489714, 'eval_f1': 0.7422630560928433, 'eval_runtime': 20.3556, 'eval_samples_per_second': 2.456, 'eval_steps_per_second': 2.456, 'epoch': 3.0}
Train result: TrainOutput(global_step=447, training_loss=0.7303057484978798, metrics={'train_runtime': 928.9561, 'train_samples_per_second': 0.481, 'train_steps_per_second': 0.481, 'train_loss': 0.7303057484978798, 'epoch': 3.0})


## 7) Save artifacts (metrics.json + run_config.json)

In [8]:
# Enrich the run configuration with label information before persisting it.
run_config.update(
    num_labels=num_labels,
    label_list_preview=label_list[:10],
    device="cpu",
)

# Write the configuration and the final evaluation metrics side by side.
for artifact_name, payload in (
    ("run_config.json", run_config),
    ("metrics.json", eval_metrics),
):
    with open(os.path.join(output_dir, artifact_name), "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)

print(f"Saved run_config.json and metrics.json to: {output_dir}")


Saved run_config.json and metrics.json to: results/funsd/layoutlm/fullrun/20260111_205332


In [13]:
import pandas as pd


def _to_device(batch, device):
    """Move tensors in batch dict to device."""
    out = {}
    for k, v in batch.items():
        if isinstance(v, torch.Tensor):
            out[k] = v.to(device)
        else:
            out[k] = v
    return out

@torch.no_grad()
def predict_single_example(model, collator, features, device):
    """
    Forward one encoded example through the model without gradient tracking.

    The example is collated into a batch of size one, moved to *device*, and
    passed to the model with the "labels" entry held back.

    Returns:
        (logits, labels): the first batch row of ``outputs.logits`` as a NumPy
        array, and the first row of the collated labels as a NumPy array, or
        ``None`` when the batch has no "labels" entry.
    """
    model.eval()
    moved = _to_device(collator([features]), device)

    # Labels are held back so the model only produces logits.
    forward_inputs = {name: value for name, value in moved.items() if name != "labels"}
    first_logits = model(**forward_inputs).logits[0].detach().cpu().numpy()

    label_batch = moved.get("labels", None)
    if label_batch is None:
        return first_logits, None
    return first_logits, label_batch[0].detach().cpu().numpy()

def decode_token_level_to_word_level(tokenizer_or_processor, words, word_boxes, token_logits, token_label_ids, label_list, max_length=512):
    """
    Map token-level predictions back to one row per word.

    The words are tokenized again only to obtain ``word_ids()``; for every
    word the prediction (and gold label) at its FIRST subtoken is kept,
    mirroring how labels are assigned during encoding. Special tokens and
    padding (word id ``None``) are skipped.

    Args:
        tokenizer_or_processor: callable tokenizer whose result offers
            ``word_ids()``. It is first called with ``boxes=word_boxes``; if
            that raises ``TypeError`` it is called again without boxes.
        words: list of word strings of the document.
        word_boxes: one bounding box per word, same order as ``words``.
        token_logits: per-token class scores, shape (seq_len, num_labels).
        token_label_ids: per-token gold label ids (-100 = ignore) or ``None``.
        label_list: label strings indexed by class id.
        max_length: truncation/padding length for the re-tokenization. It
            should match the length used when the logits were produced.

    Returns:
        list of dicts with the keys "word_id", "word", "true_label"
        (``None`` when unavailable or -100), "pred_label" and "bbox".
        Words whose first subtoken lies beyond the end of ``token_logits``
        are left out instead of raising ``IndexError``.
    """
    tokenize_kwargs = dict(
        truncation=True,
        padding="max_length",
        max_length=max_length,
        is_split_into_words=True,
        return_tensors=None,
    )
    try:
        # Tokenizers such as LayoutLMv3's require boxes.
        tok = tokenizer_or_processor(words, boxes=word_boxes, **tokenize_kwargs)
    except TypeError:
        # LayoutLM tokenizer: boxes are not accepted.
        tok = tokenizer_or_processor(words, **tokenize_kwargs)
    word_ids = tok.word_ids()

    token_pred_ids = np.argmax(token_logits, axis=-1)
    n_tokens = len(token_pred_ids)

    rows = []
    seen_word_ids = set()
    for t_idx, w_id in enumerate(word_ids):
        if t_idx >= n_tokens:
            # The re-tokenization is longer than the logits (e.g. a different
            # max_length was used at prediction time): nothing left to read.
            break
        if w_id is None or w_id in seen_word_ids:
            continue  # special/padding token, or not the first subtoken
        seen_word_ids.add(w_id)

        true_lab = None
        if token_label_ids is not None and t_idx < len(token_label_ids):
            true_id = int(token_label_ids[t_idx])
            if true_id != -100:
                true_lab = label_list[true_id]

        rows.append({
            "word_id": w_id,
            "word": words[w_id],
            "true_label": true_lab,
            "pred_label": label_list[int(token_pred_ids[t_idx])],
            "bbox": word_boxes[w_id],
        })

    return rows

def format_ner_df(rows):
    """
    Build a word-level DataFrame from decoded rows and add helper flags.

    Added columns:
    - ``is_entity_true``: gold label is not "O" (missing gold counts as "O")
    - ``is_entity_pred``: predicted label is not "O"
    - ``mismatch``: gold label (missing -> "O") differs from the prediction

    Args:
        rows: list of dicts with at least "true_label" and "pred_label".

    Returns:
        The DataFrame. For empty ``rows`` an empty frame that still has all
        expected columns is returned, so callers can filter and save it.
    """
    df = pd.DataFrame(rows)
    if df.empty:
        # A frame built from no rows has no columns at all; the label lookups
        # below would raise KeyError, so create the expected columns up front.
        df = pd.DataFrame(columns=["word_id", "word", "true_label", "pred_label", "bbox"])

    # Treat a missing gold label like "O" for both flags that depend on it.
    true_filled = df["true_label"].fillna("O")
    df["is_entity_true"] = true_filled.ne("O")
    df["is_entity_pred"] = df["pred_label"].ne("O")
    df["mismatch"] = true_filled != df["pred_label"]
    return df

def pick_examples(eval_raw, n=3, strategy="mixed"):
    """
    Pick example indices for qualitative analysis.

    Args:
        eval_raw: indexable collection of example dicts; each holds its word
            list under "words" or, failing that, "tokens".
        n: maximum number of indices to return.
        strategy:
          - "first": the first n indices
          - "random": n distinct indices drawn with a fixed seed (42)
          - anything else ("mixed"): shortest, median and longest document
            by number of words, in that order (at most three indices)

    Returns:
        list of int indices without duplicates. With fewer than three
        documents the "mixed" strategy returns each document at most once.
    """
    total = len(eval_raw)

    if strategy == "first":
        return list(range(min(n, total)))

    if strategy == "random":
        rng = np.random.default_rng(42)
        return rng.choice(total, size=min(n, total), replace=False).tolist()

    # mixed: choose short, medium, long
    lengths = []
    for i, ex in enumerate(eval_raw):
        words = ex["words"] if "words" in ex else ex["tokens"]
        lengths.append((i, len(words)))
    if not lengths:
        return []
    lengths.sort(key=lambda pair: pair[1])

    candidates = (
        lengths[0][0],                  # shortest
        lengths[len(lengths) // 2][0],  # median
        lengths[-1][0],                 # longest
    )
    # With one or two documents these positions coincide; dict.fromkeys drops
    # the repeats while keeping the shortest/median/longest order.
    picks = list(dict.fromkeys(candidates))
    return picks[:max(n, 0)]

In [14]:
# =========================
# Qualitative examples (per model)
# =========================

# 1) Set these two based on your notebook:
# LayoutLM notebook:
#   ENCODE_FN = encode_funsd_examples
#   TOKENIZER_FOR_WORDIDS = tokenizer
#
# LayoutLMv3 notebook:
#   ENCODE_FN = encode_funsd_examples_layoutlmv3
#   TOKENIZER_FOR_WORDIDS = processor.tokenizer

ENCODE_FN = encode_funsd_examples
TOKENIZER_FOR_WORDIDS = tokenizer

assert ENCODE_FN is not None, "Please set ENCODE_FN to your encoding function."
assert TOKENIZER_FOR_WORDIDS is not None, "Please set TOKENIZER_FOR_WORDIDS (tokenizer or processor.tokenizer)."

# Prefer the device, model and collator held by the Trainer when one exists
# in this session; otherwise fall back to the standalone objects.
# (LayoutLM: DataCollatorForTokenClassification, LayoutLMv3: default_data_collator)
if "trainer" in globals():
    device = trainer.args.device
    model_for_pred = trainer.model
    collator_for_pred = trainer.data_collator
else:
    device = torch.device("cpu")
    model_for_pred = model
    collator_for_pred = data_collator

# Three evaluation documents: shortest, median and longest by word count.
example_indices = pick_examples(eval_raw, n=3, strategy="mixed")
print("Selected eval example indices:", example_indices)

qual_dir = os.path.join(output_dir, "qualitative")
os.makedirs(qual_dir, exist_ok=True)

for idx in example_indices:
    raw_ex = eval_raw[idx]
    words = raw_ex[token_field]
    word_boxes = raw_ex["bboxes"] if "bboxes" in raw_ex else raw_ex["bbox"]

    # Encode this single document with the model-specific encoding function;
    # a processor, when defined in the session, takes the tokenizer's place.
    encoder_arg = processor if "processor" in globals() else tokenizer
    encoded_ds = ENCODE_FN([raw_ex], encoder_arg, token_field)

    # First (and only) row holds the features of this document.
    features = encoded_ds[0]
    if isinstance(encoded_ds, Dataset):
        # Rows of a Dataset may come back as lists/arrays; hand tensors to the collator.
        features = {
            key: (torch.tensor(value) if isinstance(value, (np.ndarray, list)) else value)
            for key, value in features.items()
        }

    logits, label_ids = predict_single_example(model_for_pred, collator_for_pred, features, device)

    # Collapse token-level output to one row per word.
    rows = decode_token_level_to_word_level(
        TOKENIZER_FOR_WORDIDS,
        words=words,
        word_boxes=word_boxes,
        token_logits=logits,
        token_label_ids=label_ids,
        label_list=label_list,
        max_length=512,
    )
    df = format_ner_df(rows)

    # Full table first, then the subset where gold and prediction disagree.
    out_csv = os.path.join(qual_dir, f"qual_example_{idx}.csv")
    df.to_csv(out_csv, index=False)

    mismatch_df = df.loc[df["mismatch"].eq(True)].copy()
    out_csv_mismatch = os.path.join(qual_dir, f"qual_example_{idx}_mismatch.csv")
    mismatch_df.to_csv(out_csv_mismatch, index=False)

    print(f"[{idx}] saved: {out_csv}")
    print(f"[{idx}] saved mismatches: {out_csv_mismatch}")

# Keep a record of which documents were inspected.
with open(os.path.join(qual_dir, "selected_example_indices.json"), "w", encoding="utf-8") as f:
    json.dump({"indices": example_indices}, f, indent=2)

Selected eval example indices: [26, 5, 45]
[26] saved: results/funsd/layoutlm/fullrun/20260111_205332/qualitative/qual_example_26.csv
[26] saved mismatches: results/funsd/layoutlm/fullrun/20260111_205332/qualitative/qual_example_26_mismatch.csv
[5] saved: results/funsd/layoutlm/fullrun/20260111_205332/qualitative/qual_example_5.csv
[5] saved mismatches: results/funsd/layoutlm/fullrun/20260111_205332/qualitative/qual_example_5_mismatch.csv
[45] saved: results/funsd/layoutlm/fullrun/20260111_205332/qualitative/qual_example_45.csv
[45] saved mismatches: results/funsd/layoutlm/fullrun/20260111_205332/qualitative/qual_example_45_mismatch.csv


In [None]:
# =========================
# End-of-run summary + environment logging
# =========================

import platform
import transformers
import datasets

# Copy the run settings that belong in the one-line summary.
summary = {
    key: run_config.get(key, None)
    for key in (
        "model",
        "dataset",
        "train_split",
        "eval_split",
        "epochs",
        "batch_size",
        "seed",
    )
}
summary["device"] = run_config.get("device", "cpu")

# Headline metrics: prefer the "eval_"-prefixed key, fall back to the bare
# name, and record NaN when neither is present.
for prefixed, bare in (
    ("eval_precision", "precision"),
    ("eval_recall", "recall"),
    ("eval_f1", "f1"),
):
    summary[prefixed] = float(eval_metrics.get(prefixed, eval_metrics.get(bare, np.nan)))
for metric_key in ("eval_loss", "eval_runtime"):
    summary[metric_key] = float(eval_metrics.get(metric_key, np.nan))

summary_df = pd.DataFrame([summary])
print(summary_df)

# Persist the summary as JSON and as a single-row CSV.
os.makedirs(output_dir, exist_ok=True)
summary_path_json = os.path.join(output_dir, "summary.json")
summary_path_csv  = os.path.join(output_dir, "summary.csv")

with open(summary_path_json, "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

summary_df.to_csv(summary_path_csv, index=False)

print("Saved:", summary_path_json)
print("Saved:", summary_path_csv)

# Record interpreter, platform and library versions next to the results.
env = dict(
    python_version=platform.python_version(),
    platform=platform.platform(),
    torch_version=torch.__version__,
    transformers_version=transformers.__version__,
    datasets_version=datasets.__version__,
)

env_path = os.path.join(output_dir, "environment.json")
with open(env_path, "w", encoding="utf-8") as f:
    json.dump(env, f, indent=2)

print("Saved:", env_path)


                             model       dataset train_split eval_split  \
0  microsoft/layoutlm-base-uncased  nielsr/funsd       train       test   

   epochs  batch_size  seed device  eval_precision  eval_recall   eval_f1  \
0       3           1    42    cpu        0.716286     0.770196  0.742263   

   eval_loss  eval_runtime  
0   0.619285       20.3556  
Saved: results/funsd/layoutlm/fullrun/20260111_205332/summary.json
Saved: results/funsd/layoutlm/fullrun/20260111_205332/summary.csv
Saved: results/funsd/layoutlm/fullrun/20260111_205332/environment.json
