In [1]:
import importlib
import segment
importlib.reload(segment)
from segment import load_report_with_images, get_docs_with_ocr
from doctr.models import ocr_predictor

# Build the docTR OCR predictor (detection arch 'db_resnet50', recognition arch
# 'crnn_vgg16_bn', pretrained weights) and move it to the GPU. `.cuda()` means
# this cell needs a CUDA device.
ocr_model = ocr_predictor(det_arch='db_resnet50', reco_arch='crnn_vgg16_bn', pretrained=True).cuda()
# Load the report items (limit=32 presumably caps how many are read -- see
# segment.load_report_with_images), then run OCR over them. Later cells read the
# keys 'words', 'boxes', 'labels', 'image_path' and 'dimensions' from each doc.
items = load_report_with_images(limit=32)
print("Loaded:", len(items))
docs = get_docs_with_ocr(items, ocr_model)
print("Parsed:", len(docs))

Loaded: 32


Constructing documents: 100%|██████████| 32/32 [00:00<00:00, 1943.69it/s]
OCR: 100%|██████████| 32/32 [00:04<00:00,  6.75it/s]

Parsed: 32





In [2]:
import matplotlib.pyplot as plt
from PIL import Image, ImageDraw
import importlib
import visualize
importlib.reload(visualize)
from visualize import visualize_word_boxes

# Pick the second OCR'd document and pass its image path, word boxes and labels
# to the visualizer (presumably returns the page with the boxes drawn on it).
data = docs[1]
img = visualize_word_boxes(data["image_path"], data["boxes"], data["labels"])
# Display is currently disabled; uncomment the lines below to render `img` inline.
# plt.figure(figsize=(12, 12))
# plt.imshow(img)
# plt.axis("off")
# plt.show()

In [7]:
from datasets import Dataset
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from PIL import Image, ImageDraw, ImageFont
import torch
from datasets import Features, Sequence, ClassLabel, Value, Array2D, Array3D
from torch.utils.data import DataLoader


# apply_ocr=False: words and boxes are supplied explicitly (from the OCR step
# above) instead of being produced by the processor's built-in OCR.
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=False)

# Disabled: explicit `datasets.Features` schema for the encoded examples.
# NOTE(review): `labels` on the ClassLabel line is not defined in this cell;
# define the label-name list before re-enabling this block.
# features = Features({
#     'image': Array3D(dtype="int64", shape=(3, 224, 224)),
#     'input_ids': Sequence(feature=Value(dtype='int64')),
#     'attention_mask': Sequence(Value(dtype='int64')),
#     'token_type_ids': Sequence(Value(dtype='int64')),
#     'bbox': Array2D(dtype="int64", shape=(512, 4)),
#     'labels': Sequence(ClassLabel(names=labels)),
# })

def normalize_doc(d):
    """Return a dataset row for `d` with its boxes rescaled to the 0-1000 grid.

    LayoutLM-style models expect bounding boxes as integers in [0, 1000],
    relative to the page size. The input document is NOT modified: new box
    lists are built, so calling this repeatedly on the same document (e.g.
    when the notebook cell is re-run) always yields the same result, and
    other consumers of `d['boxes']` keep seeing the original coordinates.

    Args:
        d: mapping with
            'dimensions' - unpacked as (height, width) of the page,
            'boxes'      - sequence of boxes indexed as [x0, y0, x1, y1],
            'words', 'labels', 'image_path' - passed through unchanged.

    Returns:
        dict with keys 'words', 'boxes', 'labels', 'image_path', where
        'boxes' is a new list of 4-element integer lists in [0, 1000].
    """
    h, w = d['dimensions']

    def _scale(value, extent):
        # Same arithmetic as the original (truncating int of value/extent*1000),
        # clamped so boxes that slightly overshoot the page stay in range.
        return min(max(int(value / extent * 1000), 0), 1000)

    boxes = [
        [_scale(box[0], w), _scale(box[1], h), _scale(box[2], w), _scale(box[3], h)]
        for box in d['boxes']
    ]
    return {
        "words": d["words"],          # list of strings
        "boxes": boxes,               # freshly built; input boxes are untouched
        "labels": d["labels"],        # list of ints
        "image_path": d["image_path"]
    }


def preprocess_data(examples):
    """Encode one batch of rows with the module-level LayoutLMv3 `processor`.

    Args:
        examples: batched mapping with the columns 'image_path', 'words',
            'boxes' and 'labels' (one entry per document).

    Returns:
        The processor's encoding for the batch, padded to the maximum length
        and truncated where a document is longer than that.
    """
    # Open every page first and force it to 3-channel RGB.
    page_images = []
    for image_path in examples['image_path']:
        page_images.append(Image.open(image_path).convert("RGB"))

    return processor(
        page_images,
        examples['words'],
        boxes=examples['boxes'],
        word_labels=examples['labels'],
        padding="max_length",
        truncation=True,
    )

# Turn every OCR'd document into a plain row, then hold out 10% of the rows as
# a test split (fixed seed so the split is the same on every run).
rows = [normalize_doc(d) for d in docs]
ds = Dataset.from_list(rows)
ds = ds.train_test_split(test_size=0.1, seed=42)
# Encode each split in batches; the raw columns are removed so only the
# processor's outputs remain in the mapped datasets.
train_dataset = ds['train'].map(preprocess_data, batched=True, remove_columns=ds['train'].column_names)
test_dataset = ds['test'].map(preprocess_data, batched=True, remove_columns=ds['test'].column_names)

# Have the datasets return torch tensors so the DataLoaders below can batch them.
train_dataset.set_format(type="torch")
test_dataset.set_format(type="torch")

# These module-level loaders are what LayoutTrainer hands back to the Trainer.
train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=2)

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [8]:
from transformers import LayoutLMv3ForTokenClassification

device = 'cuda'
# Load the pretrained *base* checkpoint (not a fine-tuned one). The 128-way
# token-classification head is newly initialized, as the warning printed by
# this cell confirms, so the model must be trained before its predictions mean
# anything. num_labels=128 has to stay in sync with the id2label list built
# in the metrics cell.
model = LayoutLMv3ForTokenClassification.from_pretrained('microsoft/layoutlmv3-base', num_labels=128)
model = model.to(device)

Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
import evaluate
# Metric objects from the `evaluate` library; compute_metrics calls both on the
# flattened token-level predictions.
metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")



In [53]:
import numpy as np

# Metrics
# NOTE(review): `return_entity_level_metrics` is not read anywhere in the
# visible cells -- confirm whether it is still needed.
return_entity_level_metrics = True

# Label names are simply the stringified class ids "0".."127". Despite its
# name, `id2label` is a list indexed by class id; `label2id` is the inverse dict.
id2label = [str(n) for n in np.arange(128)]
label2id = {label: idx for idx, label in enumerate(id2label)}

def compute_metrics(p):
    """Compute token-level accuracy and macro F1 for a prediction batch.

    Args:
        p: object exposing `predictions` (per-token class scores; the argmax
            over the last axis is taken) and `label_ids` (gold class ids,
            with -100 marking positions to ignore).

    Returns:
        dict with the keys "accuracy" and "f1_macro".
    """
    predicted_ids = p.predictions.argmax(-1)
    gold_ids = p.label_ids

    # Walk both arrays in parallel and keep only positions carrying a real
    # label; -100 marks positions that must not be scored.
    scored_pairs = [
        (int(pred_id), int(gold_id))
        for pred_row, gold_row in zip(predicted_ids, gold_ids)
        for pred_id, gold_id in zip(pred_row, gold_row)
        if gold_id != -100
    ]
    true_preds = [pair[0] for pair in scored_pairs]
    true_labels = [pair[1] for pair in scored_pairs]

    accuracy = metric_acc.compute(predictions=true_preds, references=true_labels)['accuracy']
    f1_macro = metric_f1.compute(predictions=true_preds, references=true_labels, average="macro")['f1']
    return {"accuracy": accuracy, "f1_macro": f1_macro}


In [54]:
from transformers import TrainingArguments, Trainer


class LayoutTrainer(Trainer):
    """Trainer that serves the notebook's prebuilt module-level DataLoaders.

    Both overrides return the loaders created in the data-preparation cell
    instead of building new ones from datasets passed to the Trainer.
    """

    def get_train_dataloader(self):
        """Return the module-level `train_dataloader` as-is."""
        return train_dataloader

    def get_test_dataloader(self, test_dataset):
        """Return the module-level `test_dataloader`.

        The `test_dataset` argument is ignored: whatever dataset is handed to
        `predict`, the prebuilt test loader is what gets iterated.
        """
        return test_dataloader

args = TrainingArguments(
    output_dir="../models/layoutlmv3-finetuned", # directory where checkpoints are stored
    max_steps=100, # train for at most 100 steps (the run below stops at global_step=100)
    warmup_ratio=0.1, # learning-rate warmup over the first 10% of the steps
    fp16=True, # mixed precision (less memory consumption)
    push_to_hub=False, # the trained model is NOT pushed to the Hub
)

# Initialize our Trainer. No train/eval datasets are passed here on purpose:
# LayoutTrainer overrides get_train_dataloader / get_test_dataloader to return
# the prebuilt module-level DataLoaders.
trainer = LayoutTrainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
)

# NOTE(review): `model` is the object created in an earlier cell; re-running
# this cell without re-creating the model continues training the same weights.
trainer.train()



Step,Training Loss


TrainOutput(global_step=100, training_loss=0.03136849403381348, metrics={'train_runtime': 20.8816, 'train_samples_per_second': 38.311, 'train_steps_per_second': 4.789, 'total_flos': 106282195353600.0, 'train_loss': 0.03136849403381348, 'epoch': 14.285714285714286})

In [56]:
# Run inference on the held-out split and unpack the three values that
# `predict` returns. This binds the global name `labels` to the test-set label
# array. NOTE(review): the recorded run shows test_loss ~7.9 against a training
# loss of ~0.03, which points to heavy overfitting on the small training split.
predictions, labels, metrics = trainer.predict(test_dataset)
print(metrics)



{'test_loss': 7.942758560180664, 'test_accuracy': 0.11551724137931034, 'test_f1_macro': 0.03160774300408237, 'test_runtime': 1.0949, 'test_samples_per_second': 3.653, 'test_steps_per_second': 0.913}


In [7]:
# Inspect the label ids returned by trainer.predict; -100 marks the positions
# that compute_metrics skips.
labels

array([[-100,    0,    0, ..., -100, -100, -100],
       [-100,    0,    0, ..., -100, -100, -100],
       [-100,    0,    1, ...,    7,    7, -100],
       [-100,    0,    0, ..., -100, -100, -100]], shape=(4, 512))

In [15]:
import evaluate
# Load the seqeval metric. NOTE(review): `metric` is not used in the visible
# cells; compute_metrics relies on metric_acc / metric_f1 instead.
metric = evaluate.load("seqeval")

