# 🧠 LayoutLMv3 Demo: Extract Contract Key-Value Pairs from a PDF (Text or Scan)

This notebook demonstrates how to use `LayoutLMv3` from HuggingFace Transformers to extract structured information from contract PDFs.

It works for both scanned documents and text-based PDFs: the first page of the PDF is rendered to an image and the processor runs OCR on that image in either case.

---

In [None]:
# ✅ Install required packages
!pip install transformers datasets pytesseract pdf2image torch torchvision --quiet
!apt install poppler-utils tesseract-ocr -y  # For pdf2image and pytesseract on Colab (Linux)

In [None]:
# ✅ Imports
import pytesseract
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification
from PIL import Image
import torch
import os
from pdf2image import convert_from_path

In [None]:
# ✅ Load pretrained model and processor
# Both objects are loaded from the same Hub checkpoint, so name it once.
# NOTE(review): this looks like the base (pre-training) checkpoint — confirm whether a
# checkpoint fine-tuned for token classification is needed for meaningful labels.
checkpoint = "microsoft/layoutlmv3-base"

# apply_ocr=True asks the processor to run OCR on the images it is given.
processor = LayoutLMv3Processor.from_pretrained(checkpoint, apply_ocr=True)
model = LayoutLMv3ForTokenClassification.from_pretrained(checkpoint)

# Switch to inference mode (kept as the cell's last expression, as before).
model.eval()

In [None]:
# ✅ Convert PDF to image
# Render the PDF pages at 300 dpi and keep the first one for the demo.
pdf_path = "../store/input/your_contract_file.pdf"
images = convert_from_path(pdf_path, dpi=300)

image = images[0]  # first page only
image.save("page.jpg")  # keep a copy on disk next to the notebook

# Last expression of the cell: display the page inline.
image

In [None]:
# ✅ Prepare input for LayoutLMv3, run the model, and print the labelled tokens
# The processor turns the page image into model inputs. Truncating to 512 tokens
# keeps a text-dense page from producing a sequence longer than the model accepts,
# which would otherwise fail inside the forward pass.
encoding = processor(image, return_tensors="pt", truncation=True, max_length=512)

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**encoding)

# Index the single batch entry explicitly ([0]) instead of squeeze(), which would
# also drop any other size-1 dimension; convert to plain Python lists for zipping.
predictions = outputs.logits.argmax(-1)[0].tolist()
tokens = processor.tokenizer.convert_ids_to_tokens(encoding["input_ids"][0].tolist())

# Print every token whose predicted label id is non-zero.
# NOTE(review): id 0 is presumably the "no entity" class — confirm against model.config.id2label.
for token, pred in zip(tokens, predictions):
    if pred != 0:
        print(f"Token: {token}, Label ID: {pred}")