In [None]:
!pip install -q easyocr

In [None]:
import easyocr
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import json

from torch.nn import functional as F
from PIL import Image, ImageDraw, ImageFont
from transformers import LayoutLMv3Processor, LayoutLMv3ForTokenClassification, LayoutLMv3Config
from datasets import load_dataset, Dataset

In [None]:
# Fetch every available split of the FUNSD dataset prepared for LayoutLMv3.
# (A single split could be requested instead via `split="train"`.)
dataset = load_dataset(path="nielsr/funsd-layoutlmv3")
dataset

In [None]:
# Work with the first record of the test split and list the fields it provides.
example = dataset['test'][0]
print(example.keys())

In [None]:
# Build the EasyOCR reader for English text; add more language codes to the
# list to recognise additional languages.
reader = easyocr.Reader(['en'])

In [None]:
# Run EasyOCR on the selected example page.
#
# Produces: `image` (RGB NumPy array rescaled to `target_height`), `results`
# (raw EasyOCR output) and the parallel lists `texts`, `bboxes`, `confidences`.

# Load the page once as an RGB NumPy array (height x width x 3); EasyOCR
# accepts NumPy arrays directly.
image = np.array(example['image'].convert("RGB"))

# Get the resolution (dimensions) of the image
height, width, channels = image.shape
print(f"Original Image resolution: {width}x{height} (Width x Height)")

# Target height
target_height = 1000

# Rescale to the target height while preserving the aspect ratio
if height != target_height:
    scale_factor = target_height / height
    # Never let the width collapse to 0 for extremely tall, narrow images.
    new_width = max(1, int(width * scale_factor))
    new_height = target_height

    # PIL does the resampling; convert back to NumPy for EasyOCR.
    image = np.array(Image.fromarray(image).resize((new_width, new_height)))

# Get the new resolution
height, width, channels = image.shape
print(f"Resized Image resolution: {width}x{height} (Width x Height)")

# Apply OCR with the `reader` created in the previous cell; constructing a
# second Reader here would only repeat the same initialisation.
results = reader.readtext(image)

# Each result is a (box, text, confidence) triple; split it into parallel lists.
texts = [detected_text for _, detected_text, _ in results]
bboxes = [box for box, _, _ in results]
confidences = [confidence for _, _, confidence in results]

# Print extracted text and bounding boxes
print("Extracted Text:", texts)
print("Bounding Boxes:", bboxes)

In [None]:
# Overlay the EasyOCR detections on the (resized) page image.
fig, ax = plt.subplots(1, figsize=(20, 20))  # Large canvas so small text stays legible
ax.imshow(image)

for bbox, text, _ in results:
    # `bbox` is a list of corner points. Use the extremes over all corners so
    # the rectangle still encloses the text when the detection is rotated and
    # corners 0 and 2 are not the axis-aligned top-left / bottom-right.
    xs = [point[0] for point in bbox]
    ys = [point[1] for point in bbox]
    xmin, xmax = min(xs), max(xs)
    ymin, ymax = min(ys), max(ys)

    # Draw the bounding box around the text
    rect = patches.Rectangle((xmin, ymin), xmax - xmin, ymax - ymin,
                             linewidth=2, edgecolor='violet', facecolor='none')
    ax.add_patch(rect)

    # Annotate the recognised text at the box's top-left corner
    ax.text(xmin, ymin, text, color='violet', fontsize=12)

# Display the result
ax.axis('off')  # Hide axes for better image focus
plt.show()

In [None]:
# FUNSD tag set in IOB format; a tag's position in the list is its class id.
id2label = dict(enumerate(
    ['O', 'B-HEADER', 'I-HEADER', 'B-QUESTION', 'I-QUESTION', 'B-ANSWER', 'I-ANSWER']
))
# Inverse lookup: tag name -> class id.
label2id = {tag: index for index, tag in id2label.items()}

In [None]:
# Checkpoint used for both the processor and the token-classification model.
model_name = 'nielsr/layoutlmv3-finetuned-funsd'

# Configuration carrying the FUNSD label maps defined above.
# NOTE(review): `config` is not passed to `from_pretrained` below, so the model
# keeps the label maps stored in the checkpoint and `model.config.id2label` is
# what later cells read. Pass `config=config` (or `id2label=` / `label2id=`)
# if these maps are meant to override the checkpoint's.
config = LayoutLMv3Config.from_pretrained(model_name, num_labels=len(id2label))
config.id2label = id2label
config.label2id = {label: idx for idx, label in id2label.items()}

# The processor is loaded with its stored settings; pass `apply_ocr=False`
# here if words and boxes are to be supplied manually instead.
processor = LayoutLMv3Processor.from_pretrained(model_name)
model = LayoutLMv3ForTokenClassification.from_pretrained(model_name)

## v1

In [None]:
# Reload the page from the selected example as a PIL image (the OCR section
# above left `image` as a resized NumPy array). Converting to RGB keeps the
# coloured overlays drawn later meaningful even if the stored image is
# grayscale, and matches the conversion used for the OCR input.
image = example['image'].convert("RGB")

In [None]:
# Turn the page image into model inputs (PyTorch tensors) and show the shape
# of every tensor the processor returned.
encoding = processor(image, return_tensors="pt")
for name, tensor in encoding.items():
    print(name, tensor.shape)

In [None]:
# Pass the encoded data through the model. This is inference only, so disable
# gradient tracking to avoid building an autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(**encoding)

# Show the shape of every tensor in the model output.
for name, tensor in outputs.items():
    print(name, tensor.shape)

In [None]:
# Raw per-token class scores produced by the classification head.
logits = outputs["logits"]
logits

In [None]:
# Highest-scoring class id per token. Drop only the batch dimension
# (`squeeze(0)`): a bare `squeeze()` would also collapse a length-1 sequence
# and yield a scalar instead of a list.
predictions = logits.argmax(-1).squeeze(0).tolist()
print(predictions)

In [None]:
# Translate each predicted class id into its tag name using the label map
# stored on the loaded model.
true_predictions = [model.config.id2label[label_id] for label_id in predictions]

In [None]:
# Per-token bounding boxes as produced by the processor. Drop only the batch
# dimension so a single-token sequence still yields a list of boxes.
token_boxes = encoding.bbox.squeeze(0).tolist()

In [None]:
def unnormalize_box(bbox, width, height):
    """Scale a box from the 0-1000 coordinate grid to a width x height canvas.

    Args:
        bbox: Indexable with at least four numbers ``[x0, y0, x1, y1]``.
        width: Target canvas width; scales the x coordinates.
        height: Target canvas height; scales the y coordinates.

    Returns:
        A new list ``[x0, y0, x1, y1]`` with each coordinate divided by 1000
        and multiplied by the matching canvas dimension.
    """
    # x coordinates sit at even positions, y coordinates at odd positions.
    extents = (width, height, width, height)
    return [extent * (bbox[position] / 1000) for position, extent in enumerate(extents)]

# Pixel dimensions of the PIL page image.
width, height = image.width, image.height

# Token boxes rescaled from the 0-1000 grid to the image's pixel coordinates.
true_boxes = [unnormalize_box(token_box, width, height) for token_box in token_boxes]

In [None]:
# Default PIL bitmap font for the label captions.
font = ImageFont.load_default()

# Draw on a copy so the original page image stays untouched.
image_new = image.copy()
draw = ImageDraw.Draw(image_new)

def adjust_bbox(box, x_offset=0, y_offset=0, width_offset=0, height_offset=0):
    """Return a shifted and/or resized copy of an ``[x1, y1, x2, y2]`` box.

    Args:
        box: Exactly four coordinates ``(x1, y1, x2, y2)``.
        x_offset: Added to both x coordinates (translates horizontally).
        y_offset: Added to both y coordinates (translates vertically).
        width_offset: Extra amount added to ``x2`` only.
        height_offset: Extra amount added to ``y2`` only.

    Returns:
        A new list ``[x1, y1, x2, y2]``; the input is not modified.
    """
    left, top, right, bottom = box
    return [
        left + x_offset,
        top + y_offset,
        right + (x_offset + width_offset),
        bottom + (y_offset + height_offset),
    ]

def iob_to_label(label):
    """Drop the two-character IOB prefix (e.g. ``'B-'``) from a tag.

    Whatever remains after the first two characters is returned; if nothing
    remains (as for the ``'O'`` tag), ``'other'`` is returned instead.
    """
    stripped = label[2:]
    if not stripped:
        return 'other'
    return stripped

# Draw one labelled rectangle per distinct predicted box on `image_new`.

# Outline/text colour per entity type (keys are lower-cased labels).
label2color = {'question': 'blue', 'answer': 'green', 'header': 'orange', 'other': 'violet'}

# Optional manual fine-tuning of the drawn boxes, in pixels. The boxes are
# already in image pixel coordinates (`true_boxes`), so no correction is
# needed by default.
x_offset = 0       # Positive shifts right, negative shifts left
y_offset = 0       # Positive shifts down, negative shifts up
width_offset = 0   # Positive widens the box
height_offset = 0  # Positive makes the box taller

# Boxes already drawn; tokens that share a box are drawn only once.
drawn_boxes = set()

# Draw the pixel-space boxes. The original loop drew the 0-1000 normalised
# `token_boxes`, which do not line up with the image unless it happens to be
# exactly 1000x1000 pixels.
for prediction, token_box, pixel_box in zip(true_predictions, token_boxes, true_boxes):
    # An all-zero box carries no position; the LayoutLMv3 tokenizer assigns
    # such boxes to special tokens by default, so there is nothing to draw.
    if not any(token_box):
        continue

    predicted_label = iob_to_label(prediction).lower()

    # Apply the optional manual offsets
    adjusted_box = adjust_bbox(pixel_box, x_offset, y_offset, width_offset, height_offset)

    # Lists are not hashable, so use a tuple as the set key
    box_tuple = tuple(adjusted_box)

    # Skip if this bounding box has already been drawn
    if box_tuple in drawn_boxes:
        continue
    drawn_boxes.add(box_tuple)

    # Unknown labels fall back to red
    color = label2color.get(predicted_label, 'red')

    # Draw the box and put its label just above the top-left corner
    draw.rectangle(adjusted_box, outline=color, width=2)
    draw.text((adjusted_box[0] + 10, adjusted_box[1] - 10), text=predicted_label,
              fill=color, font=font)

image_new