# Legacy Import

In [34]:
import os
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
from transformers import LayoutLMForSequenceClassification, LayoutLMTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import pytesseract
from datasets import Features, Sequence, ClassLabel, Value, Array2D
import numpy as np

classes = ["birth certificate", "driving", "ssn", "tax_document"]

# Legacy Methods

In [35]:
from datasets import Dataset

def normalize_box(box, width, height):
     return [
         int(1000 * (box[0] / width)),
         int(1000 * (box[1] / height)),
         int(1000 * (box[2] / width)),
         int(1000 * (box[3] / height)),
     ]

def apply_ocr(example):
        # get the image
        image = Image.open(example['image_path'])

        width, height = image.size
        
        # apply ocr to the image 
        ocr_df = pytesseract.image_to_data(image, output_type='data.frame')
        float_cols = ocr_df.select_dtypes('float').columns
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
        ocr_df = ocr_df.dropna().reset_index(drop=True)

        # get the words and actual (unnormalized) bounding boxes
        #words = [word for word in ocr_df.text if str(word) != 'nan'])
        words = list(ocr_df.text)
        words = [str(w) for w in words]
        coordinates = ocr_df[['left', 'top', 'width', 'height']]
        actual_boxes = []
        for idx, row in coordinates.iterrows():
            x, y, w, h = tuple(row) # the row comes in (left, top, width, height) format
            actual_box = [x, y, x+w, y+h] # we turn it into (left, top, left+width, top+height) to get the actual box 
            actual_boxes.append(actual_box)
        
        # normalize the bounding boxes
        boxes = []
        for box in actual_boxes:
            boxes.append(normalize_box(box, width, height))
        
        # add as extra columns 
        assert len(words) == len(boxes)
        example['words'] = words
        example['bbox'] = boxes
        return example


In [36]:
tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")

def encode_example(example, max_seq_length=512, pad_token_box=[0, 0, 0, 0]):
  words = example['words']
  normalized_word_boxes = example['bbox']

  assert len(words) == len(normalized_word_boxes)

  token_boxes = []
  for word, box in zip(words, normalized_word_boxes):
      word_tokens = tokenizer.tokenize(word)
      token_boxes.extend([box] * len(word_tokens))
  
  # Truncation of token_boxes
  special_tokens_count = 2 
  if len(token_boxes) > max_seq_length - special_tokens_count:
      token_boxes = token_boxes[: (max_seq_length - special_tokens_count)]
  
  # add bounding boxes of cls + sep tokens
  token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
  
  encoding = tokenizer(' '.join(words), padding='max_length', truncation=True)
  # Padding of token_boxes up the bounding boxes to the sequence length.
  input_ids = tokenizer(' '.join(words), truncation=True)["input_ids"]
  padding_length = max_seq_length - len(input_ids)
  token_boxes += [pad_token_box] * padding_length
  encoding['bbox'] = token_boxes

  assert len(encoding['input_ids']) == max_seq_length
  assert len(encoding['attention_mask']) == max_seq_length
  assert len(encoding['token_type_ids']) == max_seq_length
  assert len(encoding['bbox']) == max_seq_length

  return encoding

In [37]:
# we need to define the features ourselves as the bbox of LayoutLM are an extra feature
features = Features({
    'input_ids': Sequence(feature=Value(dtype='int64')),
    'bbox': Array2D(dtype="int64", shape=(512, 4)),
    'attention_mask': Sequence(Value(dtype='int64')),
    'token_type_ids': Sequence(Value(dtype='int64')),
    'image_path': Value(dtype='string'),
    'words': Sequence(feature=Value(dtype='string')),
})


In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LayoutLMForSequenceClassification.from_pretrained("C:/Users/atulp/Desktop/document classifier/layoutclassification/Document-Classification-using-LayoutLM/saved_model")
model.to(device)

LayoutLMForSequenceClassification(
  (layoutlm): LayoutLMModel(
    (embeddings): LayoutLMEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (x_position_embeddings): Embedding(1024, 768)
      (y_position_embeddings): Embedding(1024, 768)
      (h_position_embeddings): Embedding(1024, 768)
      (w_position_embeddings): Embedding(1024, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LayoutLMEncoder(
      (layer): ModuleList(
        (0-11): 12 x LayoutLMLayer(
          (attention): LayoutLMAttention(
            (self): LayoutLMSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True

# Data Processing Flow

In [62]:
images = []
labels = []
dataset_path = 'test_data'

for label_folder, _, file_names in os.walk(dataset_path):
    print(label_folder, _, file_names)
    print(file_names)
    relative_image_names = []
    relative_image_names.append(dataset_path + "/" + file_names[0])
    images.extend(relative_image_names)
test_data = pd.DataFrame.from_dict({'image_path': images})
test_data.head()

test_data [] ['s1 - Copy (4).jpg']
['s1 - Copy (4).jpg']


Unnamed: 0,image_path
0,test_data/s1 - Copy (4).jpg


In [63]:

import pytesseract
import numpy as np
pytesseract.pytesseract.tesseract_cmd = 'C:/Program Files/Tesseract-OCR/tesseract.exe'
test_dataset = Dataset.from_pandas(test_data)
updated_test_dataset = test_dataset.map(apply_ocr)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [64]:
import pandas as pd
df = pd.DataFrame.from_dict(updated_test_dataset)
print(len(df["words"][0]))
print(df["words"][0])

10
['000-000-0000', 'THIS', 'NUMBER', 'HAS', 'BEE', 'NIESTABLISHED', 'FOR', 'JOHN', 'SMITH', 'SIGNATURE']


In [73]:
encoded_test_dataset = updated_test_dataset.map(lambda example: encode_example(example))

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [74]:
encoded_test_dataset.set_format(type='torch', columns=['input_ids', 'bbox', 'attention_mask', 'token_type_ids'])
test_dataloader = torch.utils.data.DataLoader(encoded_test_dataset, batch_size=1, shuffle=True)
test_batch = next(iter(test_dataloader))

In [75]:
input_ids = test_batch["input_ids"].to(device)
bbox = test_batch["bbox"].to(device)
attention_mask = test_batch["attention_mask"].to(device)
token_type_ids = test_batch["token_type_ids"].to(device)

# forward pass
outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, 
                token_type_ids=token_type_ids)

# prediction = int(torch.max(outputs.data, 1)[1].numpy())
print(outputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.7278, -3.1298, -3.0508,  6.4619]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


In [76]:
# import torch.nn.functional as F
# pt_predictions = F.softmax(outputs[0], dim=-1)
# pt_predictions

classification_logits = outputs.logits
classification_results = torch.softmax(classification_logits, dim=1).tolist()[0]
for i in range(len(classes)):
    print(f"{classes[i]}: {int(round(classification_results[i] * 100))}%")

birth certificate: 0%
driving: 0%
ssn: 0%
tax_document: 100%


In [77]:
thisdict ={}
for i in range(len(classes)):
    thisdict[classes[i]] = str(int(round(classification_results[i] * 100))) + "%"
print(thisdict)


{'birth certificate': '0%', 'driving': '0%', 'ssn': '0%', 'tax_document': '100%'}


In [78]:
import torch.nn.functional as F
pt_predictions = F.softmax(outputs[0], dim=-1)
pt_predictions

tensor([[2.7737e-04, 6.8262e-05, 7.3876e-05, 9.9958e-01]],
       grad_fn=<SoftmaxBackward0>)

In [79]:
predictions = outputs.logits.argmax(-1).squeeze().tolist()
predictions

3

In [72]:
# # NATIVE T5

# generated_answer = model.generate(input_ids, attention_mask=attention_mask, 
#                                  max_length=decoder_max_len, top_p=0.98, top_k=50)
# decoded_answer = tokenizer.decode(generated_answer.numpy()[0])
# print("Answer: ", decoded_answer)

In [81]:
!pip freeze > requirementsss.txt