In [5]:
import os
import glob
import json 
import random
from pathlib import Path
from difflib import SequenceMatcher


import cv2
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from IPython.display import display
import matplotlib
from matplotlib import pyplot, patches

## Preparing the dataset
The location of the SROIE dataset and the name of an example file used for demonstration purposes

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
os.chdir('/content/drive/MyDrive/mulltiply/layout')

In [69]:
sroie_folder_path = Path('/content/drive/MyDrive/mulltiply/layout/invoices')
example_file = Path('X51005365187.txt')

### Reading the words and bounding boxes
So, the first step is reading the OCR data, where every line in the file includes a group of words and a bounding box which defines them. All we have to do is read the file, discard the unneeded points in the bounding box (because the model requires only the top-left and bottom-right points) and save it in Pandas Dataframe.

In [11]:
def read_bbox_and_words(path: Path):
  bbox_and_words_list = []

  with open(path, 'r', errors='ignore') as f:
    for line in f.read().splitlines():
      if len(line) == 0:
        continue
        
      split_lines = line.split(",")

      bbox = np.array(split_lines[0:8], dtype=np.int32)
      text = ",".join(split_lines[8:])

      # From the splited line we save (filename, [bounding box points], text line).
      # The filename will be useful in the future
      bbox_and_words_list.append([path.stem, *bbox, text])
    
  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  dataframe = dataframe.drop(columns=['x1', 'y1', 'x3', 'y3'])

  return dataframe


# Example usage
bbox_file_path = sroie_folder_path / "test/box" / example_file
print("== File content ==")
!head -n 5 "{bbox_file_path}"

bbox = read_bbox_and_words(path=bbox_file_path)
print("\n== Dataframe ==")
bbox.head(5)

== File content ==
17,35,371,35,371,91,17,91,3-1707067
222,115,511,115,511,143,222,143,F&P PHARMACY
274,148,457,148,457,174,274,174,(002309592-P)
204,183,529,183,529,210,204,210,NO.20. GROUND FLOOR,
99,218,632,218,632,244,99,244,JALAN BS 10/6 TAMAN BUKIT SERDANG,

== Dataframe ==




Unnamed: 0,filename,x0,y0,x2,y2,line
0,X51005365187,17,35,371,91,3-1707067
1,X51005365187,222,115,511,143,F&P PHARMACY
2,X51005365187,274,148,457,174,(002309592-P)
3,X51005365187,204,183,529,210,"NO.20. GROUND FLOOR,"
4,X51005365187,99,218,632,244,"JALAN BS 10/6 TAMAN BUKIT SERDANG,"


### Reading the entities file
Now we need to read the entities file to know what to label in our text.

In [12]:
def read_entities(path: Path):
  with open(path, 'r') as f:
    data = json.load(f)

  dataframe = pd.DataFrame([data])
  return dataframe


# Example usage
entities_file_path = sroie_folder_path /  "test/entities" / example_file
print("== File content ==")
!head "{entities_file_path}"

entities = read_entities(path=entities_file_path)
print("\n\n== Dataframe ==")
entities

== File content ==
{
    "company": "F&P PHARMACY",
    "date": "02/03/2018",
    "address": "NO.20. GROUND FLOOR, JALAN BS 10/6 TAMAN BUKIT SERDANG, SEKSYEN 10, 43300 SERI KEMBANGAN. SELANGOR DARUL EHSAN",
    "total": "31.90"
}

== Dataframe ==


Unnamed: 0,company,date,address,total
0,F&P PHARMACY,02/03/2018,"NO.20. GROUND FLOOR, JALAN BS 10/6 TAMAN BUKIT...",31.9


### Assigning labels to words using the entities data
We have our words/lines and entities, now we just need to put them together by labeling our lines using the entities values. We'll be doing that by substring matching the entities values with the lines and if they don't match to a similarity check using pythons _difflib.SequenceMatcher_ and assigning anything above the 0.8 (80%) prediction match.

The **label "O"** will define all our words not labeled during the assignment step, because it's required for us to label everything.

In [13]:
# Assign a label to the line by checking the similarity
# of the line and all the entities
def assign_line_label(line: str, entities: pd.DataFrame):
    line_set = line.replace(",", "").strip().split()
    for i, column in enumerate(entities):
        entity_values = entities.iloc[0, i].replace(",", "").strip()
        entity_set = entity_values.split()
        
        
        matches_count = 0
        for l in line_set:
            if any(SequenceMatcher(a=l, b=b).ratio() > 0.8 for b in entity_set):
                matches_count += 1
            
            if (column.upper() == 'ADDRESS' and (matches_count / len(line_set)) >= 0.5) or \
               (column.upper() != 'ADDRESS' and (matches_count == len(line_set))) or \
               matches_count == len(entity_set):
                return column.upper()

    return "O"


line = bbox.loc[1,"line"]
label = assign_line_label(line, entities)
print("Line:", line)
print("Assigned label:", label)

Line: F&P PHARMACY
Assigned label: COMPANY


With a function which can handle the labeling of our lines, we'll create another function to label all our line in one DataFrame (so one receipt).

As simple as this could be, the problem arises when we get lines which would all pass the same match, like **TOTAL** for example; a receipt could have only one item on it and its price could be the same as the final total, so duplicate labels. Or maybe part of the address is also present at the end of the receipt.

To ignore such examples, I wrote simple hard-coded rules to assign *total* and *date* to only the largest bounding boxes it could find (based on its area) and to not allow the address to be assigned after date or total.

In [26]:
def assign_labels(words: pd.DataFrame, entities: pd.DataFrame):
    max_area = {"TOTAL": (0, -1), "DATE": (0, -1)}  # Value, index
    already_labeled = {"TOTAL": False,
                       "DATE": False,
                       "ADDRESS": False,
                       "COMPANY": False,
                       "O": False
    }

    # Go through every line in $words and assign it a label
    labels = []
    for i, line in enumerate(words['line']):
        label = assign_line_label(line, entities)

        already_labeled[label] = True
        if (label == "ADDRESS" and already_labeled["TOTAL"]) or \
           (label == "COMPANY" and (already_labeled["DATE"] or already_labeled["TOTAL"])):
            label = "O"

        # Assign to the largest bounding box
        if label in ["TOTAL", "DATE"]:
            x0_loc = words.columns.get_loc("x0")
            bbox = words.iloc[i, x0_loc:x0_loc+4].to_list()
            area = (bbox[2] - bbox[0]) + (bbox[3] - bbox[1])

            if max_area[label][0] < area:
                max_area[label] = (area, i)

            label = "O"

        labels.append(label)

    labels[max_area["DATE"][1]] = "DATE"
    labels[max_area["TOTAL"][1]] = "TOTAL"

    words["label"] = labels
    return words


# Example usage
bbox_labeled = assign_labels(bbox, entities)
bbox_labeled.head(15)

Unnamed: 0,filename,x0,y0,x2,y2,line,label
0,X51005365187,17,35,371,91,3-1707067,O
1,X51005365187,222,115,511,143,F&P PHARMACY,COMPANY
2,X51005365187,274,148,457,174,(002309592-P),O
3,X51005365187,204,183,529,210,"NO.20. GROUND FLOOR,",ADDRESS
4,X51005365187,99,218,632,244,"JALAN BS 10/6 TAMAN BUKIT SERDANG,",ADDRESS
5,X51005365187,111,250,623,275,"SEKSYEN 10, 43300 SERI KEMBANGAN,",ADDRESS
6,X51005365187,186,284,538,308,SELANGOR DARUL EHSAN,ADDRESS
7,X51005365187,251,316,484,339,TEL 03-89599823,O
8,X51005365187,182,351,346,378,GST REG NO,O
9,X51005365187,274,387,461,409,TAX INVOICE,O


In [21]:
bbox_labeled

Unnamed: 0,filename,x0,y0,x2,y2,line,label
0,X51005365187,17,35,371,91,3-1707067,O
1,X51005365187,222,115,511,143,F&P PHARMACY,COMPANY
2,X51005365187,274,148,457,174,(002309592-P),O
3,X51005365187,204,183,529,210,"NO.20. GROUND FLOOR,",ADDRESS
4,X51005365187,99,218,632,244,"JALAN BS 10/6 TAMAN BUKIT SERDANG,",ADDRESS
...,...,...,...,...,...,...,...
101,X51005365187,430,1528,498,1548,30.68,O
102,X51005365187,623,1528,677,1550,1.22,O
103,X51005365187,48,1580,685,1604,"GOODS SOLD ARE NOT RETURNABLE & EXCHANGABLE,",O
104,X51005365187,296,1607,437,1628,THANK YOU.,O


In [22]:
bbox_labeled = pd.read_csv('/content/drive/MyDrive/mulltiply/layout/invoices/labels.csv')
bbox_labeled

Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,0001 (2).jpg,2480,3509,TAX INVOICE CUM CHALLAN,849,254,1709,333
1,0001 (2).jpg,2480,3509,ALL Subject to Howrah Jurisdiction,856,347,1713,411
2,0001 (2).jpg,2480,3509,Email:dineshjaiswal19792gmail.com,124,79,1041,211
3,0001 (2).jpg,2480,3509,Ph:2643-4958,131,208,813,276
4,0001 (2).jpg,2480,3509,Mb:9836557024,134,276,831,351
...,...,...,...,...,...,...,...,...
347,"Sell Tax Invoice April,2021-0001.jpg",2480,3509,"PLACE OF SUPPLY:Vadodara,GUJarat",109,2826,1377,2883
348,"Sell Tax Invoice April,2021-0001.jpg",2480,3509,Declaration,116,3101,1834,3154
349,"Sell Tax Invoice April,2021-0001.jpg",2480,3509,1.We declare that this invoice shows actual pr...,116,3161,1109,3243
350,"Sell Tax Invoice April,2021-0001.jpg",2480,3509,2.E.O.E.,113,3243,1109,3283


### Split words
For the last part we're splitting the lines into separate tokens with their own bounding boxes.

Splitting the bounding boxes based on word length is probably not the best approach, but it's good enough.

In [27]:
def split_line(line: pd.Series):
  line_copy = line.copy()

  line_str = line_copy.loc["line"]
  words = line_str.split(" ")

  # Filter unwanted tokens
  words = [word for word in words if len(word) >= 1]

  x0, y0, x2, y2 = line_copy.loc[['x0', 'y0', 'x2', 'y2']]
  bbox_width = x2 - x0
  

  new_lines = []
  for index, word in enumerate(words):
    x2 = x0 + int(bbox_width * len(word)/len(line_str))
    line_copy.at['x0', 'x2', 'line'] = [x0, x2, word]
    new_lines.append(line_copy.to_list())
    x0 = x2 + 5 

  return new_lines


# Example usage
new_lines = split_line(bbox_labeled.loc[1])
print("Original row:")
display(bbox_labeled.loc[1:1,:])

print("Splitted row:")
pd.DataFrame(new_lines, columns=bbox_labeled.columns)

Original row:


Unnamed: 0,filename,x0,y0,x2,y2,line,label
1,X51005365187,222,115,511,143,F&P PHARMACY,COMPANY


Splitted row:


Unnamed: 0,filename,x0,y0,x2,y2,line,label
0,X51005365187,222,115,294,143,F&P,COMPANY
1,X51005365187,299,115,491,143,PHARMACY,COMPANY


### Putting it all together
We defined all our functions, now we just have to use them on every file and transform the dataset into a format which the script/model can parse.

In [28]:
from time import perf_counter
def dataset_creator(folder: Path):
  bbox_folder = folder / 'box'
  entities_folder = folder / 'entities'
  img_folder = folder / 'img'

  # Sort by filename so that when zipping them together
  # we don't get some other file (just in case)
  entities_files = sorted(entities_folder.glob("*.txt"))
  bbox_files = sorted(bbox_folder.glob("*.txt"))
  img_files = sorted(img_folder.glob("*.jpg"))

  data = []

  print("Reading dataset:")
  for bbox_file, entities_file, img_file in tqdm(zip(bbox_files, entities_files, img_files), total=len(bbox_files)):            
    # Read the files
    bbox = read_bbox_and_words(bbox_file)
    entities = read_entities(entities_file)
    image = Image.open(img_file)

    # Assign labels to lines in bbox using entities
    bbox_labeled = assign_labels(bbox, entities)
    del bbox

    # Split lines into separate tokens
    new_bbox_l = []
    for index, row in bbox_labeled.iterrows():
      new_bbox_l += split_line(row)
    new_bbox = pd.DataFrame(new_bbox_l, columns=bbox_labeled.columns, dtype=np.int16)
    del bbox_labeled


    # Do another label assignment to keep the labeling more precise 
    for index, row in new_bbox.iterrows():
      label = row['label']

      if label != "O":
        entity_values = entities.iloc[0, entities.columns.get_loc(label.lower())]
        entity_set = entity_values.split()
        
        if any(SequenceMatcher(a=row['line'], b=b).ratio() > 0.7 for b in entity_set):
            label = "S-" + label
        else:
            label = "O"
      
      new_bbox.at[index, 'label'] = label

    width, height = image.size
  
    data.append([new_bbox, width, height])

  return data

Because the dataset has two folders, one meant for training the model and one for testing its performance, we can use the same script to read them both and save them in their respected variables.

In [37]:
dataset_train = dataset_creator(sroie_folder_path / 'train')
dataset_test = dataset_creator(sroie_folder_path / 'test')

Reading dataset:


  """Entry point for launching an IPython kernel.
100%|██████████| 626/626 [01:42<00:00,  6.09it/s]


Reading dataset:


  
100%|██████████| 347/347 [00:53<00:00,  6.52it/s]


In [30]:
type(dataset_train)

list

In [58]:
source_dir = '/content/drive/MyDrive/mulltiply/layout/invoices/label'
dirs = os.listdir(source_dir)
train_dataset = []
for file in dirs:
  file = source_dir +'/' + file

  dfx = pd.read_excel(file)
  print(dfx.head(2))
  dfy = pd.DataFrame()
  dfy['filename'] = dfx['filename']
  dfy['x0'] = dfx['xmin']
  dfy['y0'] = dfx['ymin']
  dfy['x2'] = dfx['xmax']
  dfy['y2'] = dfx['ymax']
  dfy['line'] = dfx['class']
  dfy['label'] = dfx['label']

  print(dfy.head(2))
  train_dataset.append(dfy)
train_dataset[1]


       filename   width  height                               class   xmin  \
0  0001 (2).jpg  2480.0  3509.0             TAX INVOICE CUM CHALLAN  849.0   
1  0001 (2).jpg  2480.0  3509.0  ALL Subject to Howrah Jurisdiction  856.0   

    ymin    xmax   ymax  label  
0  254.0  1709.0  333.0  Other  
1  347.0  1713.0  411.0  Other  
       filename     x0     y0      x2     y2  \
0  0001 (2).jpg  849.0  254.0  1709.0  333.0   
1  0001 (2).jpg  856.0  347.0  1713.0  411.0   

                                 line  label  
0             TAX INVOICE CUM CHALLAN  Other  
1  ALL Subject to Howrah Jurisdiction  Other  
   filename  width  height                                              class  \
0  0002.jpg   2480    3509  R.K. TRADERS NAYABAZ, G.U.P.COLONY,HOWRAH GST ...   
1  0002.jpg   2480    3509  Consignee I L ENGINEERING UNIT JHOREHAT HOWRAH...   

   xmin  ymin  xmax  ymax    label  
0   181   140   970   333  Address  
1   181   472   834   726    Other  
   filename   x0   y0   x

Unnamed: 0,filename,x0,y0,x2,y2,line,label
0,0002.jpg,181,140,970,333,"R.K. TRADERS NAYABAZ, G.U.P.COLONY,HOWRAH GST ...",Address
1,0002.jpg,181,472,834,726,Consignee I L ENGINEERING UNIT JHOREHAT HOWRAH...,Other
2,0002.jpg,1202,147,1456,247,Invoice No. 04/05/20-21,Other
3,0002.jpg,1202,258,1459,315,Delivery Note,Other
4,0002.jpg,1209,358,1477,454,Supplier's Ref 04/05/20-21,Other
5,0002.jpg,1206,461,1499,540,Buyer's Order No.,Other
6,0002.jpg,1220,558,1552,622,Despatch Document No.,Other
7,0002.jpg,1213,665,1534,729,Despatched through,Other
8,0002.jpg,1209,772,1506,868,Terms of Delivery,Other
9,0002.jpg,1713,151,1984,251,Dated 18-May-2020,Other


## Writting the transformed dataset
Now that we transformed our dataset into a format the model can understand for its training, we need save everything into files.

### Defining the writing function
We'll use the same function to write into the train and test files.

The normalization function is meant to normalize the bounding boxes points in a range [0,1000] using the width and height of the image of the receipt [\[source\]](https://huggingface.co/transformers/model_doc/layoutlm.html#overview).

In [61]:
def normalize(points: list, width: int, height: int) -> list:
  x0, y0, x2, y2 = [int(p) for p in points]
  
  x0 = int(1000 * (x0 / width))
  x2 = int(1000 * (x2 / width))
  y0 = int(1000 * (y0 / height))
  y2 = int(1000 * (y2 / height))

  return [x0, y0, x2, y2]


def write_dataset(dataset: list, output_dir: Path, name: str):
  print(f"Writing {name}ing dataset:")
  with open(output_dir / f"{name}.txt", "w+", encoding="utf8") as file, \
       open(output_dir / f"{name}_box.txt", "w+", encoding="utf8") as file_bbox, \
       open(output_dir / f"{name}_image.txt", "w+", encoding="utf8") as file_image:

      # Go through each dataset
      for datas in tqdm(dataset, total=len(dataset)):
        print(len(dataset))
        print(datas)
        data, width, height = datas, 2480, 3509
        
        filename = data.iloc[0, data.columns.get_loc('filename')]

        # Go through every row in dataset
        for index, row in data.iterrows():
          bbox = [int(p) for p in row[['x0', 'y0', 'x2', 'y2']]]
          normalized_bbox = normalize(bbox, width, height)

          file.write("{}\t{}\n".format(row['line'], row['label']))
          file_bbox.write("{}\t{} {} {} {}\n".format(row['line'], *normalized_bbox))
          file_image.write("{}\t{} {} {} {}\t{} {}\t{}\n".format(row['line'], *bbox, width, height, filename))

        # Write a second newline to separate dataset from others
        file.write("\n")
        file_bbox.write("\n")
        file_image.write("\n")

In [62]:
dataset_directory = Path('/content/drive/MyDrive/mulltiply/layout/work','dataset')

dataset_directory.mkdir(parents=True, exist_ok=True)

write_dataset(train_dataset, dataset_directory, 'train')
write_dataset(train_dataset, dataset_directory, 'test')

# Creating the 'labels.txt' file to the the model what categories to predict.
labels = ['Company', 'GSTIN', 'Address', 'Total', 'Other']
IOB_tags = ['S']
with open(dataset_directory / 'labels.txt', 'w') as f:
  for tag in IOB_tags:
    for label in labels:
      f.write(f"{tag}-{label}\n")
  # Writes in the last label O - meant for all non labeled words
  f.write("O")

Writing training dataset:


 30%|███       | 3/10 [00:00<00:00, 27.83it/s]

10
        filename      x0      y0      x2      y2  \
0   0001 (2).jpg   849.0   254.0  1709.0   333.0   
1   0001 (2).jpg   856.0   347.0  1713.0   411.0   
2   0001 (2).jpg   124.0    79.0  1041.0   211.0   
3   0001 (2).jpg   131.0   208.0   813.0   276.0   
4   0001 (2).jpg   134.0   276.0   831.0   351.0   
5   0001 (2).jpg  1541.0    76.0  2459.0   222.0   
6   0001 (2).jpg  1731.0   226.0  2459.0   293.0   
7   0001 (2).jpg  1752.0   293.0  2438.0   351.0   
8   0001 (2).jpg  1759.0   358.0  2449.0   418.0   
9   0001 (2).jpg   466.0   393.0  2124.0   522.0   
10  0001 (2).jpg   134.0   365.0   434.0   533.0   
11  0001 (2).jpg   134.0   540.0  2466.0   654.0   
12  0001 (2).jpg   138.0   647.0  2445.0   729.0   
13  0001 (2).jpg   138.0   743.0  1234.0   847.0   
14  0001 (2).jpg   134.0   843.0  1245.0   933.0   
15  0001 (2).jpg   131.0   926.0   663.0   993.0   
16  0001 (2).jpg   684.0   936.0  1249.0  1001.0   
17  0001 (2).jpg  1277.0   751.0  2391.0   815.0   
18  0001 

 80%|████████  | 8/10 [00:00<00:00, 37.98it/s]

10
    filename    x0    y0    x2    y2  \
0   img9.jpg   687    19   959    76   
1   img9.jpg   373    86  1278   183   
2   img9.jpg   544   174  1140   210   
3   img9.jpg   344   202  1297   267   
4   img9.jpg   470   257  1101   286   
5   img9.jpg   430   283   849   341   
6   img9.jpg   854   274  1232   336   
7   img9.jpg    87   333   820   374   
8   img9.jpg    78   383   823   438   
9   img9.jpg    82   436   818   498   
10  img9.jpg    78   507   794   548   
11  img9.jpg   830   336  1570   383   
12  img9.jpg   830   393  1556   431   
13  img9.jpg   830   448  1575   488   
14  img9.jpg   830   502  1566   538   
15  img9.jpg    82   555   818   838   
16  img9.jpg   832   543  1573   826   
17  img9.jpg    82   838   825  1543   
18  img9.jpg   832   829   961  1541   
19  img9.jpg   968   826  1130  1545   
20  img9.jpg  1137   838  1301  1567   
21  img9.jpg  1306   843  1544  1538   
22  img9.jpg    73  1555   970  1641   
23  img9.jpg    82  1645   544  1848 

100%|██████████| 10/10 [00:00<00:00, 35.41it/s]


10
    filename    x0    y0    x2    y2  \
0   0001.jpg   984    61  1413   168   
1   0001.jpg   191   183   941   515   
2   0001.jpg   195   529  1070   872   
3   0001.jpg  1213   193  1484   301   
4   0001.jpg  1213   301  1527   397   
5   0001.jpg  1213   408  1549   508   
6   0001.jpg  1209   522  1584   601   
7   0001.jpg  1209   618  1656   668   
8   0001.jpg  1202   722  1577   793   
9   0001.jpg  1209   829  1602   901   
10  0001.jpg  1727   211  2020   308   
11  0001.jpg  1731   311  2170   397   
12  0001.jpg  1720   426  2131   476   
13  0001.jpg  1734   526  1891   601   
14  0001.jpg  1720   626  2124   704   
15  0001.jpg  1716   743  2013   804   
16  0001.jpg   177  1158  1170  1718   
17  0001.jpg  1195  1183  1381  1447   
18  0001.jpg  1395  1186  1588  1476   
19  0001.jpg  1609  1186  1809  1465   
20  0001.jpg  1824  1193  1902  1454   
21  0001.jpg  1920  1190  2202  1683   
22  0001.jpg   220  1993  2199  2051   
23  0001.jpg   163  2043  2199  2158 

 30%|███       | 3/10 [00:00<00:00, 29.93it/s]

10
        filename      x0      y0      x2      y2  \
0   0001 (2).jpg   849.0   254.0  1709.0   333.0   
1   0001 (2).jpg   856.0   347.0  1713.0   411.0   
2   0001 (2).jpg   124.0    79.0  1041.0   211.0   
3   0001 (2).jpg   131.0   208.0   813.0   276.0   
4   0001 (2).jpg   134.0   276.0   831.0   351.0   
5   0001 (2).jpg  1541.0    76.0  2459.0   222.0   
6   0001 (2).jpg  1731.0   226.0  2459.0   293.0   
7   0001 (2).jpg  1752.0   293.0  2438.0   351.0   
8   0001 (2).jpg  1759.0   358.0  2449.0   418.0   
9   0001 (2).jpg   466.0   393.0  2124.0   522.0   
10  0001 (2).jpg   134.0   365.0   434.0   533.0   
11  0001 (2).jpg   134.0   540.0  2466.0   654.0   
12  0001 (2).jpg   138.0   647.0  2445.0   729.0   
13  0001 (2).jpg   138.0   743.0  1234.0   847.0   
14  0001 (2).jpg   134.0   843.0  1245.0   933.0   
15  0001 (2).jpg   131.0   926.0   663.0   993.0   
16  0001 (2).jpg   684.0   936.0  1249.0  1001.0   
17  0001 (2).jpg  1277.0   751.0  2391.0   815.0   
18  0001 

100%|██████████| 10/10 [00:00<00:00, 36.77it/s]

10
    filename    x0    y0    x2    y2  \
0   0001.jpg   984    61  1413   168   
1   0001.jpg   191   183   941   515   
2   0001.jpg   195   529  1070   872   
3   0001.jpg  1213   193  1484   301   
4   0001.jpg  1213   301  1527   397   
5   0001.jpg  1213   408  1549   508   
6   0001.jpg  1209   522  1584   601   
7   0001.jpg  1209   618  1656   668   
8   0001.jpg  1202   722  1577   793   
9   0001.jpg  1209   829  1602   901   
10  0001.jpg  1727   211  2020   308   
11  0001.jpg  1731   311  2170   397   
12  0001.jpg  1720   426  2131   476   
13  0001.jpg  1734   526  1891   601   
14  0001.jpg  1720   626  2124   704   
15  0001.jpg  1716   743  2013   804   
16  0001.jpg   177  1158  1170  1718   
17  0001.jpg  1195  1183  1381  1447   
18  0001.jpg  1395  1186  1588  1476   
19  0001.jpg  1609  1186  1809  1465   
20  0001.jpg  1824  1193  1902  1454   
21  0001.jpg  1920  1190  2202  1683   
22  0001.jpg   220  1993  2199  2051   
23  0001.jpg   163  2043  2199  2158 




# 2. Fine tune LayoutLM
We downloaded and transformed our dataset into a trainable and testable set, now we can start the fine-tuning of the model.

## Download the model
First we're going to clone the LayoutLM Github project which contains the script to fine tune our model.

In [None]:
%%bash
git clone https://github.com/microsoft/unilm.git
cd unilm/layoutlm/deprecated
pip install .

## Training

In [70]:
sroie_folder_path1 = Path('/content/drive/MyDrive/mulltiply/layout/SROIE2019')
pretrained_model_folder_input= sroie_folder_path1 / Path('layoutlm-base-uncased') # Define it so we can copy it into our working directory

pretrained_model_folder=Path('/content/drive/MyDrive/mulltiply/layout/SROIE2019/layoutlm-base-uncased') 
label_file=Path(dataset_directory, "labels.txt")

# Move to the script directory
os.chdir("/content/drive/MyDrive/mulltiply/layout/unilm/layoutlm/deprecated/examples/seq_labeling")

First I'm going to copy the pretrained base model into our working directory to change its configuration file. I'm just changing the number of attention heads from **16** to **12**, because that's the original size.

In [71]:
! cp -r "{pretrained_model_folder_input}" "{pretrained_model_folder}"
! sed -i 's/"num_attention_heads": 16,/"num_attention_heads": 12,/' "{pretrained_model_folder}/"config.json

cp: cannot copy a directory, '/content/drive/MyDrive/mulltiply/layout/SROIE2019/layoutlm-base-uncased', into itself, '/content/drive/MyDrive/mulltiply/layout/SROIE2019/layoutlm-base-uncased/layoutlm-base-uncased'


In [78]:
! cat "/content/drive/MyDrive/mulltiply/layout/SROIE2019/layoutlm-base-uncased/config.json"

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "max_2d_position_embeddings": 1024,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 6,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "use_bfloat16": false,
  "vocab_size": 30522
}

In [75]:
!cp -r /content/drive/MyDrive/mulltiply/layout/invoices/train /content/drive/MyDrive/mulltiply/layout/invoices/test

In [80]:
! python run_seq_labeling.py \
                            --data_dir /content/drive/MyDrive/mulltiply/layout/work/dataset \
                            --labels /content/drive/MyDrive/mulltiply/layout/working/dataset/labels.txt \
                            --model_name_or_path "{pretrained_model_folder}" \
                            --model_type layoutlm \
                            --max_seq_length 512 \
                            --do_lower_case \
                            --do_train \
                            --num_train_epochs 1 \
                            --logging_steps 50 \
                            --save_steps -1 \
                            --output_dir output \
                            --overwrite_output_dir \
                            --per_gpu_train_batch_size 1 \
                            --per_gpu_eval_batch_size 1

Traceback (most recent call last):
  File "run_seq_labeling.py", line 811, in <module>
    main()
  File "run_seq_labeling.py", line 701, in main
    args, tokenizer, labels, pad_token_label_id, mode="train"
  File "/usr/local/lib/python3.7/dist-packages/layoutlm/data/funsd.py", line 29, in __init__
    examples = read_examples_from_file(args.data_dir, mode)
  File "/usr/local/lib/python3.7/dist-packages/layoutlm/data/funsd.py", line 174, in read_examples_from_file
    assert len(splits) == 2
AssertionError


## Predicting

In [79]:
# Evaluate for test set and make predictions
! python run_seq_labeling.py \
                            --data_dir /content/drive/MyDrive/mulltiply/layout/working/dataset \
                            --labels /content/drive/MyDrive/mulltiply/layout/working/dataset/labels.txt \
                            --model_name_or_path "{pretrained_model_folder}" \
                            --model_type layoutlm \
                            --do_lower_case \
                            --max_seq_length 512 \
                            --do_predict \
                            --logging_steps 10 \
                            --save_steps -1 \
                            --output_dir output \
                            --per_gpu_eval_batch_size 8

Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/transformers/configuration_utils.py", line 249, in get_config_dict
    local_files_only=local_files_only,
  File "/usr/local/lib/python3.7/dist-packages/transformers/file_utils.py", line 274, in cached_path
    raise EnvironmentError("file {} not found".format(url_or_filename))
OSError: file output/config.json not found

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "run_seq_labeling.py", line 811, in <module>
    main()
  File "run_seq_labeling.py", line 769, in main
    model = model_class.from_pretrained(args.output_dir)
  File "/usr/local/lib/python3.7/dist-packages/transformers/modeling_utils.py", line 546, in from_pretrained
    **kwargs,
  File "/usr/local/lib/python3.7/dist-packages/transformers/configuration_utils.py", line 202, in from_pretrained
    config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

In [None]:
!cat output/test_results.txt

f1 = 0.9405450041288191
loss = 0.06454576554030857
precision = 0.9366776315789473
recall = 0.9444444444444444


# Results samples
The example shows two side by side images of the same receipt, where the colored boxes are the labeled lines. The left is the *original*, so the data we labeled and the right is the model's prediction.

In [None]:
import cv2
from matplotlib import pyplot, patches
import matplotlib

data = pd.read_csv("/working/dataset/test_image.txt", delimiter="\t", names=["name", "bbox", "size", "image"])
data_category = pd.read_csv("/working/dataset/test.txt", delimiter="\t", names=["name", "true_category"]).drop(columns=["name"])
data_prediction_category = pd.read_csv("output/test_predictions.txt", delimiter=" ", names=["name", "prediction_category"]).drop(columns=["name"])

data_merge = data.merge(data_category, left_index=True, right_index=True)
merged = data_merge.merge(data_prediction_category, left_index=True, right_index=True)
merged_groups = list(merged.groupby("image"))

In [None]:
def display_prediction(data, file):
  colors = {
      "S-TOTAL": (255,0,0),
      "S-DATE": (0,255,0),
      "S-ADDRESS": (0,0, 255),
      "S-COMPANY": (255,255,0),
      "O": (192,192,192)
  }


  imagename = data[0].split(".")[0] + ".jpg"
  print("Filename:",imagename)
  image_path = str(sroie_folder_path / 'test' / 'img' / imagename)

  img=cv2.imread(image_path)
  img_prediction=cv2.imread(image_path)

  data = data[1]
  for bbox, category, prediction_category in zip(data['bbox'], data['true_category'], data['prediction_category']):
    (x1, y1, x2, y2) = [int(coordinate) for coordinate in bbox.split()]

    img_prediction = cv2.rectangle(img_prediction, (x1, y1), (x2, y2), colors[prediction_category], 2 if "O" in prediction_category else 4)
    img = cv2.rectangle(img, (x1, y1), (x2, y2), colors[category], 2 if "O" in category else 4)

  matplotlib.rcParams['figure.figsize'] = 15 ,18

  cv2.imwrite("prediction.jpg", img_prediction)

  # Plot
  fig, ax = matplotlib.pyplot.subplots(1,2)
  ax[0].set_title("Original", fontsize= 30)
  ax[0].imshow(img);
  ax[1].set_title("Prediction", fontsize= 30)
  ax[1].imshow(img_prediction);

  # Legend
  handles = [
      patches.Patch(color='yellow', label='Company'),
      patches.Patch(color='blue', label='Address'),
      patches.Patch(color='green', label='Date'),
      patches.Patch(color='red', label='Total'),
      patches.Patch(color='gray', label='Other')
  ]

  fig.legend(handles=handles, prop={'size': 25}, loc='lower center')

This is an example when my preprocessing wasn't perfect, but the model still predicted the correct result. From this we can see that if my preprocessing was better, the model would have a better accuracy score.

In [None]:
display_prediction(merged_groups[0], 'test')

In [None]:
display_prediction(merged_groups[34], 'test')