In [None]:
import os
import glob
import json
import random
from pathlib import Path
from difflib import SequenceMatcher
import cv2
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from IPython.display import display
import matplotlib
from matplotlib import pyplot, patches
!pip install tqdm --upgrade



In [None]:
# Root folder of the SROIE2019 dataset; the example cells read the
# "test/box" and "test/entities" sub-folders beneath it.
# NOTE(review): the /content/drive prefix presumably is a Colab Drive mount - confirm.
sroie_folder_path = Path('/content/drive/MyDrive/SROIE2019')
# File name shared by the box file and the entities file used in the examples.
example_file = Path('X00016469670.txt')

In [None]:
def read_bbox_and_words(path: Path):
  """Parse one box-annotation file into a DataFrame.

  Every non-empty line is expected to look like
  ``x0,y0,x1,y1,x2,y2,x3,y3,text``: eight integer coordinates followed by
  the transcribed text, which may itself contain commas.

  Args:
    path: Location of the annotation ``.txt`` file (a ``pathlib.Path``).

  Returns:
    A DataFrame with one row per annotated line and the columns
    ``filename`` (``path.stem``), ``x0``, ``y0``, ``x2``, ``y2`` (integer
    coordinates taken from fields 0, 1, 4 and 5) and ``line`` (the text).

  Raises:
    ValueError: If a line has fewer than eight coordinate fields or a
      coordinate cannot be parsed as an integer.
  """
  columns = ['filename', 'x0', 'y0', 'x2', 'y2', 'line']
  rows = []

  with open(path, 'r', errors='ignore') as f:
    for line in f.read().splitlines():
      if len(line) == 0:
        continue

      # Split at most eight times so commas inside the text are preserved.
      fields = line.split(",", 8)
      if len(fields) < 8:
        raise ValueError(f"{path}: expected 8 coordinates in line {line!r}")

      coords = [int(value) for value in fields[:8]]
      text = fields[8] if len(fields) > 8 else ""

      # From the split line we keep (filename, two opposite corners, text).
      # x1/y1 and x3/y3 are not used downstream, so they are not stored.
      rows.append([path.stem, coords[0], coords[1], coords[4], coords[5], text])

  # No dtype argument: the frame mixes strings and integers, so forcing a
  # single integer dtype cannot succeed. pandas infers integer columns for
  # the coordinates on its own.
  return pd.DataFrame(rows, columns=columns)


# Example usage
bbox_file_path = sroie_folder_path / "test/box" / example_file
print("== File content ==")
!head -n 5 "{bbox_file_path}"

bbox = read_bbox_and_words(path=bbox_file_path)
print("\n== Dataframe ==")
bbox.head(5)

== File content ==
98,26,321,26,321,66,98,66,TAN CHAY YEE
138,95,279,95,279,120,138,120,*** COPY ***
80,119,329,119,329,140,80,140,OJC MARKETING SDN BHD
129,142,287,142,287,160,129,160,ROC NO: 538358-H
104,163,306,163,306,182,104,182,NO 2 & 4, JALAN BAYU 4,

== Dataframe ==


  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)


Unnamed: 0,filename,x0,y0,x2,y2,line
0,X00016469670,98,26,321,66,TAN CHAY YEE
1,X00016469670,138,95,279,120,*** COPY ***
2,X00016469670,80,119,329,140,OJC MARKETING SDN BHD
3,X00016469670,129,142,287,160,ROC NO: 538358-H
4,X00016469670,104,163,306,182,"NO 2 & 4, JALAN BAYU 4,"


In [None]:
def read_entities(path: Path):
  """Load an entities JSON file and return it as a one-row DataFrame.

  Args:
    path: Location of the JSON file to read.

  Returns:
    A DataFrame built from a single-element list holding the parsed JSON,
    so a JSON object yields one row with one column per key.
  """
  with open(path, 'r') as handle:
    record = json.loads(handle.read())

  # Wrapping the parsed object in a list produces exactly one row.
  return pd.DataFrame([record])


# Example usage
entities_file_path = sroie_folder_path /  "test/entities" / example_file
print("== File content ==")
!head "{entities_file_path}"

entities = read_entities(path=entities_file_path)
print("\n\n== Dataframe ==")
entities

== File content ==
{
    "company": "OJC MARKETING SDN BHD",
    "date": "15/01/2019",
    "address": "NO 2 & 4, JALAN BAYU 4, BANDAR SERI ALAM, B1750 MASAI, JOHOR",
    "total": "193.00"
}

== Dataframe ==


Unnamed: 0,company,date,address,total
0,OJC MARKETING SDN BHD,15/01/2019,"NO 2 & 4, JALAN BAYU 4, BANDAR SERI ALAM, B175...",193.0


In [None]:
# Assign a label to the line by checking the similarity
# of the line and all the entities
def assign_line_label(line: str, entities: pd.DataFrame):
    """Return the upper-cased entity column that ``line`` matches, or "O".

    The line and each entity value (first row of ``entities``) are stripped
    of commas and split on whitespace. A line word counts as a match when
    its ``SequenceMatcher`` ratio against any entity word exceeds 0.8.
    Columns are tried in frame order and the acceptance test is evaluated
    after every word, so the first column that satisfies it wins:

    * ADDRESS: at least half of the line's words have matched so far;
    * any other column: every word of the line has matched;
    * any column: the match count equals the number of entity words.

    Args:
        line: Text of one receipt line.
        entities: Frame whose first row holds the entity values.

    Returns:
        The matching column name in upper case, or "O" when nothing matches.
    """
    line_set = line.replace(",", "").strip().split()
    for i, column in enumerate(entities):
        raw_value = entities.iloc[0, i]

        # Missing values (None / NaN) carry no text to compare against.
        if raw_value is None or (isinstance(raw_value, float) and raw_value != raw_value):
            continue
        if not isinstance(raw_value, str):
            raw_value = str(raw_value)

        entity_set = raw_value.replace(",", "").strip().split()

        # An empty entity must never match: without this guard the
        # "matches_count == len(entity_set)" test below is true at 0 == 0
        # and every line would receive this column's label.
        if not entity_set:
            continue

        entity_label = column.upper()
        matches_count = 0
        for l in line_set:
            if any(SequenceMatcher(a=l, b=b).ratio() > 0.8 for b in entity_set):
                matches_count += 1

            # Checked after each word, so a label can be returned before
            # the whole line has been scanned.
            if (entity_label == 'ADDRESS' and (matches_count / len(line_set)) >= 0.5) or \
               (entity_label != 'ADDRESS' and (matches_count == len(line_set))) or \
               matches_count == len(entity_set):
                return entity_label

    return "O"


# Example usage: label the row with index 1 of the example frame against
# the example entities loaded earlier in the notebook.
line = bbox.loc[1,"line"]
label = assign_line_label(line, entities)
print("Line:", line)
print("Assigned label:", label)

Line: *** COPY ***
Assigned label: O


In [None]:
def assign_labels(words: pd.DataFrame, entities: pd.DataFrame):
    """Label every row of ``words`` and store the result in a "label" column.

    Each line is first labeled with ``assign_line_label``. Two corrections
    are then applied:

    * an ADDRESS line seen once a TOTAL has been encountered, or a COMPANY
      line seen once a DATE or TOTAL has been encountered, becomes "O";
    * among all TOTAL (and, separately, DATE) candidates only the one with
      the largest box measure keeps its label; the others become "O".

    Args:
        words: Frame with a "line" column and the four columns starting at
            "x0" holding x0, y0, x2, y2. It is modified in place.
        entities: Entity frame forwarded to ``assign_line_label``.

    Returns:
        The same ``words`` object, now carrying the "label" column.
    """
    # label -> (largest measure so far, row position); -1 means "no candidate".
    max_area = {"TOTAL": (0, -1), "DATE": (0, -1)}
    already_labeled = {"TOTAL": False,
                       "DATE": False,
                       "ADDRESS": False,
                       "COMPANY": False,
                       "O": False
    }

    # Go through every line in $words and assign it a label
    labels = []
    for i, line in enumerate(words['line']):
        label = assign_line_label(line, entities)

        # Recorded before the demotion below, using the raw label.
        already_labeled[label] = True
        if (label == "ADDRESS" and already_labeled["TOTAL"]) or \
           (label == "COMPANY" and (already_labeled["DATE"] or already_labeled["TOTAL"])):
            label = "O"

        # Assign to the largest bounding box
        if label in ["TOTAL", "DATE"]:
            x0_loc = words.columns.get_loc("x0")
            box = words.iloc[i, x0_loc:x0_loc+4].to_list()
            # The measure is width + height, not the product of the sides.
            area = (box[2] - box[0]) + (box[3] - box[1])

            if max_area[label][0] < area:
                max_area[label] = (area, i)

            # Provisionally "O"; the winner is restored after the loop.
            label = "O"

        labels.append(label)

    # Restore the winners. The -1 sentinel must be skipped: used as an index
    # it would silently relabel the LAST line (or fail on an empty frame).
    # DATE is written first so TOTAL wins if both point at the same row.
    for entity_label in ("DATE", "TOTAL"):
        winner = max_area[entity_label][1]
        if winner >= 0:
            labels[winner] = entity_label

    words["label"] = labels
    return words


# Example usage
# assign_labels writes the "label" column into `bbox` itself and returns
# that same frame, so `bbox_labeled` and `bbox` refer to one object.
bbox_labeled = assign_labels(bbox, entities)
bbox_labeled.head(15)

Unnamed: 0,filename,x0,y0,x2,y2,line,label
0,X00016469670,98,26,321,66,TAN CHAY YEE,O
1,X00016469670,138,95,279,120,*** COPY ***,O
2,X00016469670,80,119,329,140,OJC MARKETING SDN BHD,COMPANY
3,X00016469670,129,142,287,160,ROC NO: 538358-H,O
4,X00016469670,104,163,306,182,"NO 2 & 4, JALAN BAYU 4,",ADDRESS
5,X00016469670,123,185,286,205,"BANDAR SERI ALAM,",ADDRESS
6,X00016469670,116,205,292,223,"81750 MASAI, JOHOR",ADDRESS
7,X00016469670,69,226,339,248,TEL:07-388 2218 FAX:07-388 8218,O
8,X00016469670,110,249,300,272,EMAIL:NG@OJCGROUP.COM,O
9,X00016469670,145,291,266,313,TAX INVOICE,O


Transforming the data into a JSON file for Label Studio

In [None]:
import pandas as pd

# Display the labeled frame returned by assign_labels in the example above
# (nothing new is created in this cell).
bbox_labeled
# Disabled sketch: loop through the rows using iterrows()
#for index, row in bbox_labeled.iterrows():
 #   print(row['filename'], row['x0'])

Unnamed: 0,filename,x0,y0,x2,y2,line,label
0,X00016469670,98,26,321,66,TAN CHAY YEE,O
1,X00016469670,138,95,279,120,*** COPY ***,O
2,X00016469670,80,119,329,140,OJC MARKETING SDN BHD,COMPANY
3,X00016469670,129,142,287,160,ROC NO: 538358-H,O
4,X00016469670,104,163,306,182,"NO 2 & 4, JALAN BAYU 4,",ADDRESS
5,X00016469670,123,185,286,205,"BANDAR SERI ALAM,",ADDRESS
6,X00016469670,116,205,292,223,"81750 MASAI, JOHOR",ADDRESS
7,X00016469670,69,226,339,248,TEL:07-388 2218 FAX:07-388 8218,O
8,X00016469670,110,249,300,272,EMAIL:NG@OJCGROUP.COM,O
9,X00016469670,145,291,266,313,TAX INVOICE,O


In [None]:
import os
import PIL
# Module-level buffer that row_to_json appends to by default.
predictions = []


def _make_region(region_id, ogwidth, ogheight, geometry, from_name, region_type, extra=None):
  """Build one result entry for the box described by ``geometry``."""
  # Copy so every entry owns its "value" dict, as with separate literals.
  value = dict(geometry)
  if extra:
    value.update(extra)
  return {
    "original_width": ogwidth,
    "original_height": ogheight,
    "image_rotation": 0,
    "value": value,
    "id": region_id,
    "from_name": from_name,
    "to_name": "image",
    "type": region_type,
  }


def row_to_json(row, ogwidth, ogheight, i, results=None):
  """Convert one labeled row into three linked result entries.

  The box is expressed in percent of the image size. Three entries sharing
  the id ``"<filename>-<i>"`` are produced: a "rectangle" (from "bbox"), a
  "labels" entry (from "label") with the fixed label "Text", and a
  "textarea" entry (from "transcription") carrying the row's text. The
  row's own "label" value is not consulted.

  Args:
    row: Mapping/Series with "filename", "x0", "y0", "x2", "y2" and "line".
    ogwidth: Image width in pixels.
    ogheight: Image height in pixels.
    i: Value appended to the filename to form the entry id.
    results: Optional list to append to; when omitted the module-level
      ``predictions`` list is used.

  Returns:
    The list with the three entries that were appended.
  """
  width = row["x2"] - row["x0"]
  height = row["y2"] - row["y0"]
  geometry = {
    "x": (row["x0"]*100)/ogwidth,
    "y": (row["y0"]*100)/ogheight,
    "width": (width*100)/ogwidth,
    "height": (height*100)/ogheight,
    "rotation": 0,
  }
  region_id = row["filename"] + "-" + str(i)

  regions = [
    _make_region(region_id, ogwidth, ogheight, geometry, "bbox", "rectangle"),
    _make_region(region_id, ogwidth, ogheight, geometry, "label", "labels",
                 {"labels": ["Text"]}),
    _make_region(region_id, ogwidth, ogheight, geometry, "transcription", "textarea",
                 {"text": [row["line"]]}),
  ]

  target = predictions if results is None else results
  target.extend(regions)
  return regions



class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy integers, floats and arrays."""

    # (NumPy type, converter to a JSON-serialisable Python value)
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
    )

    def default(self, obj):
        """Convert supported NumPy objects; defer everything else to the base class."""
        for numpy_type, convert in self._CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)


def read_dataframe_json(bbox_labeled,folder):
  """Write one Label Studio task JSON for the receipt in ``bbox_labeled``.

  The image belonging to the first row's "filename" is opened to obtain its
  pixel size, every row is converted with ``row_to_json`` (which fills the
  module-level ``predictions`` list), and the resulting task is written to
  ``/content/results/<filename>.json``.

  Args:
    bbox_labeled: Labeled frame for a single receipt; must have at least
      one row (the first row supplies the file name).
    folder: Dataset split folder, passed to
      ``get_everything_after_sroie2019`` to build the image locations.

  Raises:
    IndexError: If ``bbox_labeled`` has no rows.
  """
  last_location =  get_everything_after_sroie2019(folder)
  folder_path = "/content/drive/MyDrive/SROIE2"+ last_location + "/" + 'img/'
  folder_image = "/data/local-files?d=SROIE2" + last_location + "/" + 'img/'
  fileinfolder=bbox_labeled.iloc[0]
  filename = fileinfolder["filename"]

  # Only the size is needed; the context manager releases the file handle
  # instead of leaving one open image per processed receipt.
  with PIL.Image.open(folder_path + filename + '.jpg') as image:
    ogwidth, ogheight = image.size # get the width and height in pixels

  try:
    for index, row in bbox_labeled.iterrows():
      row_to_json(row, ogwidth, ogheight , index)

    final_dict = { "data": {
            "ocr": folder_image + filename + ".jpg"
        },
                  "predictions": [ { "model_version": 1, "result": predictions , "score": 0.89 } ] }
    # Serialise while the shared buffer still holds this receipt's entries.
    json_object = json.dumps(final_dict,cls=NpEncoder)
  finally:
    # Always empty the shared buffer, even on failure, so entries of this
    # receipt can never leak into the next one.
    predictions.clear()

  output_dir = "/content/results/"
  # A fresh runtime has no results folder yet.
  os.makedirs(output_dir, exist_ok=True)
  output_file = output_dir + filename +".json"
  with open(output_file, "w") as outfile:
      outfile.write(json_object)

In [None]:
from time import perf_counter
def get_everything_after_sroie2019(folder_image):
  """Return the part of the path that follows the "SROIE2" prefix of "SROIE2019".

  ``read_dataframe_json`` prepends the literal "SROIE2" to this value, so for
  ``.../SROIE2019/train`` the result is "019/train". Slicing a fixed number
  of trailing characters only yields that for a five-letter split name; the
  "SROIE2019" marker is located explicitly so any sub-path works.

  Args:
    folder_image: Path (or string) containing a "SROIE2019" component.

  Returns:
    The text after the "SROIE2" characters of the last "SROIE2019" occurrence.

  Raises:
    ValueError: If "SROIE2019" does not occur in the path.
  """
  marker = "SROIE2019"
  prefix_added_by_caller = "SROIE2"
  folder_image_str = str(folder_image)
  start = folder_image_str.rfind(marker)
  if start == -1:
    raise ValueError(f"'{marker}' not found in path: {folder_image_str}")
  return folder_image_str[start + len(prefix_added_by_caller):]


In [None]:


def dataset_creator(folder: Path):
    """Create one Label Studio JSON file per receipt found under ``folder``.

    For every ``*.txt`` file in ``folder / "box"`` the entities file with
    the same stem in ``folder / "entities"`` is loaded, the lines are
    labeled, and the result is handed to ``read_dataframe_json``.

    Files are paired by stem rather than by position in two sorted
    listings, so a file missing from either folder cannot shift every later
    pair onto the wrong receipt. Box files without a matching entities file
    are skipped and reported once at the end.

    Args:
        folder: Dataset split folder containing "box" and "entities".
    """
    bbox_folder = folder / "box"
    entities_folder = folder / "entities"

    entities_by_stem = {path.stem: path for path in entities_folder.glob("*.txt")}
    bbox_files = sorted(bbox_folder.glob("*.txt"))

    skipped = []
    for bbox_file in tqdm(bbox_files, total=len(bbox_files)):
        entities_file = entities_by_stem.get(bbox_file.stem)
        if entities_file is None:
            skipped.append(bbox_file.name)
            continue

        # Read the files
        bbox = read_bbox_and_words(bbox_file)
        entities = read_entities(entities_file)
        # Assign labels to lines in bbox using entities
        bbox_labeled = assign_labels(bbox, entities)
        read_dataframe_json(bbox_labeled,folder)

    if skipped:
        print(f"Skipped {len(skipped)} box file(s) without a matching entities file: {skipped}")

In [None]:
# Dataset root (same location as `sroie_folder_path` defined near the top).
SROIE_path = Path('/content/drive/MyDrive/SROIE2019')
# Convert every receipt of the "train" split; one JSON file per receipt is
# written by read_dataframe_json.
dataset_creator(Path(SROIE_path / 'train' ))

  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  dataframe = pd.DataFrame(bbox_and_words_list, columns=['filename', 'x0', 'y0', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'line'], dtype=np.int16)
  data

set LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED=true
set LABEL_STUDIO_LOCAL_FILES_DOCUMENT_ROOT=C:\Users\Yeyian PC\Documents\SROIEDATASET

==============================================================================================================================================================================================================================================

In [None]:
!zip -r '/content/results' '/content/results'

  adding: content/results/ (stored 0%)
  adding: content/results/X51006557164.json (deflated 94%)
  adding: content/results/X51008142062.json (deflated 94%)
  adding: content/results/X51006619566.json (deflated 91%)
  adding: content/results/X51006557193.json (deflated 93%)
  adding: content/results/X51006828217.json (deflated 93%)
  adding: content/results/X51008114281.json (deflated 92%)
  adding: content/results/X51005711442.json (deflated 93%)
  adding: content/results/X51006913031.json (deflated 91%)
  adding: content/results/X51006414394.json (deflated 93%)
  adding: content/results/X51007339158.json (deflated 92%)
  adding: content/results/X51006556815.json (deflated 94%)
  adding: content/results/X51006619697.json (deflated 93%)
  adding: content/results/X51005715456.json (deflated 93%)
  adding: content/results/X51006414638.json (deflated 93%)
  adding: content/results/X51008164991.json (deflated 93%)
  adding: content/results/X51006913070.json (deflated 92%)
  adding: content