<div style="width: 100%; clear: both;">
<div style="float: left; width: 50%;">
<img src="http://www.uoc.edu/portal/_resources/common/imatges/marca_UOC/UOC_Masterbrand.jpg" align="left">
</div>
<div style="float: right; width: 50%;">
<p style="margin: 0; padding-top: 22px; text-align:right;">M2.981 · TFM Àrea 4 · NLP & Text Mining</p>
<p style="margin: 0; text-align:right;">2022 · Màster universitari en Ciència de dades (Data science)</p>
<p style="margin: 0; text-align:right; padding-bottom: 100px;">Estudis d'Informàtica, Multimèdia i Telecomunicació</p>
</div>
</div>
<div style="width:100%;">&nbsp;</div>

<div class="row" style="padding-top: 50px;">
    <div class="row" style="background: #494949;padding: 10px 20px; color: #FFF">
        <div class="col-md-12">
            <div style="text-align:left;"><b>Estudiant:</b> Albert Cámara Viñals</div>
        </div>
    </div>
</div>


# RVL-CDIP Invoice dataset preparation for Label Studio (tesseract OCR)

Reference — Label Studio blog post on improving OCR quality with Tesseract and Label Studio: https://labelstud.io/blog/Improve-OCR-quality-with-Tesseract-and-Label-Studio.html

In [1]:
# Load libraries
import cv2
import glob
import json
import pandas as pd
import pytesseract
import numpy as np
import os
import re
import seaborn as sns
import shutil
from matplotlib import pyplot as plt
from pathlib import Path
from PIL import Image
from sklearn.model_selection import train_test_split
from xml.etree import ElementTree as ET
from uuid import uuid4

# Tell pytesseract which Tesseract executable to run.
# NOTE(review): this is a hard-coded Windows install path; it has to be adjusted
# on any machine where Tesseract is installed elsewhere.
pytesseract.pytesseract.tesseract_cmd =r'C:/Program Files/Tesseract-OCR/tesseract.exe'
# Apply matplotlib's "ggplot" style sheet to figures drawn after this point.
plt.style.use("ggplot")

%matplotlib inline

In [2]:
# Dictionary of entities

# Maps each short entity code to a dict with two entries:
#   'description' - human-readable meaning of the entity
#   'tag'         - upper-case label name associated with the entity
# NOTE(review): presumably the codes come from the source annotation files and
# the tags are the label names used for annotation - confirm against the caller.
label_dic = { 
    'BA': {'description':  'Billing Address', 'tag': 'BILLING_ADDRESS'},
    'BCN': {'description':  'Billing Contact Number', 'tag': 'BILLING_CONTACT_NUMBER'},
    'BN': {'description':  'Billing Name', 'tag': 'BILLING_NAME'},
    'CNUM': {'description': 'Client Number', 'tag': 'CLIENT_NUMBER'},
    'EN': {'description':  'E-commerce Platform Name (Like Amazon, Ebay,..)', 'tag': 'ECOMMERCE_PLATFORM_NAME'},  
    'EOID': {'description':  'E-commerce Order Reference', 'tag': 'ECOMMERCE_ORDER_REFERENCE'},
    'EWEB': {'description':  'E-commerce Platform Website', 'tag': 'ECOMMERCE_PLATFORM_WEBSITE'},
    'IDATE': {'description':  'Invoice Date', 'tag': 'INVOICE_DATE'},
    'img': {'description':  'Image', 'tag': 'IMAGE'},
    'IN': {'description': 'Invoice Number', 'tag': 'INVOICE_NUMBER'},
    'ONUM': {'description':  'Order Number', 'tag': 'ORDER_NUMBER'},
    'PD': {'description':  'Product Description', 'tag': 'PRODUCT_DESCRIPTION'},
    'PMODE': {'description':  'Payment Mode', 'tag': 'PAYMENT_MODE'},
    'PT': {'description': 'Product Total', 'tag': 'PRODUCT_TOTAL'},
    'PTTX': {'description': 'Product Total Price with TAX', 'tag': 'PRODUCT_TOTAL_PRICE_WITH_TAX'},
    'PTWTX': {'description': 'Product Total Price without TAX', 'tag': 'PRODUCT_TOTAL_PRICE_WITHOUT_TAX'},
    'QTY': {'description':  'Quantity', 'tag': 'QUANTITY'},
    # Seller-related codes: descriptions say "Seller", tags use the SUPPLIER_ prefix
    'SA': {'description':  'Seller Address', 'tag': 'SUPPLIER_ADDRESS'},
    'SCID': {'description':  'Seller Identifier Number (Siren)', 'tag': 'SUPPLIER_ID'},
    'SCN': {'description':  'Seller Contact Number', 'tag': 'SUPPLIER_CONTACT_NUMBER'}, 
    'SEMAIL': {'description':  'Seller Email', 'tag': 'SUPPLIER_EMAIL'},
    'SFAX': {'description':  'Seller Fax Number', 'tag': 'SUPPLIER_FAX_NUMBER'},
    'SHA': {'description':  'Shipping Address', 'tag': 'SHIPPING_ADDRESS'},
    'SHCN': {'description':  'Shipping Contact Number', 'tag': 'SHIPPING_CONTACT_NUMBER'},
    'SHN': {'description':  'Shipping Name', 'tag': 'SHIPPING_NAME'},
    # NOTE(review): description has an unbalanced ')' - probably meant '(Seller/Company) name'
    'SN': {'description':  'Seller/Company) name', 'tag': 'SUPPLIER_NAME'},
    # NOTE(review): description only repeats the code; its meaning is not stated here
    'SNO': {'description': 'SNO', 'tag': 'SNO'},
    'SRCS': {'description':  'Seller RCS', 'tag': 'SUPPLIER_RCS'},
    'SSIRET': {'description':  'Seller Siret', 'tag': 'SUPPLIER_SIRET'},
    'STOA': {'description':  'Seller TOA/APE', 'tag': 'SUPPLIER_TOA'},
    'SVAT': {'description':  'Seller Vat Number/TVA', 'tag': 'SUPPLIER_VAT'}, 
    'SWEB': {'description':  'Seller Website', 'tag': 'SUPPLIER_WEBSITE'},
    'TA': {'description': 'Total with TAX', 'tag': 'TOTAL_AMOUNT'},
    'TBL': {'description':  'Table (and its content)', 'tag': 'TABLE'},
    'TPDATE': {'description':  'Tax Point Date (date of supply)', 'tag': 'TAX_POINT_DATE'}, 
    'TTX': {'description': 'Total Tax', 'tag': 'TOTAL_TAX'},
    # NOTE(review): 'TWTXS' looks like a placeholder; the tag suggests 'Total without Tax' - confirm
    'TWTX': {'description':  'TWTXS', 'tag': 'TOTAL_WITHOUT_TAX'},
    'TXR': {'description': 'Tax Rate', 'tag': 'TAX_RATE'},
    'UP': {'description': 'Unit Price Without Tax', 'tag': 'UNIT_PRICE'},
    # Fallback entry for anything without a dedicated code
    'undefined': {'description':  'OTHER', 'tag': 'OTHER'},
}

In [3]:
# Function to mask text
def mask_content(text):
    """Return ``text`` with its characters replaced by character-class markers.

    The substitutions run in a fixed order: Latin letters (ASCII plus the
    accented ranges listed in the first pattern) become ``A``, digits become
    ``N`` and every remaining non-word character becomes ``S``. Characters that
    match none of the patterns (for example ``_``) are left unchanged.

    :param text: string to mask
    :return: the masked string, same length as ``text``
    """
    replacement_rules = (
        (r'[A-Za-zÀ-ÖØ-öø-ÿ]', 'A'),
        (r'\d', 'N'),
        (r'\W', 'S'),
    )
    masked = text
    for pattern, marker in replacement_rules:
        masked = re.sub(pattern, marker, masked)
    return masked

# Function to draw bounding boxes in images
def draw_bouding_boxes(image, data, color):
    """Load an image from disk and draw one rectangle per row of ``data``.

    :param image: path of the image file, read with ``cv2.imread``
    :param data: table-like object exposing ``iterrows()``; every row must
        provide the coordinates ``x1``/``y1`` of one corner and ``x3``/``y3``
        of the opposite corner of a box
    :param color: colour passed to ``cv2.rectangle`` for every box
    :return: the loaded image with all rectangles drawn on it
    :raises FileNotFoundError: if ``cv2.imread`` cannot load ``image``
    """
    path = image
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals a missing or unreadable file by returning None
        # instead of raising, so fail here with a message that names the file
        raise FileNotFoundError(f'Could not read image: {path}')

    for _, row in data.iterrows():
        # Rows produced by iterrows() may carry float coordinates (dtypes are
        # not preserved per row), while cv2.rectangle needs integer pixels
        first_corner = (int(round(row["x1"])), int(round(row["y1"])))
        opposite_corner = (int(round(row["x3"])), int(round(row["y3"])))
        cv2.rectangle(image, first_corner, opposite_corner, color)

    return image

# Function to preview image
def show_image(image):
    """Show a resized preview of ``image`` in an OpenCV window until a key is pressed.

    The image is resized to a fixed 595x842 preview and shown in a single
    resizable window named "PREVIEW"; the window is closed again once a key
    has been pressed (or if displaying fails).

    :param image: image array accepted by ``cv2.resize`` and ``cv2.imshow``
    :return: None
    """
    window_name = "PREVIEW"
    cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)     # Create window with freedom of dimensions
    try:
        image_resized = cv2.resize(image, (595, 842))   # Resize image to the preview size
        cv2.imshow(window_name, image_resized)          # Show image
        cv2.waitKey(0)                                  # Block until any keypress
    finally:
        # OpenCV does not close the window by itself when waitKey returns
        cv2.destroyWindow(window_name)

In [4]:
def create_image_url(filepath):
    """
    Map an image path on the filesystem to the URL Label Studio should load.

    Label Studio requires image URLs. Only the file name of ``filepath`` is
    kept and appended to the ``/data/local-files/?d=images/`` prefix.
    Alternatives: when serving files with ./serve_local_files.sh <my-images-dir>
    the image URLs are localhost:8081/filename.png, and uploaded files can be
    referenced with links like /data/upload/filename.png.

    :param filepath: path (or bare name) of the image file
    :return: URL string pointing at the image
    """
    _, image_name = os.path.split(filepath)
    return '/data/local-files/?d=images/{}'.format(image_name)

In [5]:
def create_label_studio_annotations(filename, image, tesseract_output):
    """
    Build one Label Studio task with OCR pre-annotations for a single image.

    Every word-level Tesseract box yields two result entries sharing one region
    id: a ``rectangle`` with the box and a ``textarea`` with the recognised
    text and its confidence. Box coordinates are expressed as percentages of
    the image width/height.

    :param filename: image file name, turned into the task URL by ``create_image_url``
    :param image: OpenCV image object (only its ``shape`` is read)
    :param tesseract_output: the output from tesseract as a dict of parallel
        lists under the keys ``text``, ``conf``, ``left``, ``top``, ``width``
        and ``height``
    :return: task dict (``data`` + ``predictions``) ready to be imported into
        Label Studio with the "Optical Character Recognition" template; the
        prediction score is the mean word confidence, or 0 without any word
    """
    image_height, image_width = image.shape[:2]

    results = []
    all_scores = []

    for i, word in enumerate(tesseract_output['text']):
        # A confidence of -1 means the box contains a block of text rather than
        # a single word. pytesseract may deliver it as the string '-1' or as a
        # number, so compare numerically instead of against the string.
        score = float(tesseract_output['conf'][i])
        if score < 0:
            continue

        bbox = {
            'x': 100 * tesseract_output['left'][i] / image_width,
            'y': 100 * tesseract_output['top'][i] / image_height,
            'width': 100 * tesseract_output['width'][i] / image_width,
            'height': 100 * tesseract_output['height'][i] / image_height,
            'rotation': 0
        }

        # Both entries carry the same id so they refer to the same region
        region_id = str(uuid4())[:10]

        bbox_result = {
            'id': region_id,
            'from_name': 'bbox',
            'to_name': 'image',
            'type': 'rectangle',
            'value': bbox
        }

        transcription_result = {
            'id': region_id,
            'from_name': 'transcription',
            'to_name': 'image',
            'type': 'textarea',
            'value': dict(text=[word], **bbox),
            'score': score
        }

        results.extend([bbox_result, transcription_result])
        all_scores.append(score)

    return {
        'data': {
            'ocr': create_image_url(filename)
        },
        'predictions': [{
            'result': results,
            'score': sum(all_scores) / len(all_scores) if all_scores else 0
        }]
    }

In [6]:
# Folder with the source TIFF scans, and folder receiving the PNG copies
# together with the Label Studio task file
data_dir = 'data/original/images/'
data_transformed_dir = 'data/transformed/'

# cv2.imwrite does not create missing folders (it just reports failure),
# so make sure the destination exists before converting anything
os.makedirs(data_transformed_dir, exist_ok=True)

# Sorted so that tasks are generated in a reproducible order
image_files = sorted(glob.glob(data_dir + "*.tif"))
num_images = len(image_files)

print('Number of images: ', num_images)

tasks = []
# Convert every invoice scan to PNG, OCR it and collect its pre-annotated task
for idx, image_file in enumerate(image_files):
    print('Processing ' + str(idx + 1) +'/' + str(num_images))

    # PNG file name derived from the TIFF file name
    base = os.path.basename(image_file)
    filename = os.path.splitext(base)[0] + '.png'
    png_path = data_transformed_dir + filename

    image = cv2.imread(image_file)
    if image is None:
        # cv2.imread returns None instead of raising when a file cannot be decoded
        raise OSError('Could not read image: ' + image_file)

    if not cv2.imwrite(png_path, image):
        raise OSError('Could not write image: ' + png_path)

    # Re-read the PNG so OCR runs on exactly the file that was written
    image = cv2.imread(png_path)

    # OCR Image with Tesseract
    tesseract_output = pytesseract.image_to_data(image, output_type = pytesseract.Output.DICT)

    # Create Label studio pre-annotations from OCR
    task = create_label_studio_annotations(filename, image, tesseract_output)
    tasks.append(task)

# create a file to import into Label Studio
with open(data_transformed_dir + 'preannotated_ocr_tesseract_tasks.json', mode='w') as f:
    json.dump(tasks, f, indent=2)

Number of images:  519
Processing 1/519
Processing 2/519
Processing 3/519
Processing 4/519
Processing 5/519
Processing 6/519
Processing 7/519
Processing 8/519
Processing 9/519
Processing 10/519
Processing 11/519
Processing 12/519
Processing 13/519
Processing 14/519
Processing 15/519
Processing 16/519
Processing 17/519
Processing 18/519
Processing 19/519
Processing 20/519
Processing 21/519
Processing 22/519
Processing 23/519
Processing 24/519
Processing 25/519
Processing 26/519
Processing 27/519
Processing 28/519
Processing 29/519
Processing 30/519
Processing 31/519
Processing 32/519
Processing 33/519
Processing 34/519
Processing 35/519
Processing 36/519
Processing 37/519
Processing 38/519
Processing 39/519
Processing 40/519
Processing 41/519
Processing 42/519
Processing 43/519
Processing 44/519
Processing 45/519
Processing 46/519
Processing 47/519
Processing 48/519
Processing 49/519
Processing 50/519
Processing 51/519
Processing 52/519
Processing 53/519
Processing 54/519
Processing 55/

Processing 437/519
Processing 438/519
Processing 439/519
Processing 440/519
Processing 441/519
Processing 442/519
Processing 443/519
Processing 444/519
Processing 445/519
Processing 446/519
Processing 447/519
Processing 448/519
Processing 449/519
Processing 450/519
Processing 451/519
Processing 452/519
Processing 453/519
Processing 454/519
Processing 455/519
Processing 456/519
Processing 457/519
Processing 458/519
Processing 459/519
Processing 460/519
Processing 461/519
Processing 462/519
Processing 463/519
Processing 464/519
Processing 465/519
Processing 466/519
Processing 467/519
Processing 468/519
Processing 469/519
Processing 470/519
Processing 471/519
Processing 472/519
Processing 473/519
Processing 474/519
Processing 475/519
Processing 476/519
Processing 477/519
Processing 478/519
Processing 479/519
Processing 480/519
Processing 481/519
Processing 482/519
Processing 483/519
Processing 484/519
Processing 485/519
Processing 486/519
Processing 487/519
Processing 488/519
Processing 4