<div style="width: 100%; clear: both;">
<div style="float: left; width: 50%;">
<img src="http://www.uoc.edu/portal/_resources/common/imatges/marca_UOC/UOC_Masterbrand.jpg" align="left">
</div>
<div style="float: right; width: 50%;">
<p style="margin: 0; padding-top: 22px; text-align:right;">M2.981 · TFM Àrea 4 · NLP & Text Mining</p>
<p style="margin: 0; text-align:right;">2022 · Màster universitari en Ciència de dades (Data science)</p>
<p style="margin: 0; text-align:right; padding-bottom: 100px;">Estudis d'Informàtica, Multimèdia i Telecomunicació</p>
</div>
</div>
<div style="width:100%;">&nbsp;</div>

<div class="row" style="padding-top: 50px;">
    <div class="row" style="background: #494949;padding: 10px 20px; color: #FFF">
        <div class="col-md-12">
            <div style="text-align:left;"><b>Estudiant:</b> Albert Cámara Viñals</div>
        </div>
    </div>
</div>


# RVL-CDIP Invoice dataset preparation for Label Studio (OCR Abbyy)

In [1]:
# Load libraries
import cv2
import glob
import json
import pandas as pd
import pytesseract
import numpy as np
import os
import re
import seaborn as sns
import shutil
from matplotlib import pyplot as plt
from pathlib import Path
from PIL import Image
from sklearn.model_selection import train_test_split
from xml.etree import ElementTree as ET
from uuid import uuid4

# Tell pytesseract where the Tesseract executable is located.
# NOTE(review): this is an absolute, machine-specific Windows path; it has to
# be adapted before running the notebook on any other machine or OS.
pytesseract.pytesseract.tesseract_cmd =r'C:/Program Files/Tesseract-OCR/tesseract.exe'
# Apply matplotlib's "ggplot" style sheet to the figures created afterwards.
plt.style.use("ggplot")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Function to mask text
def mask_content(text):
    """
    Replace every character of `text` by a placeholder for its character class.

    The substitutions are applied one after another, in this order:
      1. ASCII and Latin-1 accented letters -> 'A'
      2. digits                             -> 'N'
      3. any remaining non-word character   -> 'S'
    Characters matched by none of the patterns (for example '_') are kept.

    Returns the masked string, which has the same length as `text`.
    """
    substitutions = (
        (r'[A-Za-zÀ-ÖØ-öø-ÿ]', r'A'),
        (r'\d', r'N'),
        (r'\W', r'S'),
    )

    masked = text
    for pattern, placeholder in substitutions:
        masked = re.sub(pattern, placeholder, masked)
    return masked

In [3]:
# Define some functions

ns = {'pagecontent': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15'}

# Columns (and their order) of the dataframe built by process_page_xml_file
_WORD_COLUMNS = [
    'src', 'num_pages', 'page_id', 'page_width', 'page_height',
    'col', 'row', 'width', 'height',
    'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4',
    'contents', 'contents_masked', 'confidence',
    'correctclass', 'tag', 'description',
]


def _bounding_box(points_attr):
    """Return (col, row, width, height) of the axis-aligned box enclosing a `points` string."""
    xs, ys = [], []
    for point in points_attr.split():
        parts = point.split(',')
        xs.append(int(parts[0]))
        ys.append(int(parts[1]))
    left, top = min(xs), min(ys)
    return left, top, max(xs) - left, max(ys) - top


def _word_text_and_confidence(word):
    """Return (text, confidence) of a Word element, tolerating missing parts."""
    textequiv = word.find('pagecontent:TextEquiv', ns)
    if textequiv is None:
        return '', 0.0

    unicode_el = textequiv.find('pagecontent:Unicode', ns)
    # An empty <Unicode/> element has text None; normalise it to ''
    text = ''
    if unicode_el is not None and unicode_el.text is not None:
        text = unicode_el.text

    conf = textequiv.get('conf')
    # A missing 'conf' attribute is reported as confidence 0.0
    confidence = round(float(conf), 2) if conf is not None else 0.0
    return text, confidence


# Function to read PAGE-XML file and convert to PANDAS dataframe
def process_page_xml_file(filename):
    """
    Parse a PAGE-XML file and return one dataframe row per Word element.

    Words are collected, in document order, from every
    Page > TextRegion > TextLine > Word path of the file.

    Parameters
    ----------
    filename : path of the PAGE-XML file (anything accepted by ET.parse).

    Returns
    -------
    pandas.DataFrame with the columns listed in `_WORD_COLUMNS`, all stored
    with object dtype so the cells stay plain Python values:
      - src, num_pages, page_id, page_width, page_height: page information
        (image file name, number of pages, 0-based page index, image size);
      - col, row, width, height: box enclosing all the points of the word's
        Coords polygon;
      - x1..y4: the four corners of that box (top-left, top-right,
        bottom-right, bottom-left);
      - contents, contents_masked: word text and its mask_content() version
        ('' when the word carries no text);
      - confidence: 'conf' attribute rounded to 2 decimals (0.0 when absent);
      - correctclass, tag, description: fixed defaults 'undefined', 'OTHER',
        'OTHER'.
    A file without any word yields an empty dataframe that still has all
    the columns, so callers can sort or export it without special-casing.
    """
    document = ET.parse(filename).getroot()

    pages = document.findall('pagecontent:Page', ns)
    num_pages = len(pages)

    word_path = 'pagecontent:TextRegion/pagecontent:TextLine/pagecontent:Word'
    records = []

    for idx_page, page in enumerate(pages):
        image_filename = page.get('imageFilename')
        page_width = page.get('imageWidth')
        page_height = page.get('imageHeight')

        for word in page.findall(word_path, ns):
            points_attr = word.find('pagecontent:Coords', ns).get('points')
            # Use the box around all polygon points rather than relying on
            # a particular number or ordering of the points
            col, row, width, height = _bounding_box(points_attr)
            content, confidence = _word_text_and_confidence(word)

            records.append({
                'src': image_filename,
                'num_pages': num_pages,
                'page_id': idx_page,
                'page_width': int(page_width),
                'page_height': int(page_height),
                'col': col,
                'row': row,
                'width': width,
                'height': height,
                'x1': col,
                'y1': row,
                'x2': col + width,
                'y2': row,
                'x3': col + width,
                'y3': row + height,
                'x4': col,
                'y4': row + height,
                'contents': content,
                'contents_masked': mask_content(content),
                'confidence': confidence,
                'correctclass': 'undefined',
                'tag': 'OTHER',
                'description': 'OTHER',
            })

    # dtype=object keeps the cells as plain Python ints/floats/strings
    return pd.DataFrame(records, columns=_WORD_COLUMNS, dtype=object)

In [4]:
def create_image_url(filepath):
    """
    Build the image URL that Label Studio should use for a file on disk.

    Label Studio needs a URL rather than a filesystem path. Only the last
    component of `filepath` is used; it is appended to the local-files
    prefix `/data/local-files/?d=images/`.

    When the images are served with ./serve_local_files.sh <my-images-dir>
    instead, the matching URL form would be http://localhost:8081/<filename>.
    """
    filename = os.path.split(filepath)[1]
    return f'/data/local-files/?d=images/{filename}'

In [5]:
def create_label_studio_annotations(filename, data):
    """
    Build one Label Studio task with pre-annotations from a word dataframe.

    Parameters
    ----------
    filename : image file name, turned into a URL with create_image_url().
    data : dataframe with one row per word; the columns read here are
        'x1', 'y1', 'width', 'height', 'page_width', 'page_height',
        'contents', 'tag' and 'confidence'.

    Returns
    -------
    dict with
      - 'data': {'ocr': <image URL>}
      - 'predictions': a single prediction whose 'result' holds, for every
        row, a rectangle, a textarea (transcription) and a labels entry
        sharing the same id, and whose 'score' is the mean of the row
        confidences (0 when `data` has no rows).
    Box coordinates are expressed as percentages of the page size.
    """
    results = []
    confidences = []

    for _, word in data.iterrows():
        page_w = word['page_width']
        page_h = word['page_height']

        # The same short id is reused by the three results of this word
        region_id = str(uuid4())[:10]

        box = {
            'x': 100 * word['x1']/page_w,
            'y': 100 * word['y1']/page_h,
            'width': 100 * word['width']/page_w,
            'height': 100 * word['height']/page_h,
            'rotation': 0
        }

        # (from_name, type, value) of each result generated for the word
        result_specs = (
            ('bbox', 'rectangle', box),
            ('transcription', 'textarea', {**box, 'text': [word['contents']]}),
            ('label', 'labels', {**box, 'labels': [word['tag']]}),
        )
        for from_name, result_type, value in result_specs:
            results.append({
                'id': region_id,
                'from_name': from_name,
                'to_name': 'image',
                'type': result_type,
                'value': value
            })

        confidences.append(word['confidence'])

    image_url = create_image_url(filename)
    if confidences:
        score = sum(confidences) / len(confidences)
    else:
        score = 0

    return {
        'data': {'ocr': image_url},
        'predictions': [{'result': results, 'score': score}]
    }

In [6]:
# Define data directories
data_dir = 'data/original/pageXML_file_ocr/'
data_transformed_dir = 'data/transformed/'

# Make sure the output directory exists before anything is written into it
os.makedirs(data_transformed_dir, exist_ok=True)

# Get list of PAGE-XML files. glob gives no ordering guarantee, so the list is
# sorted to keep the processing order (and the order of the exported tasks)
# identical from one run or machine to the next
page_xml_files = sorted(glob.glob(data_dir + "*.xml"))

# Show number of files
num_page_xml_files = len(page_xml_files)
print('Number of files: ', num_page_xml_files)

Number of files:  519


In [7]:
# Label Studio tasks, one entry per processed PAGE-XML file
tasks =[]

# Loop all files and process (convert to dataframe and label studio preannotation)
for idx, page_xml_file in enumerate(page_xml_files):
    # Progress message, counting files from 1
    print('Processing ' + str(idx + 1) +'/' + str(num_page_xml_files))
    # Get file name without its directory
    base = os.path.basename(page_xml_file)
    
    # Get basename (file name without its last extension)
    basename = os.path.splitext(base)[0]
        
    # Read PAGE-XML into pandas dataframe
    df = process_page_xml_file(page_xml_file)

    # Sort dataframe by top row and top column (0,0) (left, top)
    df.sort_values(by=['row', 'col'], inplace=True, ascending = [True, True])

    # Save dataframe as a ';'-separated CSV named after the XML file
    df.to_csv(data_transformed_dir + basename + '.csv', index = False, sep=';')

    # Build the pre-annotated task for the matching '.png' image.
    # NOTE(review): basename[:-4] drops four more characters although the last
    # extension was already removed above; presumably the XML file names carry
    # an inner 4-character suffix (e.g. '.tif') - confirm against the data.
    labeled = create_label_studio_annotations(basename[:-4] + '.png', df)

    tasks.append(labeled)
    
# Create a file to import into Label Studio
with open(data_transformed_dir + 'preannotated_ocr_abbyy_tasks.json', mode='w') as f:
    json.dump(tasks, f, indent=2)

Processing 1/519
Processing 2/519
Processing 3/519
Processing 4/519
Processing 5/519
Processing 6/519
Processing 7/519
Processing 8/519
Processing 9/519
Processing 10/519
Processing 11/519
Processing 12/519
Processing 13/519
Processing 14/519
Processing 15/519
Processing 16/519
Processing 17/519
Processing 18/519
Processing 19/519
Processing 20/519
Processing 21/519
Processing 22/519
Processing 23/519
Processing 24/519
Processing 25/519
Processing 26/519
Processing 27/519
Processing 28/519
Processing 29/519
Processing 30/519
Processing 31/519
Processing 32/519
Processing 33/519
Processing 34/519
Processing 35/519
Processing 36/519
Processing 37/519
Processing 38/519
Processing 39/519
Processing 40/519
Processing 41/519
Processing 42/519
Processing 43/519
Processing 44/519
Processing 45/519
Processing 46/519
Processing 47/519
Processing 48/519
Processing 49/519
Processing 50/519
Processing 51/519
Processing 52/519
Processing 53/519
Processing 54/519
Processing 55/519
Processing 56/519
P

Processing 439/519
Processing 440/519
Processing 441/519
Processing 442/519
Processing 443/519
Processing 444/519
Processing 445/519
Processing 446/519
Processing 447/519
Processing 448/519
Processing 449/519
Processing 450/519
Processing 451/519
Processing 452/519
Processing 453/519
Processing 454/519
Processing 455/519
Processing 456/519
Processing 457/519
Processing 458/519
Processing 459/519
Processing 460/519
Processing 461/519
Processing 462/519
Processing 463/519
Processing 464/519
Processing 465/519
Processing 466/519
Processing 467/519
Processing 468/519
Processing 469/519
Processing 470/519
Processing 471/519
Processing 472/519
Processing 473/519
Processing 474/519
Processing 475/519
Processing 476/519
Processing 477/519
Processing 478/519
Processing 479/519
Processing 480/519
Processing 481/519
Processing 482/519
Processing 483/519
Processing 484/519
Processing 485/519
Processing 486/519
Processing 487/519
Processing 488/519
Processing 489/519
Processing 490/519
Processing 4