In [1]:
import json
import os
from pathlib import Path
from pickle import FALSE
import re
import shutil

In [15]:
# Convert the processing data to DocumentLabeler format

JSON_EXT = '.json'
DEFAULT_ENCODING = 'utf-8'


class DataReader():
    """Read text and JSON files located inside a single data directory.

    Args:
        data_dir: Directory the files are read from (``str`` or ``pathlib.Path``).
            A trailing path separator is optional.
        encoding: Text encoding used for every file opened by this reader.
    """

    def __init__(self, data_dir, encoding=DEFAULT_ENCODING):
        self.data_dir = data_dir
        self.encoding = encoding

    def _path(self, filename):
        """Return the path of ``filename`` inside ``data_dir``.

        Leading separators on ``filename`` are dropped so that both
        ``'name.txt'`` and ``'/name.txt'`` resolve inside ``data_dir``;
        ``os.path.join`` would otherwise treat the latter as absolute and
        discard the directory.
        """
        base = str(self.data_dir)
        if not base:
            # No directory configured: use the file name exactly as given.
            return filename
        separators = os.sep + (os.altsep or '')
        return os.path.join(base, filename.lstrip(separators))

    def read_text(self, filename):
        """Return the whole content of ``filename`` as one string."""
        with open(self._path(filename), 'r', encoding=self.encoding) as f:
            return f.read()

    def read_lines(self, filename):
        """Return the lines of ``filename`` as a list, line endings included."""
        with open(self._path(filename), 'r', encoding=self.encoding) as f:
            return f.readlines()

    def read_json(self, filename):
        """Parse ``filename`` + ``JSON_EXT`` inside ``data_dir`` and return the object."""
        with open(self._path(filename + JSON_EXT), 'r', encoding=self.encoding) as f:
            return json.load(f)

    def read_json_from_path(self, path):
        """Parse the JSON file at ``path``; ``data_dir`` is not used."""
        with open(path, 'r', encoding=self.encoding) as f:
            return json.load(f)

class DataWrite():
    """Write text and JSON files into a single target directory.

    Args:
        data_dir: Directory the files are written to (``str`` or
            ``pathlib.Path``). A trailing path separator is optional.
        encoding: Text encoding used for every file opened by this writer.
    """

    def __init__(self, data_dir, encoding=DEFAULT_ENCODING):
        self.data_dir = data_dir
        self.encoding = encoding

    def _path(self, filename):
        """Return the path of ``filename`` inside ``data_dir``.

        Leading separators on ``filename`` are dropped so that both
        ``'name.txt'`` and ``'/name.txt'`` resolve inside ``data_dir``;
        ``os.path.join`` would otherwise treat the latter as absolute and
        discard the directory.
        """
        base = str(self.data_dir)
        if not base:
            # No directory configured: use the file name exactly as given.
            return filename
        separators = os.sep + (os.altsep or '')
        return os.path.join(base, filename.lstrip(separators))

    def write_text(self, filename, text):
        """Append ``text`` to ``filename`` (the file is created if missing)."""
        with open(self._path(filename), 'a', encoding=self.encoding) as f:
            f.write(text)

    def write_json(self, filename, data):
        """Serialise ``data`` to ``filename`` + ``JSON_EXT``, replacing any existing file."""
        with open(self._path(filename + JSON_EXT), 'w', encoding=self.encoding) as f:
            json.dump(data, f)

    def write_json_from_path(self, path, data):
        """Serialise ``data`` as JSON to ``path``; ``data_dir`` is not used."""
        with open(path, 'w', encoding=self.encoding) as f:
            json.dump(data, f)

In [23]:
def _image_name(text_name):
    """Map a ``*_0.txt`` transcript file name to its ``*_ori.jpg`` image name."""
    # Replace the suffix only, so a '_0.txt' occurring earlier in the name is kept.
    return text_name[:-len('_0.txt')] + '_ori.jpg'


def _line_to_label(line):
    """Convert one ``word<TAB>x0<TAB>y0<TAB>x1<TAB>y1`` line into a label dict.

    Returns:
        A dict with the ``transcription``, ``points`` and ``difficult`` keys
        DocumentLabeler needs, or ``None`` when the line is blank.

    Raises:
        ValueError: if the line has fewer than five tab-separated fields or a
            coordinate is not numeric.
    """
    if not line.strip():
        return None
    # Dotted leader lines could be filtered here as well, e.g. with
    # re.search(r'(?:\.){2,}', line).
    fields = line.split('\t')
    word = fields[0]
    # Coordinates may be written as floats; truncate them to integer pixels.
    x0, y0, x1, y1 = (int(float(value)) for value in fields[1:5])
    # Corners of the axis-aligned box, in the order
    # (x0, y0), (x1, y0), (x1, y1), (x0, y1).
    return {'transcription': word,
            'points': [[x0, y0], [x1, y0], [x1, y1], [x0, y1]],
            'difficult': False}


if __name__ == '__main__':
    root_dir = '/Path_to_root'
    # Source directory holding the preprocessed .txt and .jpg files.
    data_dir = root_dir + '/path_to/_images/'
    # Target directory receiving the transcripts, bounding boxes and the
    # clean image copies to annotate.
    target_dir = root_dir + '/path_to_root/_DocumentLabeler/DocumentLabeler'
    os.makedirs(target_dir, exist_ok=True)

    # Hand both helpers a directory that ends with a separator, so a file
    # name appended to it always lands inside the directory.
    reader = DataReader(os.path.join(data_dir, ''))    # Source to read
    writer = DataWrite(os.path.join(target_dir, ''))   # Target to write

    # Name of the image folder itself, with or without a trailing slash on
    # target_dir; it prefixes every image key.
    folder_name = os.path.basename(os.path.normpath(target_dir))

    # Sorted so repeated runs emit the entries in the same order.
    text_files = sorted(f for f in os.listdir(data_dir) if f.endswith('_0.txt'))

    # Append mode: re-running the cell adds the entries again.
    label_path = os.path.join(target_dir, 'Label.txt')
    with open(label_path, 'a', encoding=writer.encoding) as l:
        for f in text_files:
            image_name = _image_name(f)
            image_key = folder_name + '/' + image_name

            # Copy the image next to the label files unless it is already there.
            if not os.path.exists(os.path.join(target_dir, image_name)):
                shutil.copy(os.path.join(data_dir, image_name), target_dir)

            # Parse the whole transcript first so a malformed file cannot
            # leave a half-written line in Label.txt.
            labels = [_line_to_label(line) for line in reader.read_lines(f)]
            entries = ','.join(json.dumps(label, ensure_ascii=False)
                               for label in labels if label is not None)

            l.write(image_key + '\t[' + entries + ']\n')
            writer.write_text('fileState.txt', image_key + '\t' + str(1) + '\n')