This code converts Hadara-format XML files to PAGE-format XML files. The Hadara XML provides the word bounding boxes and their labels. The code derives text-line bounding boxes and labels from those words, and the generated PAGE XML contains both the word and the text-line bounding boxes together with their labels.

In [1]:
import cv2
import os
import shutil
from lxml import etree as ET
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from PIL import ImageFont, ImageDraw, Image
from bidi.algorithm import get_display
import arabic_reshaper

In [2]:
def coordinates(cnt):
    """Format an ``(x, y, w, h)`` box as a space-separated corner string.

    The corners are emitted as ``x,y`` pairs in the order top-left,
    top-right, bottom-right, bottom-left. The result is what this file
    stores in the ``points`` attribute of ``Coords`` elements.
    """
    left, top = cnt[0], cnt[1]
    right = cnt[0] + cnt[2]
    bottom = cnt[1] + cnt[3]
    corners = ((left, top), (right, top), (right, bottom), (left, bottom))
    return ' '.join(str(px) + ',' + str(py) for px, py in corners)

In [3]:
def reduce_height(h):
    """Return the shortened box height used when computing a line baseline.

    Heights of 35 or less map to a fixed value of 10; any taller height
    has 30 subtracted from it.
    """
    return 10 if h <= 35 else h - 30

In [4]:
def one_label(labelList):
    """Join the word labels of one text line into a single line label.

    Labels are separated by exactly one space, with no leading or
    trailing space (the previous implementation prefixed every label
    with a space, so each line label started with a stray blank).

    Missing labels (``None``, e.g. from an empty ``<Transcript/>``
    element) and empty strings are skipped instead of raising
    ``TypeError``.

    Args:
        labelList: iterable of word labels (``str`` or ``None``).

    Returns:
        The joined label; ``''`` when there is nothing to join.
    """
    return ' '.join(label for label in labelList if label)

In [5]:
def one_box(rectList):
    """Return the ``(x, y, w, h)`` box enclosing every rectangle in *rectList*.

    Each rectangle is ``(x, y, w, h)`` and contributes its two opposite
    corners ``(x, y)`` and ``(x + w, y + h)``. The result uses the same
    convention, i.e. ``x + w`` / ``y + h`` of the returned box equal the
    largest corner coordinates of the inputs.

    The previous implementation delegated to ``cv2.boundingRect``, which
    treats points as inclusive pixels and therefore returned a width and
    height one larger than the corner span; that made line boxes extend
    one pixel past the words they enclose.

    Args:
        rectList: non-empty sequence of ``(x, y, w, h)`` rectangles.

    Returns:
        Tuple ``(x, y, w, h)`` of the enclosing box.

    Raises:
        ValueError: if *rectList* is empty.
    """
    if len(rectList) == 0:
        raise ValueError('one_box() needs at least one rectangle')
    xs = []
    ys = []
    for rect in rectList:
        # Collect both corners so the result does not depend on the sign
        # of the width/height.
        xs.extend((rect[0], rect[0] + rect[2]))
        ys.extend((rect[1], rect[1] + rect[3]))
    x, y = min(xs), min(ys)
    return x, y, max(xs) - x, max(ys) - y

In [6]:
def word_bboxes_and_labels_from_hadara_xml(hadara_xml_path, image_root_name):
    """Read word bounding boxes and transcripts from a Hadara XML file.

    Every ``DocumentElement`` child of the root that has a ``Transcript``
    child yields one word; elements without a ``Transcript`` are skipped.
    The box is built from the integer text of the ``X``, ``Y``, ``Width``
    and ``Height`` children.

    Args:
        hadara_xml_path: path of the Hadara XML file to parse.
        image_root_name: name used only in the progress message.

    Returns:
        ``(word_bboxes, word_labels)``: two parallel lists, the first of
        ``(x, y, w, h)`` tuples and the second of the ``Transcript`` texts
        (an entry is ``None`` when the ``Transcript`` element is empty).
    """
    root = ET.parse(hadara_xml_path).getroot()
    word_bboxes = []
    word_labels = []
    for element in root.findall('DocumentElement'):
        transcript = element.find('Transcript')
        if transcript is None:
            continue
        x = int(element.find('X').text)
        y = int(element.find('Y').text)
        h = int(element.find('Height').text)
        w = int(element.find('Width').text)
        word_bboxes.append((x, y, w, h))
        word_labels.append(transcript.text)
    # Report with the parameter; the previous code read a global
    # ``image_name`` and raised NameError when none was defined.
    print('Gathered '+ str(len(word_labels)) + ' words on ' + image_root_name)
    return word_bboxes, word_labels

In [7]:
def lines_from_words(word_bboxes, word_labels):
    """Group word boxes into text lines and build one box/label per line.

    Words are visited in order of increasing ``y``. A running baseline is
    kept as the largest ``y + reduce_height(h) - 1`` seen so far; a word
    whose ``y`` lies below that baseline closes the current line and
    starts a new one. Inside each line the words are ordered by
    descending ``x`` (presumably right-to-left reading order -- confirm
    against the data).

    Fixes over the previous implementation:
      * the last word of the page is no longer dropped (the final flush
        sliced up to the last loop index, which excluded that word and
        lost a final one-word line completely);
      * empty input returns four empty lists instead of raising.

    Args:
        word_bboxes: list of ``(x, y, w, h)`` word boxes.
        word_labels: list of labels parallel to *word_bboxes*.

    Returns:
        ``(sorted_word_bboxes, sorted_word_labels, line_bboxes, line_labels)``
        where the first two hold one list of words per line and the last
        two hold one enclosing box / joined label per line.
    """
    line_bboxes = []
    line_labels = []
    sorted_word_bboxes = []
    sorted_word_labels = []

    # Sort by y coordinate; sorted() is stable, so ties keep input order.
    words = sorted(zip(word_bboxes, word_labels), key=lambda p: p[0][1])
    if not words:
        return sorted_word_bboxes, sorted_word_labels, line_bboxes, line_labels

    def close_line(members):
        # Order the words of the finished line by descending x and record
        # the line's words, enclosing box and joined label.
        members = sorted(members, reverse=True, key=lambda p: p[0][0])
        boxes = [bbox for bbox, _ in members]
        labels = [label for _, label in members]
        line_bboxes.append(one_box(boxes))
        line_labels.append(one_label(labels))
        sorted_word_bboxes.append(boxes)
        sorted_word_labels.append(labels)

    # The (reduced) bottom of the top-most word is the first baseline.
    first_bbox = words[0][0]
    baseline = first_bbox[1] + reduce_height(first_bbox[3]) - 1

    current_line = []
    for word in words:
        bbox = word[0]
        # A word starting below the baseline belongs to the next line.
        if bbox[1] > baseline and current_line:
            close_line(current_line)
            current_line = []
        current_line.append(word)
        # The baseline only ever moves down.
        baseline = max(bbox[1] + reduce_height(bbox[3]) - 1, baseline)

    # Flush the final line, including the very last word.
    if current_line:
        close_line(current_line)

    return sorted_word_bboxes, sorted_word_labels, line_bboxes, line_labels

In [8]:
def generate_pagexml(xml_folder_path, image_file_path, sorted_word_bboxes, sorted_word_labels, line_bboxes, line_labels):
    """Write a PAGE XML file describing the lines and words of one image.

    The document has a fixed ``Metadata`` block, one ``Page`` carrying the
    image path and size, and a single ``TextRegion`` (id ``r0``) spanning
    almost the whole image. Each entry of *line_bboxes* becomes a
    ``TextLine`` with its ``Coords``, its ``Word`` children (coords and
    text) and finally the line's own ``TextEquiv``.

    The output file is named after the image (directory and extension
    stripped) plus ``.xml`` and is written into *xml_folder_path*; the
    path is printed before writing.

    Args:
        xml_folder_path: folder that receives the XML file.
        image_file_path: path of the page image; read to obtain its size
            and stored verbatim in ``imageFilename``.
        sorted_word_bboxes: per line, the list of ``(x, y, w, h)`` word boxes.
        sorted_word_labels: per line, the list of word labels, parallel to
            *sorted_word_bboxes*.
        line_bboxes: one ``(x, y, w, h)`` box per line.
        line_labels: one label per line.

    Raises:
        IOError: if the image cannot be read.
    """
    xmlns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
    xsi ="http://www.w3.org/2001/XMLSchema-instance"
    schemaLocation = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"

    # cv2.imread returns None instead of raising when the file is missing
    # or unreadable; fail with a clear message before building anything.
    img = cv2.imread(image_file_path)
    if img is None:
        raise IOError('Could not read image: ' + image_file_path)
    rows, cols = img.shape[:2]

    # ET.SubElement already attaches the new element to its parent, so no
    # additional append() calls are needed anywhere below.
    PcGts = ET.Element("{" + xmlns + "}PcGts",
                       attrib={"{" + xsi + "}schemaLocation" : schemaLocation},
                       nsmap={'xsi': xsi, None: xmlns})
    PcGts.set("pcGtsId","pc-aletheiaexamplepage")

    Metadata = ET.SubElement(PcGts, 'Metadata')
    ET.SubElement(Metadata, 'Creator').text = 'PRImA Research Lab'
    ET.SubElement(Metadata, 'Created').text = '2015-07-17T15:27:13'
    ET.SubElement(Metadata, 'LastChange').text = '2017-07-14T10:03:33'
    ET.SubElement(Metadata, 'Comments').text = 'Example Page'

    Page = ET.SubElement(PcGts, 'Page')
    Page.set('imageFilename', image_file_path)
    Page.set('imageWidth', str(cols))
    Page.set('imageHeight', str(rows))

    # Single region inset by one pixel from the image border.
    textregionid = 0
    coords = '1,1 '+str(cols-2)+',1 '+str(cols-2)+','+str(rows-2)+' 1,'+str(rows-2)
    TextRegion = ET.SubElement(Page, 'TextRegion')
    TextRegion.set('id', 'r'+str(textregionid))
    TextRegion.set('type', 'paragraph')
    ET.SubElement(TextRegion, 'Coords').set('points', coords)

    wordid = 0  # word ids run across the whole page, not per line
    for textlineid, line_bbox in enumerate(line_bboxes):
        TextLine = ET.SubElement(TextRegion, 'TextLine')
        TextLine.set('id', 'l'+str(textlineid))
        ET.SubElement(TextLine, 'Coords').set('points', coordinates(line_bbox))

        line_word_labels = sorted_word_labels[textlineid]
        for textlinewordid, word_bbox in enumerate(sorted_word_bboxes[textlineid]):
            Word = ET.SubElement(TextLine, 'Word')
            Word.set('id', 'w'+str(wordid))
            ET.SubElement(Word, 'Coords').set('points', coordinates(word_bbox))
            TextEquiv = ET.SubElement(Word, 'TextEquiv')
            ET.SubElement(TextEquiv, 'Unicode').text = line_word_labels[textlinewordid]
            wordid = wordid + 1

        # The line's own text goes after all of its words.
        TextEquiv = ET.SubElement(TextLine, 'TextEquiv')
        ET.SubElement(TextEquiv, 'Unicode').text = line_labels[textlineid]

    mydata = ET.tostring(PcGts, pretty_print=True, encoding='utf-8', xml_declaration=True)

    # Derive the output name from the image's base name so that nested
    # directories and extensions of any length are handled.
    image_name = os.path.splitext(os.path.basename(image_file_path))[0]
    xml_path = os.path.join(xml_folder_path, image_name + '.xml')
    print(xml_path)
    with open(xml_path, "wb") as myfile:
        myfile.write(mydata)

In [9]:
# Folder paths used by the conversion loop. Paths are built from these by
# string concatenation, so keep the trailing '/'.
# Input: page images; every entry listed in this folder is processed.
color_images_dir = 'book2_color_images/'
# Input: Hadara XML files, looked up as <image root name>.xml.
hadara_xmls_dir = 'book2_hadara_xmls/'
# Output: generated PAGE XML files; this folder is deleted and recreated on each run.
words_lines_page_xmls_dir = 'book2_words_lines_page_xmls/'

In [10]:
# Start from an empty output folder so stale XML files never survive a run.
if os.path.isdir(words_lines_page_xmls_dir):
    shutil.rmtree(words_lines_page_xmls_dir)
os.mkdir(words_lines_page_xmls_dir)
# NOTE(review): the three names below are not used in this cell; they are
# kept in case later cells rely on them.
total_sorted_word_labels = []
total_line_labels = []
c=0
# os.listdir returns entries in arbitrary order; sort for reproducible runs.
for image_name in sorted(os.listdir(color_images_dir)):
    print(image_name)
    # splitext handles extensions of any length (.png, .jpeg, .tiff, ...).
    image_root_name = os.path.splitext(image_name)[0]
    hadara_xml_path = os.path.join(hadara_xmls_dir, image_root_name + '.xml')
    print(hadara_xml_path)

    word_bboxes, word_labels = word_bboxes_and_labels_from_hadara_xml(hadara_xml_path, image_root_name)
    sorted_word_bboxes, sorted_word_labels, line_bboxes, line_labels = lines_from_words(word_bboxes, word_labels)

    print ('Started to generate pagexml for '+ image_name)
    image_path = color_images_dir + image_name
    generate_pagexml(words_lines_page_xmls_dir, image_path, sorted_word_bboxes, sorted_word_labels, line_bboxes, line_labels)

print ('All the pages have been processed.')

003-2.png
book2_hadara_xmls/003-2.xml
Gathered 639 words on 003-2.png
Started to generate pagexml for 003-2.png
book2_words_lines_page_xmls/003-2.xml
004-1.png
book2_hadara_xmls/004-1.xml
Gathered 576 words on 004-1.png
Started to generate pagexml for 004-1.png
book2_words_lines_page_xmls/004-1.xml
004-2.png
book2_hadara_xmls/004-2.xml
Gathered 686 words on 004-2.png
Started to generate pagexml for 004-2.png
book2_words_lines_page_xmls/004-2.xml
005-1.png
book2_hadara_xmls/005-1.xml
Gathered 731 words on 005-1.png
Started to generate pagexml for 005-1.png
book2_words_lines_page_xmls/005-1.xml
005-2.png
book2_hadara_xmls/005-2.xml
Gathered 745 words on 005-2.png
Started to generate pagexml for 005-2.png
book2_words_lines_page_xmls/005-2.xml
006-1.png
book2_hadara_xmls/006-1.xml
Gathered 749 words on 006-1.png
Started to generate pagexml for 006-1.png
book2_words_lines_page_xmls/006-1.xml
006-2.png
book2_hadara_xmls/006-2.xml
Gathered 694 words on 006-2.png
Started to generate pagexml fo