This code converts hadara-format XML files to PAGE-format XML files. The generated PAGE XML does not contain real text line information; it contains only word bounding boxes and their labels.

In [None]:
import cv2
import os
import shutil
from lxml import etree as ET
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from PIL import ImageFont, ImageDraw, Image
from bidi.algorithm import get_display
import arabic_reshaper

In [2]:
def word_bboxes_and_labels_from_hadara_xml(hadara_xml_path, image_root_name):
    """Collect word bounding boxes and transcripts from a hadara XML file.

    Every ``DocumentElement`` child of the XML root that has a ``Transcript``
    child contributes one word; elements without a transcript are skipped.

    Args:
        hadara_xml_path: Path of the hadara XML file to parse.
        image_root_name: Name of the page; used only in the progress message.

    Returns:
        A ``(word_bboxes, word_labels)`` pair of equally long lists.
        ``word_bboxes`` holds ``(x, y, width, height)`` integer tuples and
        ``word_labels`` the text of the matching ``Transcript`` elements.

    Raises:
        AttributeError: If a transcribed element lacks an ``X``, ``Y``,
            ``Height`` or ``Width`` child.
        ValueError: If one of those children does not hold an integer.
    """
    root = ET.parse(hadara_xml_path).getroot()
    word_bboxes = []
    word_labels = []
    for element in root.findall('DocumentElement'):
        transcript = element.find('Transcript')
        # Identity test: comparing XML elements with != is not a reliable
        # "is it missing" check.
        if transcript is None:
            continue
        x = int(element.find('X').text)
        y = int(element.find('Y').text)
        h = int(element.find('Height').text)
        w = int(element.find('Width').text)
        word_bboxes.append((x, y, w, h))
        word_labels.append(transcript.text)
    # Report against the name passed in, not a global that happens to exist
    # in the calling notebook cell.
    print('Gathered ' + str(len(word_labels)) + ' words on ' + image_root_name)
    return word_bboxes, word_labels

In [3]:
def coordinates(cnt):
    """Format an ``(x, y, width, height)`` box as a four-corner points string.

    The corners are listed as top-left, top-right, bottom-right, bottom-left,
    each written as ``x,y`` and separated by single spaces.
    """
    left, top = cnt[0], cnt[1]
    right = cnt[0] + cnt[2]
    bottom = cnt[1] + cnt[3]
    corners = ((left, top), (right, top), (right, bottom), (left, bottom))
    return ' '.join(str(px) + ',' + str(py) for px, py in corners)

In [4]:
def word_bboxes_and_labels_to_page_xml(word_bboxes, word_labels, image_path, image_name, xml_folder_path):
    """Write word boxes and their labels as a PAGE XML file named after the image.

    The document contains a single ``TextRegion`` (id ``r0``) holding a single
    ``TextLine`` (id ``l0``). Both use the same rectangle, running from
    ``(1, 1)`` to ``(width - 2, height - 2)`` of the image. Each bounding box
    becomes a ``Word`` element (ids ``w0``, ``w1``, ...) inside that line,
    with its label stored under ``TextEquiv/Unicode``.

    Args:
        word_bboxes: Sequence of ``(x, y, width, height)`` boxes.
        word_labels: Labels, indexed in parallel with ``word_bboxes``.
        image_path: Path of the page image; read only to obtain its size.
        image_name: Image file name; stored as ``imageFilename`` and, with
            its extension replaced by ``.xml``, used as the output file name.
        xml_folder_path: Existing directory that receives the XML file.

    Raises:
        OSError: If the image at ``image_path`` cannot be read.
        IndexError: If ``word_labels`` is shorter than ``word_bboxes``.
    """
    xmlns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
    xsi = "http://www.w3.org/2001/XMLSchema-instance"
    schema_location = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"

    # Read the image first so an unreadable file fails with a clear message
    # before any XML is built; cv2.imread returns None instead of raising.
    img = cv2.imread(image_path)
    if img is None:
        raise OSError('Could not read image: ' + str(image_path))
    rows, cols = img.shape[:2]

    pc_gts = ET.Element("{" + xmlns + "}PcGts",
                        attrib={"{" + xsi + "}schemaLocation": schema_location},
                        nsmap={'xsi': xsi, None: xmlns})
    pc_gts.set("pcGtsId", "pc-aletheiaexamplepage")

    # NOTE: these metadata values are fixed literals written into every file.
    # SubElement already attaches the new child, so no extra append is needed.
    metadata = ET.SubElement(pc_gts, 'Metadata')
    for tag, text in (('Creator', 'PRImA Research Lab'),
                      ('Created', '2015-07-17T15:27:13'),
                      ('LastChange', '2017-07-14T10:03:33'),
                      ('Comments', 'Example Page')):
        ET.SubElement(metadata, tag).text = text

    page = ET.SubElement(pc_gts, 'Page')
    page.set('imageFilename', image_name)
    page.set('imageWidth', str(cols))
    page.set('imageHeight', str(rows))

    # One region and one line share the same page-sized rectangle.
    page_coords = '1,1 ' + str(cols - 2) + ',1 ' + str(cols - 2) + ',' + str(rows - 2) + ' 1,' + str(rows - 2)
    text_region = ET.SubElement(page, 'TextRegion')
    text_region.set('id', 'r0')
    text_region.set('type', 'paragraph')
    ET.SubElement(text_region, 'Coords').set('points', page_coords)

    text_line = ET.SubElement(text_region, 'TextLine')
    text_line.set('id', 'l0')
    ET.SubElement(text_line, 'Coords').set('points', page_coords)

    for word_index, word_bbox in enumerate(word_bboxes):
        word = ET.SubElement(text_line, 'Word')
        word.set('id', 'w' + str(word_index))
        ET.SubElement(word, 'Coords').set('points', coordinates(word_bbox))
        text_equiv = ET.SubElement(word, 'TextEquiv')
        ET.SubElement(text_equiv, 'Unicode').text = word_labels[word_index]

    mydata = ET.tostring(pc_gts, pretty_print=True, encoding='utf-8', xml_declaration=True)
    # splitext copes with extensions of any length, unlike slicing off four
    # characters.
    image_root_name = os.path.splitext(image_name)[0]
    xml_path = os.path.join(xml_folder_path, image_root_name + '.xml')
    with open(xml_path, "wb") as xml_file:
        xml_file.write(mydata)
    

In [5]:
# Input directory holding the page images to convert.
color_images_dir = 'book2_color_images/'
# Input directory holding the hadara XML files, looked up as <image root name>.xml.
hadara_xmls_dir = 'book2_hadara_xmls/'
# Output directory for the generated PAGE XML files; it is deleted and recreated on every run.
words_page_xmls_dir = 'book2_words_page_xmls/'

In [6]:
# Start from an empty output directory so files from earlier runs do not linger.
if os.path.isdir(words_page_xmls_dir):
    shutil.rmtree(words_page_xmls_dir)
os.mkdir(words_page_xmls_dir)

# Sorted so pages are processed and logged in a reproducible order.
for image_name in sorted(os.listdir(color_images_dir)):
    image_path = os.path.join(color_images_dir, image_name)
    if not os.path.isfile(image_path):
        # Skip anything that is not a regular file (e.g. sub-directories).
        continue
    print(image_name)
    # The hadara XML shares the image's file name, with an .xml extension.
    image_root_name = os.path.splitext(image_name)[0]
    hadara_xml_path = os.path.join(hadara_xmls_dir, image_root_name + '.xml')
    print(hadara_xml_path)

    word_bboxes, word_labels = word_bboxes_and_labels_from_hadara_xml(hadara_xml_path, image_root_name)

    print ('Started to generate pagexml for '+ image_name)
    word_bboxes_and_labels_to_page_xml(word_bboxes, word_labels, image_path, image_name, words_page_xmls_dir)

print ('All the pages have been processed.')

003-2.png
book2_hadara_xmls/003-2.xml
Gathered 639 words on 003-2.png
Started to generate pagexml for 003-2.png
004-1.png
book2_hadara_xmls/004-1.xml
Gathered 576 words on 004-1.png
Started to generate pagexml for 004-1.png
004-2.png
book2_hadara_xmls/004-2.xml
Gathered 686 words on 004-2.png
Started to generate pagexml for 004-2.png
005-1.png
book2_hadara_xmls/005-1.xml
Gathered 731 words on 005-1.png
Started to generate pagexml for 005-1.png
005-2.png
book2_hadara_xmls/005-2.xml
Gathered 745 words on 005-2.png
Started to generate pagexml for 005-2.png
006-1.png
book2_hadara_xmls/006-1.xml
Gathered 749 words on 006-1.png
Started to generate pagexml for 006-1.png
006-2.png
book2_hadara_xmls/006-2.xml
Gathered 694 words on 006-2.png
Started to generate pagexml for 006-2.png
007-1.png
book2_hadara_xmls/007-1.xml
Gathered 722 words on 007-1.png
Started to generate pagexml for 007-1.png
007-2.png
book2_hadara_xmls/007-2.xml
Gathered 719 words on 007-2.png
Started to generate pagexml for 00