In [1]:
import cv2
from lxml import etree as ET
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from PIL import ImageFont, ImageDraw, Image
from bidi.algorithm import get_display
import arabic_reshaper

In [2]:
image_path = '0007-1.png'
xml_file_path='annotation.xml'


In [3]:
def coordinates(cnt):
    coords= str(cnt[0])+','+str(cnt[1])+' '+str(cnt[0]+cnt[2])+','+str(cnt[1])+' '+str(cnt[0]+cnt[2])+','+str(cnt[1]+cnt[3])+' '+str(cnt[0])+','+str(cnt[1]+cnt[3])
    return coords

In [4]:
def one_box(rectList):
    arr = []
    for rect in rectList:
        arr.append((rect[0],rect[1]))
        arr.append((rect[0]+rect[2],rect[1]+rect[3]))
    (x,y,w,h) = cv2.boundingRect(np.asarray(arr))
    return x,y,w,h

In [5]:
def one_label(labelList):
    line_label=''
    for label in labelList:
        line_label=line_label+' '+label
    return line_label

In [6]:
def reduce_height(h):
    if h <= 35:
        reduced_h = 5
    else:
        reduced_h = h - 30
    return reduced_h

In [7]:
tree = ET.parse(xml_file_path)
root = tree.getroot()
word_bboxes = []
word_labels = []
stop = 0
image_name = image_path[:-4]
for image in root.iter('image'):
    if image.attrib.get('src') == image_name:
        for zone in root.iter('zone'):
            stop = stop + 1
            if stop == 100:
                break
            c = 0
            for point in zone.iter('point'):
                if c == 0:
                    x0 = point.attrib.get('x')
                    y0 = point.attrib.get('y')
                if c == 1:
                    x1 = point.attrib.get('x')
                    y1 = point.attrib.get('y')
                if c == 2:
                    x2 = point.attrib.get('x')
                    y2 = point.attrib.get('y')
                if c == 3:
                    x3 = point.attrib.get('x')
                    y3 = point.attrib.get('y')
                c = c+1
            x = int(x0)
            y = int(y0)
            w = int(x1)-int(x0)
            h = int(y2)-int(y1)          
            word_bbox = [x, y, w, h]
            word_bboxes.append(word_bbox)
            id=int(zone.attrib.get('id'))
            word_label=None
            for segment in root.iter('segment'):
                tid=int(segment.attrib.get('id'))
                if tid==id:
                    word_label= segment[1].text
            if word_label is None:
                print(id)
                word_label='mislabel'
                print ('there is a mismatch label')
            word_labels.append(word_label)

In [8]:
line_bboxes = []
line_labels = []
sorted_word_bboxes = []
sorted_word_labels = []
# Sort by y coordinate
word_bboxes, word_labels = zip(*sorted(zip(word_bboxes, word_labels), key=lambda p: p[0][1]))
word_bboxes = list(word_bboxes)
word_labels = list(word_labels)
# Bottom of the first rectangle is the baseline
reduced_h = reduce_height(word_bboxes[0][3])
baseline = word_bboxes[0][1] + reduced_h - 1
end_idx = 0
for i in range(len(word_bboxes)):
    # Continue iterating until the box whose y coordinate is below the current baseline
    if word_bboxes[i][1] > baseline:
        # Sort the boxes whose y coordinates are above the current baseline, by their x coordinate, in descending order
        word_bboxes[end_idx:i], word_labels[end_idx:i] = zip(*sorted(zip(word_bboxes[end_idx:i], word_labels[end_idx:i]), reverse=True, key=lambda p: p[0][0]))
        
        if len(word_bboxes[end_idx:i])>0:
            line_bbox = one_box(word_bboxes[end_idx:i])
            line_bboxes.append(line_bbox)
            line_label = one_label(word_labels[end_idx:i])
            line_labels.append(line_label)
            sorted_word_bboxes.append(word_bboxes[end_idx:i])
            sorted_word_labels.append(word_labels[end_idx:i])
         
        end_idx = i
    # Update the baseline. 
    # New baseline is the bottom of the box whose y coordinate is below the current baseline
    reduced_h = reduce_height(word_bboxes[i][3])
    baseline = max(word_bboxes[i][1] + reduced_h - 1, baseline)

# Sort the final line
word_bboxes[end_idx:i], word_labels[end_idx:i] = zip(*sorted(zip(word_bboxes[end_idx:i], word_labels[end_idx:i]), reverse=True, key=lambda p: p[0][0]))
if len(word_bboxes[end_idx:i])>0:
    line_bbox = one_box(word_bboxes[end_idx:i])
    line_bboxes.append(line_bbox)
    line_label = one_label(word_labels[end_idx:i])
    line_labels.append(line_label)
    sorted_word_bboxes.append(word_bboxes[end_idx:i])
    sorted_word_labels.append(word_labels[end_idx:i])

In [9]:
line_labels

[' ت',
 ' ا و لحجا ر ة عد ت للطا فر ين و بشر لذ ين ا ا منو و عملو ا ا لصا لحا',
 ' ا ن لهم جنا ت تجر ى من تحتها ا لا نها ر كلما ر ز قو ا منها',
 ' من ثمر ة ر ز قا قا لو هذ ا ا لذ ى ر ز قنا من  قبل و ا تو به متشا بها',
 ' و لهم فيها ا ز و ا ج مطهر ة و هم فيها خا لد و ن ا ن ا لله لا',
 ' يستحى ن يضر مثلا ما بعو ضة فما فو']

In [11]:
xmlns = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
xsi ="http://www.w3.org/2001/XMLSchema-instance"
schemaLocation = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"

PcGts = ET.Element("{" + xmlns + "}PcGts",
                       attrib={"{" + xsi + "}schemaLocation" : schemaLocation}, 
                        nsmap={'xsi': xsi, None: xmlns})
PcGts.set("pcGtsId","pc-aletheiaexamplepage")
Metadata = ET.SubElement(PcGts, 'Metadata')
Creator = ET.SubElement(Metadata, 'Creator')
Creator.text='PRImA Research Lab'
Metadata.append(Creator)
Created = ET.SubElement(Metadata, 'Created')
Created.text='2015-07-17T15:27:13' 
Metadata.append(Created)
LastChange = ET.SubElement(Metadata, 'LastChange')
LastChange.text='2017-07-14T10:03:33' 
Metadata.append(LastChange)
Comments = ET.SubElement(Metadata, 'Comments')
Comments.text='Example Page' 
Metadata.append(Comments)
PcGts.append(Metadata)

img = cv2.imread(image_path)

rows,cols,_=img.shape
Page=ET.SubElement(PcGts,'Page')
Page.set('imageFilename',image_path) 
Page.set('imageWidth',str(cols))
Page.set('imageHeight',str(rows))

In [12]:
textregionid=0
coords= '1,1 '+str(cols-2)+',1 '+str(cols-2)+','+str(rows-2)+' 1,'+str(rows-2)
TextRegion = ET.SubElement(Page, 'TextRegion')   
TextRegion.set('id','r'+str(textregionid))
TextRegion.set('type','paragraph')
Page.append(TextRegion)
Coords = ET.SubElement(TextRegion, 'Coords')        
Coords.set('points',coords)
TextRegion.append(Coords)

textlineid = 0
wordid = 0
for line_bbox in line_bboxes:
    tcoords = coordinates(line_bbox)
    TextLine = ET.SubElement(TextRegion, 'TextLine')   
    TextLine.set('id','l'+str(textlineid))
    TextRegion.append(TextLine)
    
    Coords = ET.SubElement(TextLine, 'Coords')        
    Coords.set('points',tcoords)
    TextLine.append(Coords)
    
    textlinewordid = 0
    for word_bbox in sorted_word_bboxes[textlineid]:
        wcoords = coordinates(word_bbox)
        Word = ET.SubElement(TextLine, 'Word')
        Word.set('id','w'+str(wordid))
        Coords = ET.SubElement(Word, 'Coords')        
        Coords.set('points',wcoords)
        TextEquiv = ET.SubElement(Word, 'TextEquiv')
        UnicodeTextEquiv = ET.SubElement(TextEquiv, 'Unicode')
        UnicodeTextEquiv.text = sorted_word_labels[textlineid][textlinewordid]
        TextEquiv.append(UnicodeTextEquiv)
        Word.append(TextEquiv)
        TextLine.append(Word)
        textlinewordid = textlinewordid+1
        wordid = wordid + 1
    
    TextEquiv = ET.SubElement(TextLine, 'TextEquiv')
    UnicodeTextEquiv = ET.SubElement(TextEquiv, 'Unicode')
    UnicodeTextEquiv.text = line_labels[textlineid]
    TextEquiv.append(UnicodeTextEquiv)
    TextLine.append(TextEquiv)
    
    textlineid = textlineid + 1


In [14]:
mydata = ET.tostring(PcGts,pretty_print=True, encoding='utf-8', xml_declaration=True)
myfile = open(image_name+'.xml', "wb")  
myfile.write(mydata) 
myfile.close()