In [1]:
import os
from glob import glob
import cv2
import numpy as np
from tqdm import tqdm
from xml.etree import ElementTree as ET
from bidi.algorithm import get_display
import arabic_reshaper
from wordcloud import WordCloud

In [2]:
# Namespace map (prefix 'p') for the PAGE content schema; not referenced by the
# cells visible here.
_ns = {
    'p': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2013-07-15',
}

# Folder that is globbed for the '*.jpg' page images.
dataset_folder = 'pinkas_images/'
# Folder holding one '<image stem>.xml' file per page image.
xml_folder = 'pinkas_xml/'
# NOTE(review): presumably the destination for cropped word images; not used in
# the visible cells -- confirm.
output_folder = 'pinkas_word_images/'


In [3]:
def get_page_filename(image_filename: str) -> str:
    """Return the path of the ``.xml`` file that sits next to an image.

    The result lives in the same directory as ``image_filename`` and has the
    same file name with its extension replaced by ``.xml``.

    The extension is removed with ``os.path.splitext`` rather than by slicing
    off a fixed four characters, so ``.jpeg``/``.tiff`` (or any other length)
    work as well as ``.jpg``.

    :param image_filename: path to the image file
    :return: path to the sibling XML file
    """
    directory, basename = os.path.split(image_filename)
    stem = os.path.splitext(basename)[0]
    return os.path.join(directory, '{}.xml'.format(stem))


def get_basename(image_filename: str) -> str:
    """Return the file name of ``image_filename`` up to its first dot.

    The directory part is discarded, and everything from the first ``.`` in
    the file name onwards is dropped (so ``page.1.jpg`` yields ``page``).

    :param image_filename: path to a file
    :return: leading part of the file name, before the first dot
    """
    filename = os.path.basename(image_filename)
    stem, _, _ = filename.partition('.')
    return stem


def save_and_resize(img: np.ndarray, filename: str, size=None) -> None:
    """Write ``img`` to ``filename``, optionally rescaled first.

    :param img: image array; its first two axes are read as (height, width)
    :param filename: destination path handed to ``cv2.imwrite``
    :param size: scale factor applied to both axes, or ``None`` to write the
        image unchanged
    """
    if size is not None:
        height, width = img.shape[:2]
        target = (int(width * size), int(height * size))
        if size > 0:
            # A small positive factor can truncate an axis to 0 px, which
            # cv2.resize rejects; keep at least one pixel per axis.
            # Non-positive factors are passed through untouched so that
            # cv2 still reports them as errors.
            target = (max(1, target[0]), max(1, target[1]))
        img = cv2.resize(img, target, interpolation=cv2.INTER_LINEAR)
    cv2.imwrite(filename, img)
        
def xml_to_coordinates(t):
    """Parse a whitespace-separated ``"x,y x,y ..."`` string into an array.

    Each coordinate is read as a float and truncated to ``int``.

    :param t: points string, e.g. ``"10,20 30.5,40"``
    :return: integer array of shape ``(n_points, 2)``; shape ``(0, 2)`` when
        ``t`` contains no points
    :raises AssertionError: if a token is not exactly two comma-separated values
    :raises ValueError: if a value cannot be parsed as a number
    """
    result = []
    # split() with no argument ignores repeated, leading and trailing
    # whitespace, which split(' ') would turn into empty (invalid) tokens.
    for p in t.split():
        values = p.split(',')
        assert len(values) == 2, 'expected "x,y" but got {!r}'.format(p)
        result.append((int(float(values[0])), int(float(values[1]))))
    if not result:
        # Keep the two-column layout even when there are no points.
        return np.empty((0, 2), dtype=int)
    return np.array(result)

In [4]:
# glob() returns matches in arbitrary order; sort them so the pages are
# processed in the same order on every run.
image_filenames_list = sorted(glob(os.path.join(dataset_folder, '*.jpg')))

In [5]:
# Collect the text of every XML element that sits exactly six levels below the
# root of each page's XML file.
# NOTE(review): presumably these are the word-level transcription nodes of the
# PAGE XML -- confirm against the schema.
word_labels = []
for image_filename in image_filenames_list:
    # Only the image's file name is needed to locate its XML; the image itself
    # is no longer decoded here because nothing in this loop used the pixels.
    page_id = os.path.splitext(os.path.basename(image_filename))[0]
    root = ET.parse(os.path.join(xml_folder, page_id + '.xml')).getroot()
    # Six '*' steps select the same elements, in the same document order, as
    # six nested "for child in parent" loops starting at the root.
    word_labels.extend(node.text for node in root.findall('*/*/*/*/*/*'))


In [14]:
def save_wordcloud(word_labels, font_path='arial.ttf', output_path="pinkas_wordcloud.png"):
    """Render ``word_labels`` as a word-cloud image and write it to disk.

    The labels are joined with spaces, passed through
    ``arabic_reshaper.reshape`` and ``get_display``, and then handed to
    ``WordCloud.generate``.

    :param word_labels: iterable of label strings; ``None`` and empty entries
        are skipped
    :param font_path: font file passed to ``WordCloud``
    :param output_path: where the rendered image is written
    """
    # Use the argument itself: the function previously read a global that is
    # not defined anywhere in the notebook cells shown, so it only worked with
    # leftover kernel state.
    text = ' '.join(filter(None, word_labels))
    reshaped_text = arabic_reshaper.reshape(text)
    bidi_text = get_display(reshaped_text)
    wordcloud = WordCloud(
        font_path=font_path,
        background_color='white',
        mode='RGB',
        width=2000,
        height=1000,
        collocations=False,
    ).generate(bidi_text)
    wordcloud.to_file(output_path)

In [15]:
# Render the collected labels as a word cloud and write the image file.
save_wordcloud(word_labels=word_labels)