In [1]:
import glob
import nltk
import random

from collections import Counter

from pagexml.parser import parse_pagexml_file
from pagexml.model.physical_document_model import pretty_print_textregion

# Location of the PageXML data, relative to the notebook's working directory.
data_dir = '../../pagexml'
# Every entry directly under data_dir whose name ends in a digit.
scan_dirs = glob.glob(data_dir + '/*[0-9]')

def scan_paths(scan_dir:str) -> list:
    """Return the paths of the ``.xml`` files directly inside ``scan_dir``.

    The paths are returned in sorted order. ``glob.glob`` makes no promise
    about ordering, so sorting keeps "the first scan" the same on every run
    and on every file system.

    Args:
        scan_dir: Directory to look in (not searched recursively).

    Returns:
        Sorted list of matching paths, each prefixed with ``scan_dir``;
        an empty list when nothing matches.
    """
    return sorted(glob.glob(f'{scan_dir}/*.xml'))

In [2]:
# Number of scan directories matched by the glob pattern above.
len(scan_dirs)

719

In [3]:
# Pick one scan directory at random and list its PageXML files.
# random.choice() replaces scan_dirs[random.randint(0, len(scan_dirs))]:
# randint() includes its upper bound, so that index could be len(scan_dirs)
# and raise IndexError.
# NOTE: `dir` shadows the built-in dir(); the name is kept because the
# parsing cells further down print it.
dir = random.choice(scan_dirs)
paths = scan_paths(dir)

In [4]:
import concurrent.futures

## Parse the PageXML files with multiple processes

In [5]:
%%time
print(f'Parsing {len(paths)} scans in directory {dir}')
with concurrent.futures.ProcessPoolExecutor() as executor:
    scans = list(executor.map(parse_pagexml_file, paths))
print()
print(len(scans))
scans[0].metadata

Parsing 247 scans in directory ../../pagexml/2384_NOTD00252

247
CPU times: user 882 ms, sys: 1.8 s, total: 2.68 s
Wall time: 3.52 s


{'Creator': 'Transkribus',
 'Created': datetime.datetime(2020, 12, 8, 13, 17, 22, 402000, tzinfo=tzoffset(None, 3600)),
 'LastChange': datetime.datetime(2020, 12, 8, 13, 17, 28, 952000, tzinfo=tzoffset(None, 3600)),
 'filename': '../../pagexml/2384_NOTD00252/NOTD00252000001.xml'}

## Parse the PageXML files with multiple threads

In [6]:
%%time
print(f'Parsing {len(paths)} scans in directory {dir}')
with concurrent.futures.ThreadPoolExecutor() as executor:
    scans = list(executor.map(parse_pagexml_file, paths))
print()
print(len(scans))
scans[0].metadata

Parsing 247 scans in directory ../../pagexml/2384_NOTD00252

247
CPU times: user 2.16 s, sys: 928 ms, total: 3.09 s
Wall time: 2.73 s


{'Creator': 'Transkribus',
 'Created': datetime.datetime(2020, 12, 8, 13, 17, 22, 402000, tzinfo=tzoffset(None, 3600)),
 'LastChange': datetime.datetime(2020, 12, 8, 13, 17, 28, 952000, tzinfo=tzoffset(None, 3600)),
 'filename': '../../pagexml/2384_NOTD00252/NOTD00252000001.xml'}

## Parse the PageXML files sequentially

In [7]:
%%time
print(f'Parsing {len(paths)} scans in directory {dir}')
scans = [parse_pagexml_file(p) for p in paths]
print()
print(len(scans))
scans[0].metadata

Parsing 247 scans in directory ../../pagexml/2384_NOTD00252

247
CPU times: user 2 s, sys: 219 ms, total: 2.22 s
Wall time: 3.16 s


{'Creator': 'Transkribus',
 'Created': datetime.datetime(2020, 12, 8, 13, 17, 22, 402000, tzinfo=tzoffset(None, 3600)),
 'LastChange': datetime.datetime(2020, 12, 8, 13, 17, 28, 952000, tzinfo=tzoffset(None, 3600)),
 'filename': '../../pagexml/2384_NOTD00252/NOTD00252000001.xml'}

In [8]:
# Display the metadata of the first parsed scan.
scans[0].metadata

{'Creator': 'Transkribus',
 'Created': datetime.datetime(2020, 12, 8, 13, 17, 22, 402000, tzinfo=tzoffset(None, 3600)),
 'LastChange': datetime.datetime(2020, 12, 8, 13, 17, 28, 952000, tzinfo=tzoffset(None, 3600)),
 'filename': '../../pagexml/2384_NOTD00252/NOTD00252000001.xml'}

In [9]:
# Tally the `main_type` of every scan, most frequent value first.
Counter(scan.main_type for scan in scans).most_common()

[('scan', 247)]

In [10]:
# Tally the `id` of every scan, most frequent value first.
Counter(scan.id for scan in scans).most_common()

[(None, 247)]

In [11]:
# Tally the `parent` of every scan, most frequent value first.
Counter(scan.parent for scan in scans).most_common()

[(None, 247)]

In [12]:
# Tally the `orientation` of every scan, most frequent value first.
Counter(scan.orientation for scan in scans).most_common()

[(None, 247)]

In [13]:
# Flatten the words of all scans into one list, keeping scan order.
words = [w for scan in scans for w in scan.get_words()]
# Lower-cased copy of the words, used for the n-gram counts in later cells.
lwords = [w.lower() for w in words]
# Total word count. It sits last so the notebook displays it; as a mid-cell
# expression its value was computed and silently discarded.
len(words)

In [14]:
# Count adjacent word pairs in the lower-cased word list. Counter consumes the
# bigram iterator directly, so the extra list comprehension was redundant.
bc = Counter(nltk.bigrams(lwords))
# Display only the 50 most frequent bigrams: most_common() without an argument
# returns every distinct bigram and floods the notebook output.
bc.most_common(50)

[(('mits', 'desen'), 107),
 (('binnen', 'deser'), 105),
 (('van', 'hen'), 104),
 (('de', 'voorsz.'), 88),
 (('winter', 'n.p.'), 84),
 (('somme', 'van'), 80),
 (('j:de', 'winter'), 78),
 (('soude', 'mogen'), 77),
 (('de', 'winter'), 75),
 (('de', 'somme'), 74),
 (('voor', 'mij'), 71),
 (('van', 'haer'), 69),
 (('ende', 'dat'), 69),
 (('mij', 'notario'), 67),
 (('te', 'mogen'), 67),
 (('binnen', 'amstelredamme'), 67),
 (('jacob', 'de'), 65),
 (('caroli', 'guldens'), 63),
 (('mij', 'jacob'), 62),
 (('die', 'sij'), 62),
 (('dat', 'alle'), 61),
 (('in', 'desen'), 60),
 (('te', 'sijn'), 57),
 (('bij', 'de'), 57),
 (('van', 'een'), 57),
 (('gedaen', 'binnen'), 56),
 (('dat', 'de'), 56),
 (('getuijgen', 'hier'), 55),
 (('na', 'rechten'), 54),
 (('aldus', 'gedaen'), 53),
 (('deser', 'stede'), 52),
 (('off', 'bij'), 52),
 (('bij', 'haer'), 51),
 (('sij', 'testatrice'), 50),
 (('gelijck', 'sulcx'), 49),
 (('sal', 'werden'), 49),
 (('van', 'jan'), 49),
 (('residerende', 'openbaer'), 47),
 (('op', 

In [15]:
# Count word triples in the lower-cased word list. Counter consumes the
# trigram iterator directly, so the extra list comprehension was redundant.
tc = Counter(nltk.trigrams(lwords))
# Display only the 50 most frequent trigrams: most_common() without an
# argument returns every distinct trigram and floods the notebook output.
tc.most_common(50)

[(('de', 'somme', 'van'), 69),
 (('voor', 'mij', 'jacob'), 62),
 (('mij', 'jacob', 'de'), 62),
 (('j:de', 'winter', 'n.p.'), 59),
 (('aldus', 'gedaen', 'binnen'), 49),
 (('binnen', 'deser', 'stede'), 45),
 (('binnen', 'deser', 'stede,'), 43),
 (('gelijck', 'sulcx', 'uijterlijck'), 39),
 (('jacob', 'de', 'winter'), 38),
 (('mits', 'desen', 'gemaeckt'), 37),
 (('de', 'winter', 'binnen'), 35),
 (('ter', 'presentie', 'van'), 35),
 (('desselfs', 'onses', 'heeren'), 34),
 (('inden', 'jare', 'vander'), 33),
 (('getuijgen', 'hier', 'toe'), 32),
 (('vander', 'geboorte', 'desselfs'), 32),
 (('geboorte', 'desselfs', 'onses'), 32),
 (('ende', 'op', 'een'), 32),
 (('binnen', 'amstelredamme', 'residerende'), 31),
 (('gelooffwaerdige', 'getuijgen', 'hier'), 31),
 (('gedaen', 'binnen', 'amstelredamme'), 31),
 (('compareerden', 'voor', 'mij'), 30),
 (('werden', 'onderhouden', 'ende'), 30),
 (('jare', 'vander', 'geboorte'), 29),
 (('te', 'hebben', 'voor'), 29),
 (('soo', 'die', 'best'), 29),
 (('in', 'd

In [21]:
# Count 4-grams in the lower-cased word list. Counter consumes the n-gram
# iterator directly, so the extra list comprehension was redundant.
# NOTE: this rebinds `tc`, which held the trigram counts until now; the name
# is kept because the following cell reads the 4-gram counts from it.
tc = Counter(nltk.ngrams(lwords, 4))
# Display only the 50 most frequent 4-grams: most_common() without an
# argument returns every distinct 4-gram and floods the notebook output.
tc.most_common(50)

[(('voor', 'mij', 'jacob', 'de'), 62),
 (('mij', 'jacob', 'de', 'winter'), 35),
 (('jacob', 'de', 'winter', 'binnen'), 35),
 (('compareerden', 'voor', 'mij', 'jacob'), 30),
 (('geboorte', 'desselfs', 'onses', 'heeren'), 29),
 (('soo', 'die', 'best', 'bestaen'), 27),
 (('sal', 'werden', 'onderhouden', 'ende'), 27),
 (('binnen', 'amstelredamme', 'residerende', 'openbaer'), 27),
 (('gehouden', 'te', 'hebben', 'voor'), 26),
 (('off', 'bevonden', 'soude', 'mogen'), 26),
 (('in', 'desen', 'van', 'noden'), 26),
 (('inden', 'jare', 'vander', 'geboorte'), 25),
 (('vander', 'geboorte', 'desselfs', 'onses'), 25),
 (('aldus', 'gedaen', 'binnen', 'amstelredamme'), 25),
 (('daer', 'mede', 'te', 'mogen'), 23),
 (('binnen', 'deser', 'stede', 'mij'), 23),
 (('ter', 'presentie', 'van', 'jan'), 23),
 (('ende', 'op', 'een', 'nieuws'), 23),
 (('compareerde', 'voor', 'mij', 'jacob'), 22),
 (('alle', 'andere', 'soorten', 'van'), 22),
 (('jare', 'vander', 'geboorte', 'desselfs'), 22),
 (('alwaert', 'schoon', 

In [22]:
# Join each distinct n-gram in `tc` into one space-separated string and sort them.
st = sorted(' '.join(gram) for gram in tc)

In [23]:
# Keep the joined n-grams whose first token is exactly 'te'.
[joined for joined in st if joined.startswith('te ')]

['te  werden va',
 'te 103 ƒ ƒ0',
 'te 2gs 289 inden',
 'te 364 te sijn,',
 'te achtervolgen voldoen en',
 'te adsimen off surrogeri',
 'te aen ijenandter werelt',
 'te aenvaerden, en beneficeren,',
 'te aet doende tot',
 'te afpro¬ beren ende',
 'te alimentenen, in kost,',
 'te almmenteren, in kost',
 'te alsdan nogh in',
 'te aosmmeren off surrogeren',
 'te approberen en van',
 'te approberen ende van',
 'te approberen/ ende van',
 'te aproberen, met jusertie,',
 'te avesen taerlieder lespective',
 'te bchoren,/ alle twelcke',
 'te be welen, ende',
 'te bedde leggende dogh',
 'te bedde leggende ende',
 'te bedde leggende nogh',
 'te beddeleggende dogh andersints',
 'te bee gen, ende',
 'te begeven / deselve',
 'te begeven, deselve als',
 'te begeven, desen testapeure',
 'te begevent deser dan,',
 "te behoren, alletwelcke d'ses",
 'te behoren, deselve hare',
 'te behoren, ende de',
 'te bekeeren en enceldijcken',
 'te beleen den sterffdagh',
 'te beleven den steeffdagh',
 'te betalen 

In [24]:
# Take the first parsed scan and display its `json` attribute.
scan = scans[0]
scan.json

{'id': None,
 'type': ['pagexml_doc', 'text_region', 'scan'],
 'metadata': {'Creator': 'Transkribus',
  'Created': datetime.datetime(2020, 12, 8, 13, 17, 22, 402000, tzinfo=tzoffset(None, 3600)),
  'LastChange': datetime.datetime(2020, 12, 8, 13, 17, 28, 952000, tzinfo=tzoffset(None, 3600)),
  'filename': '../../pagexml/2384_NOTD00252/NOTD00252000001.xml'},
 'reading_order': {0: 'r1'},
 'coords': [(0, 0), (2330, 0), (2330, 3390), (0, 3390)],
 'text_regions': [{'id': 'r1',
   'type': ['pagexml_doc', 'text_region'],
   'metadata': {'reading_order': {'index': '0'},
    'parent_type': 'scan',
    'parent_id': None,
    'text_region_id': None,
    'scan_id': None},
   'coords': [(453, 1061), (453, 3155), (1892, 3155), (1892, 1061)],
   'lines': [{'id': 'r1l1',
     'type': ['pagexml_doc', 'line'],
     'metadata': {'reading_order': {'index': '0'}, 'type': 'line'},
     'coords': [(792, 1144),
      (923, 1171),
      (991, 1127),
      (1030, 1153),
      (1095, 1130),
      (1317, 1156),
 

In [29]:
# One line per scan: its file name and the text-region count from its stats.
for scan in scans:
    summary = f'{scan.metadata["filename"]} - {scan.stats["text_regions"]} textregions'
    print(summary)

../../pagexml/2384_NOTD00252/NOTD00252000001.xml - 1 textregions
../../pagexml/2384_NOTD00252/NOTD00252000002.xml - 2 textregions
../../pagexml/2384_NOTD00252/NOTD00252000003.xml - 1 textregions
../../pagexml/2384_NOTD00252/NOTD00252000004.xml - 3 textregions
../../pagexml/2384_NOTD00252/NOTD00252000005.xml - 4 textregions
../../pagexml/2384_NOTD00252/NOTD00252000006.xml - 2 textregions
../../pagexml/2384_NOTD00252/NOTD00252000007.xml - 3 textregions
../../pagexml/2384_NOTD00252/NOTD00252000008.xml - 2 textregions
../../pagexml/2384_NOTD00252/NOTD00252000009.xml - 2 textregions
../../pagexml/2384_NOTD00252/NOTD00252000010.xml - 3 textregions
../../pagexml/2384_NOTD00252/NOTD00252000011.xml - 3 textregions
../../pagexml/2384_NOTD00252/NOTD00252000012.xml - 2 textregions
../../pagexml/2384_NOTD00252/NOTD00252000013.xml - 2 textregions
../../pagexml/2384_NOTD00252/NOTD00252000014.xml - 7 textregions
../../pagexml/2384_NOTD00252/NOTD00252000015.xml - 5 textregions
../../pagexml/2384_NOTD00

In [35]:
import pandas as pd
# Keys of scan.stats to tabulate; they double as the DataFrame column names.
stat_columns = ["lines", "words", "text_regions", "columns", "extra", "pages"]
# One row per scan: the file name followed by the selected stats.
data = [
    [scan.metadata["filename"], *(scan.stats[key] for key in stat_columns)]
    for scan in scans
]
# Display the table ordered by word count, smallest first.
pd.DataFrame(data, columns=["Filename", *stat_columns]).sort_values(by='words')

Unnamed: 0,Filename,lines,words,text_regions,columns,extra,pages
246,../../pagexml/2384_NOTD00252/NOTD00252000247.xml,0,0,0,0,0,0
245,../../pagexml/2384_NOTD00252/NOTD00252000246.xml,0,0,0,0,0,0
152,../../pagexml/2384_NOTD00252/NOTD00252000153.xml,2,2,2,0,0,0
73,../../pagexml/2384_NOTD00252/NOTD00252000074.xml,2,2,2,0,0,0
121,../../pagexml/2384_NOTD00252/NOTD00252000122.xml,3,3,3,0,0,0
...,...,...,...,...,...,...,...
96,../../pagexml/2384_NOTD00252/NOTD00252000097.xml,69,561,2,0,0,0
126,../../pagexml/2384_NOTD00252/NOTD00252000127.xml,75,592,2,0,0,0
199,../../pagexml/2384_NOTD00252/NOTD00252000200.xml,86,610,2,0,0,0
154,../../pagexml/2384_NOTD00252/NOTD00252000155.xml,90,632,1,0,0,0


In [None]:
# For every line of the first scan, print two corner points derived from its
# coords (x, y) and (x + w, y + h), followed by the line's text and baseline.
scan = scans[0]
lines = scan.get_lines()
for line in lines:
    c = line.coords  # `c` stays bound after the loop; the next cell reads it
    x_end, y_end = c.x + c.w, c.y + c.h
    print(f'({c.x},{c.y})-({x_end},{y_end}) : {line.text} {line.baseline}')

In [None]:
# Print x, y, w and h of `c`, one value per line. `c` is whatever the loop in
# the previous cell assigned last.
print(c.x, c.y, c.w, c.h, sep='\n')
