In [1]:
import glob
import nltk
import random

from collections import Counter

from pagexml.parser import parse_pagexml_file
from pagexml.model.physical_document_model import pretty_print_textregion

# Root folder that is searched for scan directories.
data_dir = '../pagexml'
# Keep only the entries directly under data_dir whose name ends in a digit.
scan_dirs = glob.glob(data_dir + '/*[0-9]')

def scan_paths(scan_dir: str) -> list:
    """Return the paths of the ``.xml`` files directly inside ``scan_dir``.

    ``glob.glob`` yields matches in an arbitrary, filesystem-dependent order,
    so the result is sorted to make ``paths[0]`` / ``scans[0]`` the same file
    on every run and every machine.

    :param scan_dir: directory to look in (not searched recursively).
    :return: sorted list of matching paths; empty if nothing matches.
    """
    return sorted(glob.glob(f'{scan_dir}/*.xml'))

In [2]:
# How many scan directories the glob pattern matched.
len(scan_dirs)

722

In [3]:
# Pick one scan directory at random. random.choice() only ever returns an
# existing element, whereas random.randint(0, len(scan_dirs)) includes its
# upper bound and could therefore yield an out-of-range index (IndexError).
# NOTE: the name `dir` shadows the builtin; it is kept because the parsing
# cells further down read it.
dir = random.choice(scan_dirs)
# All PageXML files of the chosen directory.
paths = scan_paths(dir)

In [4]:
import concurrent.futures

## Parse the PageXML files with multiple processes

In [5]:
%%time
print(f'Parsing {len(paths)} scans in directory {dir}')
with concurrent.futures.ProcessPoolExecutor() as executor:
    scans = list(executor.map(parse_pagexml_file, paths))
print()
print(len(scans))
scans[0].metadata

Parsing 245 scans in directory ../pagexml/2386_NOTD00254

245
CPU times: user 812 ms, sys: 357 ms, total: 1.17 s
Wall time: 2.43 s


{'Creator': 'Transkribus',
 'Created': datetime.datetime(2020, 12, 8, 13, 40, 18, 321000, tzinfo=tzoffset(None, 3600)),
 'LastChange': datetime.datetime(2020, 12, 8, 13, 40, 23, 439000, tzinfo=tzoffset(None, 3600)),
 'filename': '../pagexml/2386_NOTD00254/NOTD00254000001.xml'}

## Parse the PageXML files with multiple threads

In [6]:
%%time
print(f'Parsing {len(paths)} scans in directory {dir}')
with concurrent.futures.ThreadPoolExecutor() as executor:
    scans = list(executor.map(parse_pagexml_file, paths))
print()
print(len(scans))
scans[0].metadata

Parsing 245 scans in directory ../pagexml/2386_NOTD00254

245
CPU times: user 2.16 s, sys: 337 ms, total: 2.5 s
Wall time: 2.21 s


{'Creator': 'Transkribus',
 'Created': datetime.datetime(2020, 12, 8, 13, 40, 18, 321000, tzinfo=tzoffset(None, 3600)),
 'LastChange': datetime.datetime(2020, 12, 8, 13, 40, 23, 439000, tzinfo=tzoffset(None, 3600)),
 'filename': '../pagexml/2386_NOTD00254/NOTD00254000001.xml'}

## Parse the PageXML files sequentially

In [7]:
%%time
print(f'Parsing {len(paths)} scans in directory {dir}')
scans = [parse_pagexml_file(p) for p in paths]
print()
print(len(scans))
scans[0].metadata

Parsing 245 scans in directory ../pagexml/2386_NOTD00254

245
CPU times: user 2.01 s, sys: 151 ms, total: 2.16 s
Wall time: 3.03 s


{'Creator': 'Transkribus',
 'Created': datetime.datetime(2020, 12, 8, 13, 40, 18, 321000, tzinfo=tzoffset(None, 3600)),
 'LastChange': datetime.datetime(2020, 12, 8, 13, 40, 23, 439000, tzinfo=tzoffset(None, 3600)),
 'filename': '../pagexml/2386_NOTD00254/NOTD00254000001.xml'}

In [8]:
# Metadata of the first parsed scan.
scans[0].metadata

{'Creator': 'Transkribus',
 'Created': datetime.datetime(2020, 12, 8, 13, 40, 18, 321000, tzinfo=tzoffset(None, 3600)),
 'LastChange': datetime.datetime(2020, 12, 8, 13, 40, 23, 439000, tzinfo=tzoffset(None, 3600)),
 'filename': '../pagexml/2386_NOTD00254/NOTD00254000001.xml'}

In [9]:
# Tally the main_type value of every parsed scan, most frequent first.
Counter(parsed.main_type for parsed in scans).most_common()

[('scan', 245)]

In [10]:
# Tally the id value of every parsed scan, most frequent first.
Counter(parsed.id for parsed in scans).most_common()

[(None, 245)]

In [11]:
# Tally the parent value of every parsed scan, most frequent first.
Counter(parsed.parent for parsed in scans).most_common()

[(None, 245)]

In [12]:
# Tally the orientation value of every parsed scan, most frequent first.
Counter(parsed.orientation for parsed in scans).most_common()

[(None, 245)]

In [13]:
# Flatten the words of all scans into one list, keeping scan order.
words = [w for scan in scans for w in scan.get_words()]
# Lower-cased copy of the words, used for the n-gram counts.
lwords = [w.lower() for w in words]
# Kept as the LAST expression so the notebook actually displays the count;
# in the middle of the cell the value was computed and silently discarded.
len(words)

In [14]:
# Count adjacent word pairs. Counter consumes the bigram iterator directly,
# so no intermediate list is needed.
bc = Counter(nltk.bigrams(lwords))
# Show only the 50 most frequent pairs: listing every distinct bigram floods
# the notebook output. The complete counts remain available in `bc`.
bc.most_common(50)

[(('winter', 'n.p.'), 105),
 (('mits', 'desen'), 102),
 (('de', 'winter'), 101),
 (('van', 'hen'), 90),
 (('van', 'jan'), 86),
 (('voor', 'mij'), 84),
 (('binnen', 'deser'), 84),
 (('soude', 'mogen'), 84),
 (('j:de', 'winter'), 83),
 (('de', 'voorsz.'), 78),
 (('mij', 'notario'), 74),
 (('dat', 'alle'), 71),
 (('binnen', 'amstelredamme'), 70),
 (('mij', 'jacob'), 68),
 (('caroli', 'guldens'), 68),
 (('somme', 'van'), 66),
 (('die', 'sij'), 66),
 (('getuijgen', 'hier'), 59),
 (('jacob', 'de'), 58),
 (('de', 'somme'), 56),
 (('ende', 'dat'), 53),
 (('gedaen', 'binnen'), 53),
 (('te', 'mogen'), 52),
 (('presentie', 'van'), 52),
 (('geboorte', 'desselfs'), 50),
 (('tot', 'hare'), 50),
 (('dat', 'de'), 50),
 (('aldus', 'gedaen'), 50),
 (('off', 'bij'), 50),
 (('alle', 'andere'), 49),
 (('des', 'doots,'), 49),
 (('ter', 'presentie'), 49),
 (('ende', 'uijterste'), 48),
 (('soo', 'die'), 48),
 (('versocht', 'ende'), 48),
 (('ende', 'in'), 48),
 (('vander', 'geboorte'), 47),
 (('sij', 'testatri

In [15]:
# Count runs of three consecutive words. Counter consumes the trigram iterator
# directly, so no intermediate list is needed.
tc = Counter(nltk.trigrams(lwords))
# Show only the 50 most frequent trigrams: listing every distinct one floods
# the notebook output. The complete counts remain available in `tc`.
tc.most_common(50)

[(('j:de', 'winter', 'n.p.'), 72),
 (('voor', 'mij', 'jacob'), 67),
 (('mij', 'jacob', 'de'), 55),
 (('de', 'somme', 'van'), 52),
 (('aldus', 'gedaen', 'binnen'), 47),
 (('ter', 'presentie', 'van'), 46),
 (('jacob', 'de', 'winter'), 44),
 (('de', 'winter', 'binnen'), 43),
 (('vander', 'geboorte', 'desselfs'), 42),
 (('schoon', 'dat', 'alle'), 40),
 (('inden', 'jare', 'vander'), 38),
 (('jare', 'vander', 'geboorte'), 37),
 (('binnen', 'deser', 'stede'), 37),
 (('gelijck', 'sulcx', 'uijterlijck'), 35),
 (('geboorte', 'desselfs', 'onses'), 33),
 (('testament', 'ende', 'uijterste'), 33),
 (('j:', 'de', 'winter'), 32),
 (('desselfs', 'onses', 'heeren'), 31),
 (('alle', 'twelck', 'voorsz.'), 31),
 (('inde', 'quohieren', 'vande'), 30),
 (('binnen', 'deser', 'stede,'), 30),
 (('en', 'mochten', 'sijn'), 29),
 (('binnen', 'amstelredamme', 'residerende'), 28),
 (('residerende', 'openbaer', 'notaris'), 28),
 (('soo', 'die', 'best'), 28),
 (('gelooffwaerdige', 'getuijgen', 'hier'), 28),
 (('ende', 

In [16]:
# Each distinct trigram joined into one space-separated string, sorted.
st = sorted(map(' '.join, tc))

In [17]:
# Trigram strings that begin with 'te ' (including the trailing space).
[trigram for trigram in st if trigram.startswith('te ')]

['te 1ebben gelijck',
 'te 470 te',
 'te 5 ii',
 'te achtervolgen alle',
 'te achtervolgen voldoen',
 'te adcumeren en',
 'te adsmmeren en',
 'te adsumorei off',
 'te adsumoren off',
 "te aenvaerden d'andere",
 'te aliamentarin in',
 'te aliementeren en',
 'te aliementeren, in',
 'te aliemerter in',
 'te alimenta ren',
 'te alimenteren in',
 'te alimenteren, in',
 'te alimenterlnj in',
 'te ampaerteren. mits',
 'te ander houden',
 'te ander huwelijck,',
 'te approberen en',
 'te approberen ende',
 'te approberen, ende',
 'te aprobeen en',
 'te aproberen ende',
 'te articiparen, off',
 'te asproberen e',
 'te avaerden gevande',
 'te bchoren, sonder',
 'te bedde laggen',
 'te bedde leggende',
 'te bedde leggende,',
 'te bedde leggende,/',
 'te begeven deschel',
 'te begeven, deselve',
 'te begeven, desen',
 'te behoren, alle',
 'te behoren, elle',
 'te behoren. alle',
 'te behorende als',
 'te behouden, soo',
 'te beleggen op',
 'te belevem den',
 'te bercolijen /',
 'te bescharmen, dat'

In [18]:
# Print the coordinates, text and baseline of every line of the first scan.
scan = scans[0]
lines = scan.get_lines()
for l in lines:
    # `c` is a module-level name: after the loop it still holds the coords of
    # the last line, and the following cell prints its fields.
    c = l.coords
    # Output shape: (x,y)-(x+w,y+h) : text baseline
    # presumably the top-left and bottom-right corners of the bounding box —
    # TODO confirm against the coords class.
    print(f'({c.x},{c.y})-({c.x+c.w},{c.y+c.h}) : {l.text} {l.baseline}')

(1681,139)-(1732,242) : ƒ Baseline(points="1691,228 1707,231")
(1543,949)-(1589,1142) : None Baseline(points="1551,1146 1581,1146")
(1176,936)-(1375,1191) : ƒ Baseline(points="1187,1151 1212,1154 1238,1155 1264,1154 1289,1153 1315,1152 1341,1151 1367,1151")
(938,956)-(1073,1198) : ƒ Baseline(points="963,1164 995,1152 1027,1152 1059,1149")
(436,2166)-(1722,2405) : DSamentele Baseline(points="444,2333 507,2336 570,2339 633,2343 697,2346 760,2350 823,2354 887,2357 950,2361 1013,2364 1077,2366 1140,2369 1203,2370 1266,2372 1330,2373 1393,2373 1456,2373 1520,2371 1583,2369 1646,2366 1710,2363")
(1456,2416)-(1693,2572) : e Baseline(points="1464,2532 1491,2532 1518,2533 1545,2535 1572,2538 1599,2539 1626,2541 1653,2541 1680,2541")
(1185,2619)-(1259,2735) : d Baseline(points="1193,2720 1242,2724")
(1077,2954)-(1741,3135) : J:de Winter N.P. Baseline(points="1085,3071 1117,3073 1149,3076 1182,3078 1214,3080 1247,3081 1279,3083 1311,3083 1344,3084 1376,3084 1409,3084 1441,3084 1473,3084 1506,3083

In [19]:
# Print x, y, w and h of `c` (the coords left over from the line loop),
# one value per line.
print(c.x, c.y, c.w, c.h, sep='\n')


1077
2954
664
181
