In [1]:
import csv
from dataclasses import dataclass

@dataclass
class ScanData:
    """Simple class for scan data"""
    id: int
    title: str
    number: int
    previewImage: str
    xmlKey: str
    imageURL: str

@dataclass
class WorkData:
    """Data from werkvoorraad"""
    rubric: int
    inventoryNumber: int
    serialNumber:str
    title:str
    writer:str
    numberOfScans:int
    status:str

@dataclass
class Writer:
    id:str
    name:str
        
@dataclass
class Archive:
    inventoryNumber: int
    serialNumber: str
    title: str
    numberOfScans: int
    status: str


In [2]:
file='../pagexml/scans_20210403.csv'

with open(file) as f:
    reader = csv.reader(f)
    data = [tuple(row) for row in reader]
        
sdl =  [ScanData(*sd) for sd in data[1:]]

In [3]:
file = '../golden-agents-htr/resources/werkvoorraad.csv'
with open(file) as f:
    reader = csv.reader(f,delimiter=';')
    data = [tuple(row) for row in reader]
    
wdl = [WorkData(*wd) for wd in data[1:]]

In [4]:
sdl

[ScanData(id='605529', title='10021_NOTA00961', number='1', previewImage='https://files.transkribus.eu/iiif/2/CJHHGXALGVYHDROFTCJHJDMD/full/128,/0/default.jpg', xmlKey='https://files.transkribus.eu/Get?id=EYSQSQIQQOHZANOLAEPDNKGN', imageURL='https://files.transkribus.eu/Get?id=CJHHGXALGVYHDROFTCJHJDMD&fileType=view'),
 ScanData(id='605529', title='10021_NOTA00961', number='2', previewImage='https://files.transkribus.eu/iiif/2/XOUSVEERBXTUKGRSXONVFSHC/full/128,/0/default.jpg', xmlKey='https://files.transkribus.eu/Get?id=LYVJUPEVOBKGPEOYMUGMWZZR', imageURL='https://files.transkribus.eu/Get?id=XOUSVEERBXTUKGRSXONVFSHC&fileType=view'),
 ScanData(id='605529', title='10021_NOTA00961', number='3', previewImage='https://files.transkribus.eu/iiif/2/BBFQXZQYYILAZAZFKFFEGSOR/full/128,/0/default.jpg', xmlKey='https://files.transkribus.eu/Get?id=KIRSCTTOOFAQFFVZWXYDBTZM', imageURL='https://files.transkribus.eu/Get?id=BBFQXZQYYILAZAZFKFFEGSOR&fileType=view'),
 ScanData(id='605529', title='10021_NOTA

In [5]:
len(sdl)

275327

In [6]:
wdl

[WorkData(rubric='8', inventoryNumber='173', serialNumber='KLAF00792', title='173_KLAF00792', writer='JAN FRANSSEN BRUIJNINGH', numberOfScans='110', status='GT'),
 WorkData(rubric='20', inventoryNumber='494', serialNumber='A26620', title='494_A26620', writer='JACOB JANSZ WESTFRISIUS', numberOfScans='284', status='GT'),
 WorkData(rubric='20', inventoryNumber='509', serialNumber='A21956', title='509_A21956', writer='JACOB JANSZ WESTFRISIUS', numberOfScans='282', status='GT'),
 WorkData(rubric='48', inventoryNumber='1165', serialNumber='A14251', title='1165_A14251', writer='JOOST VAN DE VEN', numberOfScans='285', status='GT'),
 WorkData(rubric='49', inventoryNumber='1183', serialNumber='NOTG00415', title='1183_NOTG00415', writer='JAN DE VOS', numberOfScans='134', status='GT'),
 WorkData(rubric='54', inventoryNumber='1278', serialNumber='A31203', title='1278_A31203', writer='HENDRIK SCHAEF', numberOfScans='212', status='HTR'),
 WorkData(rubric='54', inventoryNumber='1279', serialNumber='A3

In [7]:
len(wdl)

717

In [8]:
uniqueScanDataTitles = set([sd.title for sd in sdl])
uniqueWordDataTitle = set([wd.title for wd in wdl])
print(len(uniqueScanDataTitles))
print(len(uniqueWordDataTitle))

724
716


In [9]:
wdl[-1]

WorkData(rubric='300', inventoryNumber='8308B', serialNumber='VBPD00011', title='8308B_VBPD00011', writer='WILLEM DE FAY', numberOfScans='464', status='HTR')

In [10]:
from collections import Counter

Counter([sd.title for sd in sdl]).most_common()

[('12854_NOTI01112', 986),
 ('13131_A31239', 933),
 ('4084_NOTA00192', 918),
 ('14269_KLAC00687', 912),
 ('12863_A29378', 868),
 ('12858_NOTG00021', 840),
 ('10061_NOTA00997', 839),
 ('12867_NOTI01121', 826),
 ('12883_NOTI01226', 825),
 ('NOTI01181', 803),
 ('14271_KLAC00736', 795),
 ('4085_NOTA00193', 787),
 ('2408_A16098', 767),
 ('12873_NOTI01126', 762),
 ('10046_NOTA00982', 755),
 ('14274_KLAC00822', 754),
 ('12865_NOTI01120', 751),
 ('13132_KLAC01201', 748),
 ('12796_NOTI01189', 740),
 ('12853_KLAG03569', 739),
 ('12811_NOTI01201', 736),
 ('4082_NOTA00191', 732),
 ('4102_KLAB08488', 724),
 ('12862_NOTI01118', 716),
 ('12803_NOTI01196', 715),
 ('14256_KLAC00452', 708),
 ('12882_NOTI01225', 705),
 ('14258_KLAC00437', 704),
 ('4167_A25538', 704),
 ('12871_NOTI01124', 703),
 ('14270_KLAC00715', 701),
 ('12848_NOTI01108', 700),
 ('12866_NOTG00022', 700),
 ('4081_NOTA00190', 700),
 ('12798_NOTI01191', 698),
 ('14268_KLAC00678', 698),
 ('4108_NOTA00218', 694),
 ('12874_NOTI01127', 692),


In [11]:
Counter([(wd.writer,wd.rubric) for wd in wdl])

Counter({('JAN FRANSSEN BRUIJNINGH', '8'): 1,
         ('JACOB JANSZ WESTFRISIUS', '20'): 3,
         ('JOOST VAN DE VEN', '48'): 1,
         ('JAN DE VOS', '49'): 1,
         ('HENDRIK SCHAEF', '54'): 138,
         ('GILLES BORSSELAER', '57'): 1,
         ('JACOB DE WINTER', '98'): 144,
         ('HENRICK VENKEL', '131'): 1,
         ('JACOB PONDT', '136'): 1,
         ('ANTHONY VAN DE VEN', '152'): 1,
         ('DIRK VAN DER GROE', '174'): 232,
         ('SIMON VAN SEVENHOVEN', '208'): 1,
         ('DAVID DES POMMARE', '261'): 1,
         ('CORNELIS VAN LOON', '265'): 2,
         ('JAN BARELS DE JONGE', '307'): 1,
         ('PHILIP ZWEERTS', '334'): 47,
         ('DANIEL VAN DEN BRINK', '342'): 4,
         ('JAN VERLEIJ', '358'): 1,
         ('HERMANUS VAN HEEL', '371'): 98,
         ('CORNELIS STAAL', '377'): 4,
         ('HENDRIK DANIEL VAN HOORN', '390'): 22,
         ('PALM MATHIJSZ', '18'): 1,
         ('WILLEM DE FAY', '300'): 11})

In [12]:
Counter([(wd.writer) for wd in wdl])

Counter({'JAN FRANSSEN BRUIJNINGH': 1,
         'JACOB JANSZ WESTFRISIUS': 3,
         'JOOST VAN DE VEN': 1,
         'JAN DE VOS': 1,
         'HENDRIK SCHAEF': 138,
         'GILLES BORSSELAER': 1,
         'JACOB DE WINTER': 144,
         'HENRICK VENKEL': 1,
         'JACOB PONDT': 1,
         'ANTHONY VAN DE VEN': 1,
         'DIRK VAN DER GROE': 232,
         'SIMON VAN SEVENHOVEN': 1,
         'DAVID DES POMMARE': 1,
         'CORNELIS VAN LOON': 2,
         'JAN BARELS DE JONGE': 1,
         'PHILIP ZWEERTS': 47,
         'DANIEL VAN DEN BRINK': 4,
         'JAN VERLEIJ': 1,
         'HERMANUS VAN HEEL': 98,
         'CORNELIS STAAL': 4,
         'HENDRIK DANIEL VAN HOORN': 22,
         'PALM MATHIJSZ': 1,
         'WILLEM DE FAY': 11})

In [13]:
wl = [Writer(*t) for t in set([(wd.rubric,wd.writer) for wd in wdl])]

In [14]:
wl

[Writer(id='307', name='JAN BARELS DE JONGE'),
 Writer(id='54', name='HENDRIK SCHAEF'),
 Writer(id='152', name='ANTHONY VAN DE VEN'),
 Writer(id='98', name='JACOB DE WINTER'),
 Writer(id='49', name='JAN DE VOS'),
 Writer(id='358', name='JAN VERLEIJ'),
 Writer(id='131', name='HENRICK VENKEL'),
 Writer(id='390', name='HENDRIK DANIEL VAN HOORN'),
 Writer(id='265', name='CORNELIS VAN LOON'),
 Writer(id='261', name='DAVID DES POMMARE'),
 Writer(id='334', name='PHILIP ZWEERTS'),
 Writer(id='18', name='PALM MATHIJSZ'),
 Writer(id='342', name='DANIEL VAN DEN BRINK'),
 Writer(id='300', name='WILLEM DE FAY'),
 Writer(id='377', name='CORNELIS STAAL'),
 Writer(id='20', name='JACOB JANSZ WESTFRISIUS'),
 Writer(id='48', name='JOOST VAN DE VEN'),
 Writer(id='136', name='JACOB PONDT'),
 Writer(id='57', name='GILLES BORSSELAER'),
 Writer(id='208', name='SIMON VAN SEVENHOVEN'),
 Writer(id='371', name='HERMANUS VAN HEEL'),
 Writer(id='174', name='DIRK VAN DER GROE'),
 Writer(id='8', name='JAN FRANSSEN BR

In [15]:
import itertools

writers = []
for writerTuple, wordDataGroup in itertools.groupby(wdl, lambda x: (x.rubric,x.writer)):
    writer = Writer(*writerTuple)
    writer.archives = [Archive(wd.inventoryNumber, wd.serialNumber, wd.title, wd.numberOfScans, wd.status) for wd in wordDataGroup]
    writers.append(writer)
writers[2].archives

[Archive(inventoryNumber='1165', serialNumber='A14251', title='1165_A14251', numberOfScans='285', status='GT')]

In [16]:
w = writers[0]
for wa in w.archives:
    archive_scans = [sd for sd in sdl if sd.title == wa.title]
    wa.scans = archive_scans
    
print(len(w.archives[0].scans))
print(w.archives[0].numberOfScans)
print(w.archives[0])
w.archives[0].scans[-1]

173
110
Archive(inventoryNumber='173', serialNumber='KLAF00792', title='173_KLAF00792', numberOfScans='110', status='GT')


ScanData(id='595912', title='173_KLAF00792', number='220', previewImage='https://files.transkribus.eu/iiif/2/GJUTRIXITXYWPEMWTQBYVCYW/full/128,/0/default.jpg', xmlKey='https://files.transkribus.eu/Get?id=JHXAOTPJEPZSEOIQHRABCNWC', imageURL='https://files.transkribus.eu/Get?id=GJUTRIXITXYWPEMWTQBYVCYW&fileType=view')

In [17]:
def transkribus_url(archive_id:str, page_index: int) -> str:
    return f'https://transkribus.eu/r/amsterdam-city-archives/#/documents/{archive_id}/pages/{page_index}'

In [18]:
set(sorted([(w.name,w.id) for w in writers]))

{('ANTHONY VAN DE VEN', '152'),
 ('CORNELIS STAAL', '377'),
 ('CORNELIS VAN LOON', '265'),
 ('DANIEL VAN DEN BRINK', '342'),
 ('DAVID DES POMMARE', '261'),
 ('DIRK VAN DER GROE', '174'),
 ('GILLES BORSSELAER', '57'),
 ('HENDRIK DANIEL VAN HOORN', '390'),
 ('HENDRIK SCHAEF', '54'),
 ('HENRICK VENKEL', '131'),
 ('HERMANUS VAN HEEL', '371'),
 ('JACOB DE WINTER', '98'),
 ('JACOB JANSZ WESTFRISIUS', '20'),
 ('JACOB PONDT', '136'),
 ('JAN BARELS DE JONGE', '307'),
 ('JAN DE VOS', '49'),
 ('JAN FRANSSEN BRUIJNINGH', '8'),
 ('JAN VERLEIJ', '358'),
 ('JOOST VAN DE VEN', '48'),
 ('PALM MATHIJSZ', '18'),
 ('PHILIP ZWEERTS', '334'),
 ('SIMON VAN SEVENHOVEN', '208'),
 ('WILLEM DE FAY', '300')}

In [19]:
print(w.name)
for a in w.archives:
    print(f'Archive {a.title}')
    for (i,s) in enumerate(a.scans):
        print(transkribus_url(s.id,i+1))
        print 

JAN FRANSSEN BRUIJNINGH
Archive 173_KLAF00792
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/1
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/2
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/3
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/4
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/5
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/6
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/7
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/8
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/9
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/10
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/11
https://transkribus.eu/r/amsterdam-city-archives/#/documents/595912/pages/12
https://transkribus.eu/r/amsterdam-city

In [20]:
with open('../pagexml/scans_20210403.csv', newline='') as csvfile:
     reader = csv.DictReader(csvfile)
     data = [row for row in reader]

print(data[0])
print(data[-1])


{'id': '605529', 'title': '10021_NOTA00961', 'number': '1', 'previewImage': 'https://files.transkribus.eu/iiif/2/CJHHGXALGVYHDROFTCJHJDMD/full/128,/0/default.jpg', 'xmlKey': 'https://files.transkribus.eu/Get?id=EYSQSQIQQOHZANOLAEPDNKGN', 'imageURL': 'https://files.transkribus.eu/Get?id=CJHHGXALGVYHDROFTCJHJDMD&fileType=view'}
{'id': '628095', 'title': 'TRAINING_VALIDATION_SET_Transkribus Medieval Writing', 'number': '79', 'previewImage': 'https://files.transkribus.eu/iiif/2/UTSSSQFQTUVSEABWKZVSDVLX/full/128,/0/default.jpg', 'xmlKey': 'https://files.transkribus.eu/Get?id=VLCSBUCOIMVTSHNMGSFEJDCW', 'imageURL': 'https://files.transkribus.eu/Get?id=UTSSSQFQTUVSEABWKZVSDVLX&fileType=view'}


In [28]:
import sqlite3
with sqlite3.connect(':memory:') as db:
    db.execute('create table archives(inventoryNumber integer primary key,    serialNumber text not null,    title text not null,    numberOfScans integer,    status text not null);')
    print(db.execute('select * from archives;')).fetchall()


<sqlite3.Cursor object at 0x7f1c7caf3ce0>


AttributeError: 'NoneType' object has no attribute 'fetchall'