In [8]:
!pip install pyproj
!pip install -U numpy==1.26.4
!pip install transformers
!pip install -U accelerate
!pip install pandas
!pip install PyPDF2
!pip install NLTK
!pip install editdistance
!pip install  paramiko
!pip install func_timeout

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /home/alex/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/alex/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# import libraries
import urllib.request
import requests
import json
from pyproj import Transformer
import os
import time
import sys
from datetime import datetime
#from lxml import etree
from xml.etree import ElementTree as ET
import logging

# import common functions
# The parent directory is put on the module search path first, so the
# `import common` below can be resolved from there (presumably that is where
# the shared helper module lives — verify). Keep these two lines in this order.
sys.path.insert(1, '../')
import common

# set name of module, to fetch info from config
# (used as the key under config['data_source'] and in the log file name)
module_name = "dans"

In [8]:
# Load the shared configuration and keep a shortcut to this module's section.
config = common.get_config()
source_config = config['data_source'][module_name]

# Logging goes to a file named after the module and today's date; the file is
# opened in append mode ("a+").
log_location = source_config['harvest_log_location']
now = datetime.now()
date = now.strftime("%Y-%m-%d")
logfile = f"{log_location}harvest-log-{module_name}-{date}.log"
logging.basicConfig(
    filename=logfile,
    filemode="a+",
    level=logging.DEBUG,
    format="%(asctime)-15s %(levelname)-8s %(message)s",
)

# Read every setting the harvest needs and echo it, so the cell output
# documents which configuration this run used.
last_indexed_date = source_config['last_indexed_date']
print(f'last_indexed_date: {last_indexed_date}')

endpoint_url = source_config['endpoint_url']
print(f'endpoint_url: {endpoint_url}')

pdf_folder = source_config['pdf_folder']
print(f'pdf_folder: {pdf_folder}')

json_folder = source_config['json_folder']
print(f'json_folder: {json_folder}')

html_folder = source_config['html_folder']
print(f'html_folder: {html_folder}')

language = source_config['language']
print(f'language: {language}')

# The NER model is looked up by the language of this data source.
bert_model = config['bert_models'][language]
print(f'bert_model: {bert_model}')



last_indexed_date: 2022-01-01T00:00:00
endpoint_url: https://archaeology.datastations.nl/api/
pdf_folder: /media/alex/Data/agnes_data/dans/pdf/
json_folder: /media/alex/Data/agnes_data/dans/json/
html_folder: /media/alex/Data/agnes_data/dans/html/
language: dutch
bert_model: /media/alex/Data/agnes_models/ArcheoBERTje-NER


In [9]:
#API settings
# timestamp of this harvest run, used as the upper bound of the date filter
index_datetimestring = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
providerName = 'Data_Station_Archaeology'

# API endpoints, all relative to the configured endpoint_url
api_url = endpoint_url + 'search'
meta_url = endpoint_url + 'datasets/export'
content_url = endpoint_url + 'access/datafile/'

# default headers sent with every request made through the session
headers = {'Origin': 'http://agnessearch.nl',
           'Access-Control-Request-Method': 'GET',
           'httpAccept': 'application/json',
           'User-Agent': 'python-requests/2.18.1',
           'From': 'a.brandsen@arch.leidenuniv.nl (EXALT)'  
          }
page_length = 100  # size of the metadata chunks

# Date filter: everything from last_indexed_date up to now.
# The processing cell writes a dataset's 'published_at' value back to the config
# as last_indexed_date, and those values already end in 'Z'
# (e.g. 2022-01-02T23:00:00Z), while the initial config value has no 'Z'.
# Strip a trailing 'Z' before appending our own, so both forms yield one 'Z'.
# NOTE(review): presumably common.update_config stores the value verbatim — confirm.
range_start = str(last_indexed_date).rstrip('Z')
datesort = "dateSort:["+range_start+"Z TO "+index_datetimestring+"Z]"

In [10]:
# Query parameters of the search request.
params = {
            # The API key is read from the environment instead of being committed
            # with the notebook; when DATAVERSE_API_KEY is unset the value is None
            # and requests leaves the parameter out of the URL.
            # NOTE(review): Dataverse documents the token as an 'X-Dataverse-key'
            # HTTP header or a 'key' query parameter — confirm that this query
            # parameter name is actually honoured by the server.
            'X-Dataverse-Key': os.environ.get('DATAVERSE_API_KEY'),
            'start': 0,                                                 # start of iteration
            'per_page': page_length,                                    # results in pages of n datasets (max. 1000)
            'type': 'dataset',
            # Both filter queries go in ONE list: a dict cannot hold the key 'fq'
            # twice (the later entry silently replaces the earlier one), whereas
            # requests sends a list value as repeated 'fq=' parameters.
            'fq': [
                'publicationStatus:Published',                          # no Draft datasets
                datesort,                                               # everything from selected date onwards
            ],
            #'q': 'benedenberg',                                          # query (*) within archaeology.datastation.nl
            #'q': 'authorName:(schinkel AND fokkens)',                    # AND 
            #'q': 'authorName:(schinkel+OR+fokkens+OR+fontijn)',          # OR
            #'q': 'dsPublicationDate:2023',
            'q': 'dsPublicationDate:(2022 OR 2023 OR 2024 OR 2025 OR 2026 OR 2027)',                              # year of publication
            #'q': 'dsPublicationDate:(1930 OR 1950 OR 1951 OR 1952 OR 1953 OR 1954 OR 1955 OR 1956 OR 1958 OR \
            #          1959 OR 1961 OR 1962 OR 1963 OR 1964 OR 1965 OR 1966 OR 1967 OR 1968 OR 1969 \
            #          OR 1970 OR 1971 OR 1972 OR 1973 OR 1974 OR 1975 OR 1976 OR 1977 OR 1978 OR 1979 \
            #          OR 1980 OR 1981 OR 1982 OR 1983 OR 1984 OR 1985 OR 1986 OR 1987 OR 1988 OR 1989 \
            #          OR 1990 OR 1991 OR 1992 OR 1993 OR 1994 OR 1995 OR 1996 OR 1997 OR 1998 OR 1999 \
            #          OR 2000 OR 2001 OR 2002 OR 2003 OR 2004 OR 2005)',
}

In [11]:
# Create a requests Session
session = requests.Session()
# Set the header options, such as 'application/json'
# (these become the default headers of every request made through the session)
session.headers.update(headers)

In [12]:
i = 0
# Start from a clean slate: without this, a failed request would leave the
# `response` of an earlier run in the kernel and the next cell would silently
# process stale results.
response = None
# Make a request and store the response
try:
    # The timeout (seconds) keeps a stalled connection from hanging the
    # notebook; without one, requests waits indefinitely.
    response = session.get(api_url, params=params, timeout=60)
    response.raise_for_status()                 # Raise error in case of failure 
except requests.exceptions.HTTPError as httpErr: 
    print ("HTTP Error:",httpErr) 
# Timeout is tested before ConnectionError because a connect timeout is a
# subclass of both and should be reported as a timeout.
except requests.exceptions.Timeout as timeOutErr: 
    print ("Timeout Error:",timeOutErr) 
except requests.exceptions.ConnectionError as connErr: 
    print ("Error Connecting:",connErr) 
except requests.exceptions.RequestException as reqErr: 
    print ("Something Else:",reqErr)


In [7]:
# Collect the metadata of all datasets that match the search.
# The first page is already in `response`; the remaining pages are requested
# here and appended to `dsets`.
if response is not None and response.status_code == requests.codes.ok:

    data = response.json()

    response_length = data['data']['total_count']
    print('Datasets found: '+ str(response_length))

    dsets = data['data']['items']

    # Fetch all the other pages of results
    for start in range(page_length, response_length, page_length):
        # Page on a copy: the shared `params` keeps start=0, so re-running the
        # first request cell still fetches the first page.
        page_params = dict(params, start=start)

        # GET the next page of dataset metadata
        try:
            next_response = session.get(api_url, params=page_params, timeout=60)
        except requests.exceptions.RequestException as reqErr:
            print(f'Request for results from {start} failed, these datasets are missing:', reqErr)
            continue

        if next_response.status_code == requests.codes.ok:
            # add the data from the next page to the data we collected
            dsets.extend(next_response.json()['data']['items'])
        else:
            # report the gap instead of dropping the page silently
            print(f'Results from {start} returned HTTP status {next_response.status_code}, these datasets are missing')
        #time.sleep(1)                 # being kind to the API server

else:
        print ('Web connection Error')

# `i` is set to 0 in the request cell and is not changed in this cell
print ('Saved: ' + str(i))


Datasets found: 34035
Saved: 0


In [8]:
import pickle

# Write the collected search results to 'dsets-TEMP.pkl' in the working
# directory; opening with 'wb' creates the file or overwrites an existing one.
with open('dsets-TEMP.pkl', 'wb') as cache_file:
    pickle.dump(dsets, cache_file)


In [13]:
import pickle

# Read the search results back from 'dsets-TEMP.pkl' (binary mode) and rebind
# `dsets` to the deserialized object.
with open('dsets-TEMP.pkl', 'rb') as cache_file:
    dsets = pickle.load(cache_file)
  


In [22]:
#processing the metadata of the datasets
# date_sorted_dsets = sorted(dsets, key=lambda d: d['published_at'])
# i = 0;
# if date_sorted_dsets:
#     print ('Processing: ' + str(len(dsets)) + ' datasets')
#     for dset in date_sorted_dsets:
#         i+= 1
#         #print(json.dumps(dset, indent=4, sort_keys=False))  
#         print(dset['published_at'])  
#         if i == 300:
#             break

Processing: 34035 datasets
2022-01-02T23:00:00Z
2022-01-03T23:00:00Z
2022-01-03T23:00:00Z
2022-01-05T23:00:00Z
2022-01-06T23:00:00Z
2022-01-06T23:00:00Z
2022-01-06T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-09T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-10T23:00:00Z
2022-01-11T23:00:00Z
2022-01-12T23:00:00Z
2022-01-12T23:00:00Z
2022-01-12T23:00:00Z
2022-01-12T23:00:00Z
2022-01-12T23:00:00Z
2022-01-16T23:00:00Z
2022-01-16T23:00:00Z
2022-01-16T23:00:00Z
2022-01-16T23:00:00Z
2022-01

In [None]:
#processing the metadata of the datasets
#
# For every dataset found by the search (oldest publication date first):
#   - request the extended metadata (exporter 'dataverse_json')
#   - build a document dictionary for every PDF file of the dataset
#   - for PDFs that are not restricted, not embargoed and not on disk yet:
#     download the PDF, save document.json, run NER, generate HTML and store
#     the dataset's publication date as 'last_indexed_date' in the config

# seconds to wait on a single HTTP request before giving up on it
REQUEST_TIMEOUT = 60

# check dag/week rapport etc, so we can filter these out in ES
# (file_type, keywords looked for in the lower-cased file name); first match wins
FILE_TYPE_KEYWORDS = [
    ('dag_week_rapport', ['dagrapport', 'dag_rapport', 'weekrapport', 'week_rapport', 'weekverslag', 'week_verslag', 'logboek']),
    ('plan_van_aanpak', ['draaiboek', 'plan_van_aanpak', 'pva']),
    ('programma_van_eisen', ['programma_van_eisen', 'pve']),
    ('onderzoeksmeldingsnummer', ['onderzoeksmeldingsnummer', 'onderzoeksmeldings_nummer', 'onderzoeks_meldings_nummer']),
]


def _classify_file_type(filename):
    """Return the file_type for a file name, or 'report' when no keyword matches."""
    lowered = filename.lower()
    for file_type, keywords in FILE_TYPE_KEYWORDS:
        if any(word in lowered for word in keywords):
            return file_type
    return 'report'


def _average_coordinates(fields):
    """Average the coordinates in the 'dansTemporalSpatial' fields.

    A 'dansSpatialBox' field contributes the centre of each bounding box, a
    'dansSpatialPoint' field each point. When several such fields are present
    the last one wins. Fields with an empty value list are skipped, so they
    cannot cause a division by zero.

    Returns (x, y), or (None, None) when no usable field is present.
    """
    coord_x, coord_y = None, None
    for field in fields:
        if field['typeName'] == 'dansSpatialBox':
            boxes = field['value']
            xs = [(float(box['dansSpatialBoxEast']['value']) + float(box['dansSpatialBoxWest']['value'])) / 2 for box in boxes]
            ys = [(float(box['dansSpatialBoxSouth']['value']) + float(box['dansSpatialBoxNorth']['value'])) / 2 for box in boxes]
        elif field['typeName'] == 'dansSpatialPoint':
            points = field['value']
            xs = [float(point['dansSpatialPointX']['value']) for point in points]
            ys = [float(point['dansSpatialPointY']['value']) for point in points]
        else:
            continue
        if xs:
            coord_x = sum(xs) / len(xs)
            coord_y = sum(ys) / len(ys)
    return coord_x, coord_y


def _geocode_place_names(locations):
    """Look up place names with Nominatim and return the first hit.

    `locations` is a list of place names (a single string is accepted too).
    Returns {'lat': ..., 'lon': ...}, or None when there is nothing to look up
    or Nominatim finds nothing. Network and parse errors are raised to the caller.
    """
    if isinstance(locations, str):
        locations = [locations]
    query = ', '.join(locations) if isinstance(locations, list) else ''
    if not query:
        return None
    url = f"https://nominatim.openstreetmap.org/search?q={urllib.parse.quote(query)}&format=geojson&limit=1"
    # local names on purpose: the search cells own the globals `response` and `data`
    with urllib.request.urlopen(url, timeout=REQUEST_TIMEOUT) as geo_response:
        geo_data = json.loads(geo_response.read())
    if not geo_data['features']:
        return None
    # GeoJSON positions are ordered [longitude, latitude]
    lon, lat = geo_data['features'][0]['geometry']['coordinates'][:2]
    return {'lat': lat, 'lon': lon}


def _spatial_properties(metadata_blocks, doi):
    """Return the coordinate/location entries to add to a document dictionary.

    Uses the averaged coordinates from the metadata when present (these are
    converted with common.rd2wgs); otherwise tries to geocode the
    'dansSpatialCoverageText' place names. A coordinate of 0 counts as missing.
    `doi` is only used in the error message.
    """
    properties = {}
    if 'dansTemporalSpatial' not in metadata_blocks:
        return properties
    fields = metadata_blocks['dansTemporalSpatial']['fields']
    coord_x, coord_y = _average_coordinates(fields)

    if not coord_x and not coord_y:
        # no coordinates, try geocoding the location(s)
        for field in fields:
            if field['typeName'] == 'dansSpatialCoverageText':
                try:
                    location = _geocode_place_names(field['value'])
                    if location:
                        properties['location'] = location
                except Exception as error:
                    # geocoding is best effort: report and carry on without a location
                    print('geocoding error')
                    print(error)

    if coord_x and coord_y:
        properties['coordX'] = coord_x
        properties['coordY'] = coord_y
        #convert to lat lon
        try:
            lat, lon = common.rd2wgs(coord_x, coord_y)
            properties['location'] = {'lat':lat,'lon':lon}
        except Exception as error:
            print('error converting coordinates from rd to wgs for DOI '+doi)
            print(error)
    return properties


if dsets:

    # sort dsets by published date
    date_sorted_dsets = sorted(dsets, key=lambda d: d['published_at'])

    print ('Processing: ' + str(len(date_sorted_dsets)) + ' datasets')
    for dset in date_sorted_dsets:
        print('--------------------------------------------')
        print('Processing dataset: '+dset['global_id'])

        if not dset['global_id']:
            continue

        # get extended metadata
        dataset_doi = dset['global_id']
        # The API key comes from the environment; when DATAVERSE_API_KEY is
        # unset the value is None and requests leaves the parameter out.
        # NOTE(review): Dataverse documents the token as an 'X-Dataverse-key'
        # header or a 'key' query parameter — confirm this parameter name works.
        meta_params = { 'X-Dataverse-Key': os.environ.get('DATAVERSE_API_KEY'),
                        'exporter': 'dataverse_json',                             # metadata format
                        'httpAccept': 'application/json',
                        'persistentId': dataset_doi}
        try:
            meta_response = requests.get(meta_url, params=meta_params, timeout=REQUEST_TIMEOUT)   # request dataset metadata
        except requests.exceptions.RequestException as error:
            # one failing request must not end the whole harvest
            print(dataset_doi + ' metadata request failed:')
            print(error)
            continue

        if meta_response.status_code != requests.codes.ok:
            print (dataset_doi + ' no dataset metadata found')
            continue

        metadata = meta_response.json()
        metadataBlocks = metadata['datasetVersion']['metadataBlocks']

        # coordinates/location are the same for every file of the dataset, so they
        # are resolved once, and only when the dataset turns out to contain a PDF
        spatial_properties = None

        for file_entry in metadata['datasetVersion']['files']:
            data_file = file_entry['dataFile']
            if data_file['contentType'] != 'application/pdf':
                continue

            print('Processing file: '+data_file['filename'])
            clean_file_name = common.cleanFileName(data_file['filename'])

            # generate document dictionary
            output_document = {
                'source': module_name,
                'file_name': clean_file_name,
                'description': dset['description'],
                'title': dset['name'],
                'creators': dset['authors'],
                'publisher': dset['publisher'],
                'createdAt': dset['published_at'][:10],   # date part of the timestamp
            }

            identifiers = {
                'DOI':dset['global_id'],
                'url':dset['url'],
                'dans_dataset_version_id': dset['versionId']
            }
            if 'dansArchaeologyMetadata' in metadataBlocks:
                for thing in metadataBlocks['dansArchaeologyMetadata']['fields']:
                    if thing['typeName'] == 'dansArchisZaakId':
                        identifiers['dansArchisZaakId'] = thing['value']
            if 'dansDataVaultMetadata' in metadataBlocks:
                for thing in metadataBlocks['dansDataVaultMetadata']['fields']:
                    identifiers[thing['typeName']] = thing['value']
            output_document['identifiers'] = identifiers

            if 'citation' in metadataBlocks:
                for thing in metadataBlocks['citation']['fields']:
                    if thing['typeName'] == 'language':
                        output_document['language'] = thing['value']

            output_document['file_type'] = _classify_file_type(data_file['filename'])

            # coordinates
            if spatial_properties is None:
                spatial_properties = _spatial_properties(metadataBlocks, dset['global_id'])
            output_document.update(spatial_properties)

            # object file download (Open Access)
            if not data_file['id'] or file_entry['restricted']:
                print ('restricted: ' + dset['name'])
                continue

            # check for embargo
            today = datetime.now().date()
            embargo_date = today
            if 'embargo' in file_entry:
                # NOTE(review): assumes the embargo is a 'YYYY-MM-DD' string on the
                # file entry itself; the export may instead nest it under 'dataFile'
                # as an object — confirm against a real embargoed dataset.
                embargo_date = datetime.strptime(file_entry['embargo'], "%Y-%m-%d").date()

            if embargo_date > today:
                print ('embargoed: '+ dset['name'])
                continue

            # set document identifier
            doc_id = f"{dset['versionId']}_{common.cleanFileName(data_file['filename'].replace('.pdf',''))}"

            print(f"doc_id:{doc_id}")

            # TEMP if already downloaded, skip
            output_location = f"{pdf_folder}{dset['versionId']}_{clean_file_name}"
            if os.path.isfile(output_location):
                print('already downloaded, SKIPPING')
                continue

            # sometimes get download error, so try and continue if error
            try:
                # download pdf
                file_url = content_url + str(data_file['id'])
                pdf_location = common.downloaddocument(
                    file_url,
                    dset['versionId'],
                    pdf_folder,
                    clean_file_name
                )
                print("downloaded pdf")
            except Exception as e:
                print("could not download pdf, error:")
                print(e)
                continue

            # save document.json
            json_output_folder = f"{json_folder}/{doc_id}"
            common.savejson(output_document, f"{json_output_folder}/document.json")

            print("saved doc json")

            # process pdf, store page.json files with entities
            common.run_ner_on_pdf(
                pdf_location,
                json_output_folder,
                bert_model,
                language
            )

            print("ran NER, saved page json")

            # process pdf, save html files
            html_output_folder = f"{html_folder}/{doc_id}"
            common.pdf2html(pdf_location, html_output_folder)

            print("generated and saved html")

            # save last id we indexed in the settings file
            # (do this in the loop instead of after, in case we get errors/hanging that ends the script before we get to the end)
            common.update_config(module_name, 'last_indexed_date', dset['published_at'])

            print(f"updated config with latest date: {dset['published_at']}")
else:
    print ('Processing: 0')

Processing: 34035 datasets
--------------------------------------------
Processing dataset: doi:10.17026/dans-283-urkb
Processing file: Rap 5665_4230718_Utrechtse Heuvelrug Maarsbergen Maarn Anderstein Valkenheide, vooronderzoek.pdf
doc_id:85329_Rap_5665_4230718_Utrechtse_Heuvelrug_Maarsbergen_Maarn_Anderstein_Valkenheide_vooronderzoek
already downloaded, SKIPPING
--------------------------------------------
Processing dataset: doi:10.17026/dans-xr2-s438
--------------------------------------------
Processing dataset: doi:10.17026/dans-x4y-pby9
Processing file: Rap 5231_ 4220215_Defensie, DEM-Erfgoed, Perceel Noord 2, acht bureauonderzoeken.pdf
doc_id:84260_Rap_5231__4220215_Defensie_DEM-Erfgoed_Perceel_Noord_2_acht_bureauonderzoeken
already downloaded, SKIPPING
--------------------------------------------
Processing dataset: doi:10.17026/dans-zrj-p536
Processing file: 2021-0576 Dorpsstraat 3, Oosterhout_versie2.0_def.pdf
restricted: Bureauonderzoek en inventariserend veldonderzoek (BO

In [9]:
         
# Publish the results: copy the generated json and html to the webserver
# folders named in the 'webserver' section of the config.
webserver_config = config['webserver']
common.upload2webserver(
    json_folder,
    html_folder,
    module_name,
    webserver_config['json_folder'],
    webserver_config['html_folder'],
)
print("uploaded json/html to webserver")

# Then start the indexing script for this module on the webserver.
common.start_index(module_name)
print("indexing on webserver started")

print("done!")



Processing id:2960
doc_id:2960_Rapport_1409_Bilzen_Kloosterstraat_EV
saved doc json
file /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2960_Rapport_1409_Bilzen_Kloosterstraat_EV.pdf has already been downloaded
downloaded pdf


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


ran NER, saved page json


Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'31' b'0'
Superfluous whitespace found in object header b'42' b'0'
Superfluous whitespace found in object header b'61' b'0'
Superfluous whitespace found in object header b'72' b'0'
Superfluous whitespace found in object header b'100' b'0'
Superfluous whitespace found in object header b'118' b'0'
Superfluous whitespace found in object header b'128' b'0'
Superfluous whitespace found in object header b'138' b'0'
Superfluous whitespace found in object header b'141' b'0'
Superfluous whitespace found in object header b'144' b'0'
Superfluous whitespace found in object header b'148' b'0'
Superfluous whitespace found in object header b'151' b'0'
Superfluous whitespace found in object header b'154' b'0'
Superfluous whitespace found in object header b'157' b'0'
Superfluous whitespace f

Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2960_Rapport_1409_Bilzen_Kloosterstraat_EV.pdf to html!
generated and saved html
updated config with latest id: 2960
Processing id:2961
doc_id:2961_2023-0739_Aartselaar_Oeyvaersbosch_EV_DEF
saved doc json
file /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2961_2023-0739_Aartselaar_Oeyvaersbosch_EV_DEF.pdf has already been downloaded
downloaded pdf


Superfluous whitespace found in object header b'71' b'0'
Superfluous whitespace found in object header b'69' b'0'
Superfluous whitespace found in object header b'68' b'0'
Superfluous whitespace found in object header b'67' b'0'
Superfluous whitespace found in object header b'70' b'0'
Superfluous whitespace found in object header b'99' b'0'
Superfluous whitespace found in object header b'81' b'0'
Superfluous whitespace found in object header b'80' b'0'
Superfluous whitespace found in object header b'79' b'0'
Superfluous whitespace found in object header b'89' b'0'
Superfluous whitespace found in object header b'88' b'0'
Superfluous whitespace found in object header b'87' b'0'
Superfluous whitespace found in object header b'97' b'0'
Superfluous whitespace found in object header b'96' b'0'
Superfluous whitespace found in object header b'95' b'0'
Superfluous whitespace found in object header b'98' b'0'
Superfluous whitespace found in object header b'73' b'0'
Superfluous whitespace found in

ran NER, saved page json


Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'29' b'0'
Superfluous whitespace found in object header b'41' b'0'
Superfluous whitespace found in object header b'44' b'0'
Superfluous whitespace found in object header b'47' b'0'
Superfluous whitespace found in object header b'58' b'0'
Superfluous whitespace found in object header b'71' b'0'
Superfluous whitespace found in object header b'76' b'0'
Superfluous whitespace found in object header b'81' b'0'
Superfluous whitespace found in object header b'92' b'0'
Superfluous whitespace found in object header b'95' b'0'
Superfluous whitespace found in object header b'98' b'0'
Superfluous whitespace found in object header b'102' b'0'
Superfluous whitespace found in object header b'105' b'0'
Superfluous whitespace found in object header b'108' b'0'
Superfluous whitespace found in object header b'112' b'0'
Superfluous whitespace found 

Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2961_2023-0739_Aartselaar_Oeyvaersbosch_EV_DEF.pdf to html!
generated and saved html
updated config with latest id: 2961
Processing id:2962
doc_id:2962_ORTEC2201008_-_Eindverslag_-_Geetbets_-_Zoutleeuw_OPGR
saved doc json
file /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2962_ORTEC2201008_-_Eindverslag_-_Geetbets_-_Zoutleeuw_OPGR.pdf has already been downloaded
downloaded pdf


Superfluous whitespace found in object header b'941' b'0'
Superfluous whitespace found in object header b'947' b'0'
Superfluous whitespace found in object header b'953' b'0'
Superfluous whitespace found in object header b'959' b'0'
Superfluous whitespace found in object header b'965' b'0'
Superfluous whitespace found in object header b'968' b'0'
Superfluous whitespace found in object header b'974' b'0'
Superfluous whitespace found in object header b'980' b'0'
Superfluous whitespace found in object header b'986' b'0'
Superfluous whitespace found in object header b'992' b'0'
Superfluous whitespace found in object header b'998' b'0'
Superfluous whitespace found in object header b'1004' b'0'
Superfluous whitespace found in object header b'1010' b'0'
Superfluous whitespace found in object header b'1016' b'0'
Superfluous whitespace found in object header b'1019' b'0'
Superfluous whitespace found in object header b'1022' b'0'
Superfluous whitespace found in object header b'1025' b'0'
Superflu

ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2962_ORTEC2201008_-_Eindverslag_-_Geetbets_-_Zoutleeuw_OPGR.pdf to html!
generated and saved html
updated config with latest id: 2962
Processing id:2970
doc_id:2970_Diest_-_Leuvensestraat_Eindverslag
saved doc json
downloaded pdf
ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2970_Diest_-_Leuvensestraat_Eindverslag.pdf to html!
generated and saved html
updated config with latest id: 2970
Processing id:2971
doc_id:2971_Rapport_1446_Deurne_-_Eksterlaar_Eindverslag
saved doc json
downloaded pdf
ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2971_Rapport_1446_Deurne_-_Eksterlaar_Eindverslag.pdf to html!
generated and saved html
updated config with latest id: 2971
Processing id:2977
doc_id:2977_Bijlage_2_-_LABR-19_Sporenlijst
saved doc json
downloaded pdf
ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgo

Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'28' b'0'
Superfluous whitespace found in object header b'39' b'0'
Superfluous whitespace found in object header b'58' b'0'
Superfluous whitespace found in object header b'69' b'0'
Superfluous whitespace found in object header b'89' b'0'
Superfluous whitespace found in object header b'95' b'0'
Superfluous whitespace found in object header b'101' b'0'
Superfluous whitespace found in object header b'107' b'0'
Superfluous whitespace found in object header b'110' b'0'
Superfluous whitespace found in object header b'113' b'0'
Superfluous whitespace found in object header b'132' b'0'
Superfluous whitespace found in object header b'135' b'0'
Superfluous whitespace found in object header b'139' b'0'
Superfluous whitespace found in object header b'142' b'0'
Superfluous whitespace fou

downloaded pdf


Superfluous whitespace found in object header b'112' b'0'
Superfluous whitespace found in object header b'111' b'0'
Superfluous whitespace found in object header b'131' b'0'
Superfluous whitespace found in object header b'121' b'0'
Superfluous whitespace found in object header b'120' b'0'
Superfluous whitespace found in object header b'119' b'0'
Superfluous whitespace found in object header b'129' b'0'
Superfluous whitespace found in object header b'128' b'0'
Superfluous whitespace found in object header b'127' b'0'
Superfluous whitespace found in object header b'130' b'0'
Superfluous whitespace found in object header b'134' b'0'
Superfluous whitespace found in object header b'133' b'0'
Superfluous whitespace found in object header b'138' b'0'
Superfluous whitespace found in object header b'137' b'0'
Superfluous whitespace found in object header b'136' b'0'
Superfluous whitespace found in object header b'141' b'0'
Superfluous whitespace found in object header b'140' b'0'
Superfluous wh

ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2985_2024-0409_Lievegem_Centrumstraat_fase_1_EV.pdf to html!
generated and saved html
updated config with latest id: 2985
Processing id:2987
doc_id:2987_2022D95_Eindverslag_Veurne_N8_v2
saved doc json
downloaded pdf
ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2987_2022D95_Eindverslag_Veurne_N8_v2.pdf to html!
generated and saved html
updated config with latest id: 2987
Processing id:2994
doc_id:2994_Eindverslag_Bierbeek_-_Waterhoenweg_v001
saved doc json
downloaded pdf
ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/2994_Eindverslag_Bierbeek_-_Waterhoenweg_v001.pdf to html!
generated and saved html
updated config with latest id: 2994
Processing id:2995
doc_id:2995_2017-1342_Bavikhove_Eerste_Aardstraat_Catalogus_DEF
saved doc json
downloaded pdf
ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/p

unknown widths : 
{'/Kids': [IndirectObject(366, 0, 135957464836944), IndirectObject(367, 0, 135957464836944), IndirectObject(408, 0, 135957464836944), IndirectObject(409, 0, 135957464836944), IndirectObject(410, 0, 135957464836944), IndirectObject(411, 0, 135957464836944), IndirectObject(412, 0, 135957464836944), IndirectObject(413, 0, 135957464836944), IndirectObject(414, 0, 135957464836944), IndirectObject(415, 0, 135957464836944), IndirectObject(416, 0, 135957464836944), IndirectObject(417, 0, 135957464836944), IndirectObject(418, 0, 135957464836944), IndirectObject(419, 0, 135957464836944), IndirectObject(420, 0, 135957464836944), IndirectObject(421, 0, 135957464836944), IndirectObject(422, 0, 135957464836944), IndirectObject(423, 0, 135957464836944), IndirectObject(424, 0, 135957464836944), IndirectObject(425, 0, 135957464836944), IndirectObject(426, 0, 135957464836944), IndirectObject(427, 0, 135957464836944), IndirectObject(428, 0, 135957464836944), IndirectObject(429, 0, 13595

unsupported operand type(s) for /: 'NameObject' and 'int'
ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/3019_VEC_164_4210422_Belgie,_Kruishoutem,_Passionistenstraat,_DO.pdf to html!
generated and saved html
updated config with latest id: 3019
Processing id:3024
doc_id:3024_2023L87_K&L_Schijnpoort_Eindrapport
saved doc json
downloaded pdf
ran NER, saved page json
pdftohtml error for file /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/3024_2023L87_K&L_Schijnpoort_Eindrapport.pdf /bin/sh: 1: L_Schijnpoort_Eindrapport.pdf: not found
/bin/sh: 1: L_Schijnpoort_Eindrapport/index.html: not found
I/O Error: Couldn't open file '/media/alex/Data/agnes_data/onroerend_erfgoed/pdf/3024_2023L87_K': No such file or directory.

generated and saved html
updated config with latest id: 3024
Processing id:3033
doc_id:3033_2022-041_Tongeren_Industriezone_Oost_EV
saved doc json
downloaded pdf
ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_er

Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'34' b'0'
Superfluous whitespace found in object header b'54' b'0'
Superfluous whitespace found in object header b'85' b'0'
Superfluous whitespace found in object header b'108' b'0'
Superfluous whitespace found in object header b'119' b'0'
Superfluous whitespace found in object header b'125' b'0'
Superfluous whitespace found in object header b'167' b'0'
Superfluous whitespace found in object header b'201' b'0'
Superfluous whitespace found in object header b'240' b'0'
Superfluous whitespace found in object header b'273' b'0'
Superfluous whitespace found in object header b'299' b'0'
Superfluous whitespace found in object header b'347' b'0'
Superfluous whitespace found in object header b'374' b'0'
Superfluous whitespace found in object header b'426' b'0'
Superfluous whitespace 

downloaded pdf


Superfluous whitespace found in object header b'84' b'0'
Superfluous whitespace found in object header b'63' b'0'
Superfluous whitespace found in object header b'62' b'0'
Superfluous whitespace found in object header b'61' b'0'
Superfluous whitespace found in object header b'71' b'0'
Superfluous whitespace found in object header b'70' b'0'
Superfluous whitespace found in object header b'69' b'0'
Superfluous whitespace found in object header b'79' b'0'
Superfluous whitespace found in object header b'78' b'0'
Superfluous whitespace found in object header b'77' b'0'
Superfluous whitespace found in object header b'80' b'0'
Superfluous whitespace found in object header b'81' b'0'
Superfluous whitespace found in object header b'83' b'0'
Superfluous whitespace found in object header b'55' b'0'
Superfluous whitespace found in object header b'82' b'0'
Superfluous whitespace found in object header b'107' b'0'
Superfluous whitespace found in object header b'86' b'0'
Superfluous whitespace found i

ran NER, saved page json
Converted /media/alex/Data/agnes_data/onroerend_erfgoed/pdf/3039_OAOE336-001.pdf to html!
generated and saved html
updated config with latest id: 3039
uploaded json/html to webserver
indexing on webserver started
done!
