In [1]:
!pip install pypdf2

Collecting pypdf2
  Obtaining dependency information for pypdf2 from https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl.metadata
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: pypdf2
Successfully installed pypdf2-3.0.1


In [2]:
#!/usr/bin/env python
# coding: utf-8


# import libraries
import urllib.request
import requests
import pandas as pd
import json
import sys
import os
import time
import yaml
import re
from PyPDF2 import PdfReader
import pickle
import subprocess
import nltk
# Add a local directory to NLTK's data search path
# (presumably where the tokenizer data used by nltk.sent_tokenize lives -- TODO confirm).
nltk.data.path.append('/data1/brandsena/nltk_data/')

# Put the timeperiod-to-daterange checkout on the module search path.
sys.path.insert(1, '/home/brandsena/timeperiod-to-daterange/')
# NOTE(review): this import is disabled, but run_ner_on_pdf below still calls
# timeperiod2daterange.detection2daterange -- re-enable it before running NER.
#import timeperiod2daterange

# set higher recursion for pypdf2
sys.setrecursionlimit(10000)

"""
# BERT model import
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
from collections import defaultdict
import torch
cudaGpuNumber = torch.cuda.current_device()
#print('cuda gpu number is '+str(cudaGpuNumber))
modelDir = '/data1/brandsena/lucdh-dataset/archeobertje-production-model-fold2/'
model = AutoModelForTokenClassification.from_pretrained(modelDir)
tokenizer = AutoTokenizer.from_pretrained(modelDir)
predictor = pipeline(
                      'ner', 
                      model=model, 
                      tokenizer=tokenizer,
                      device = cudaGpuNumber, # 0 for gpu, -1 for cpu
                      #device = -1, # 0 for gpu, -1 for cpu
                      grouped_entities = False
                    )

"""

def download_document(file_url, source, file_id, file_name = False,
                      pdf_folder = "/data1/brandsena/document-sources/oe_vlaanderen/pdf/"):
    """Download a document and store it as ``<pdf_folder>/<file_id>_<file_name>``.

    Args:
        file_url: URL to fetch; passed unchanged to ``urllib.request.urlretrieve``.
        source: Not used by this function; kept so existing callers keep working.
        file_id: Identifier that is prefixed to the stored file name.
        file_name: Name to store the file under. When falsy, the last segment of
            the URL *path* is used, so a query string or fragment
            (``report.pdf?download=1``) does not end up in the file name.
        pdf_folder: Existing folder the file is written to. Defaults to the
            location that used to be hard-coded here.
            2DO get pdf location folder from settings file

    Returns:
        The path the document was written to.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises (e.g. ``URLError``, ``OSError``).
    """
    # local import: only urllib.request is imported at the top of the file
    from urllib.parse import urlparse

    # if file name not specified, use last part of the url path as filename
    if not file_name:
        # rstrip('/') so a URL with a trailing slash still yields a usable name
        file_name = os.path.basename(urlparse(file_url).path.rstrip('/'))

    output_location = os.path.join(pdf_folder, f"{file_id}_{file_name}")

    urllib.request.urlretrieve(file_url, output_location)

    return output_location

def _mkdir(_dir):
    if os.path.isdir(_dir): pass
    elif os.path.isfile(_dir):
        raise OSError("%s exists as a regular file." % _dir)
    else:
        parent, directory = os.path.split(_dir)
        if parent and not os.path.isdir(parent): _mkdir(parent)
        if directory: os.mkdir(_dir)

            
def save_json(data, location):
    """Serialise ``data`` as JSON and write it to ``location``.

    Missing parent folders of ``location`` are created first. Any existing file
    at ``location`` is overwritten.

    Args:
        data: Any object ``json.dumps`` can serialise.
        location: Path of the JSON file to write.

    Raises:
        TypeError / ValueError: if ``data`` cannot be serialised (raised before
            the target file is opened, so no empty file is left behind).
        OSError: if the folder or file cannot be created.
    """
    # serialise first so a failure does not truncate an existing file
    jsonOutput = json.dumps(data)

    # make parent folder(s) if needed
    parent = os.path.dirname(location)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # save json; json.dumps escapes non-ASCII by default, the explicit encoding
    # just makes the write independent of the platform's locale
    with open(location, "w", encoding="utf-8") as json_file:
        json_file.write(jsonOutput)

def _merge_entity_tokens(entities):
    """Merge the token-level predictions of one sentence into (label, text) pairs.

    Each item of ``entities`` is read as a dict with the keys ``word``,
    ``entity`` and ``index``. Tokens whose word starts with ``##`` are glued to
    the previous token, other tokens of the same entity are joined with a space.
    ``[UNK]`` and ``[CLS]`` tokens are ignored.
    """
    merged = []
    concatenatedEntity = ''
    currentLabel = False
    prevEntity = {'index': 0}

    for entity in entities:
        if entity['word'] == '[UNK]' or entity['word'] == '[CLS]':
            continue

        # beginning of entity: a B- tag, or a gap in the token indices
        if entity['word'][:2] != '##' and (entity['entity'][:1] == 'B' or prevEntity['index']+1 < entity['index']):
            # save previous entity
            if currentLabel:
                merged.append((currentLabel, concatenatedEntity))
            concatenatedEntity = entity['word']
            currentLabel = entity['entity'][2:]

        # continuation of word in entity
        elif entity['word'][:2] == '##':
            concatenatedEntity += entity['word'][2:]

        # new word in same entity
        else:
            concatenatedEntity += ' '+entity['word']

        prevEntity = entity

    # the entity still being assembled when the sentence ends must be stored
    # too; previously it was silently dropped
    if currentLabel:
        merged.append((currentLabel, concatenatedEntity))

    return merged


def run_ner_on_pdf(fileLocation, outputFolder, language = 'dutch'):
    """Extract text and named entities from every page of a PDF.

    For each page a file ``<outputFolder>/page<N>.json`` is written through
    ``save_json`` with the keys ``page_number`` and ``content`` and, when
    available, ``ner_entities``, ``timespans``, ``minYear``, ``maxYear`` and
    ``maxYearExcludingRecent`` (largest end year below 1950).

    NOTE(review): this function relies on the module-level names ``predictor``
    and ``timeperiod2daterange``; in the visible part of this file the first is
    only created inside a disabled string block and the import of the second is
    commented out. Until they are enabled every call ends in the generic error
    branch below.

    Args:
        fileLocation: Path of the PDF to process.
        outputFolder: Folder the page JSON files are written to.
        language: Language name handed to ``nltk.sent_tokenize``.

    Returns:
        True when all pages were processed, False when anything went wrong
        (the error is printed, never raised).
    """
    # defaultdict is only imported inside the disabled BERT block at the top of
    # the file, so bring it into scope here
    from collections import defaultdict

    try:
        # create a pdf reader object
        try:
            reader = PdfReader(fileLocation)
        except Exception as error:
            print('PDF reading error')
            print(error)
            return False # something wrong with pdf, skip file

        # loop through pages
        for i in range(0, len(reader.pages)):

            pageNumber = i + 1
            page = reader.pages[i]

            pageEntities = defaultdict(list)
            pageTimespans = []

            try:
                pageText = page.extract_text()
            except Exception as error:
                print('PDF reading error')
                print(error)
                return False # something wrong with pdf, skip file

            pageText = pageText.replace('\n',' ') # replace line endings with spaces

            # create sentences
            sent_text = nltk.sent_tokenize(pageText, language = language) # this gives us a list of sentences

            # loop through sentences
            for sentence in sent_text:
                for label, text in _merge_entity_tokens(predictor(sentence)):
                    pageEntities[label].append(text)
                    if label == 'PER':
                        # assumes detection2daterange returns a falsy value or a
                        # (start year, end year) pair of numbers -- TODO confirm
                        timespan = timeperiod2daterange.detection2daterange(text)
                        if timespan:
                            pageTimespans.append({'startdate':timespan[0],'enddate':timespan[1]})

            # save as json
            pageJsonOutput = {
                "page_number":pageNumber,
                "content":pageText
            }
            if pageEntities:
                pageJsonOutput["ner_entities"] = pageEntities
            if pageTimespans:
                pageJsonOutput["timespans"] = pageTimespans
                # derive the year range from the collected timespans instead of
                # tracking it with False sentinels, which mistook year 0 for "unset"
                endYears = [span['enddate'] for span in pageTimespans]
                pageJsonOutput["minYear"] = min(span['startdate'] for span in pageTimespans)
                pageJsonOutput["maxYear"] = max(endYears)
                olderEndYears = [year for year in endYears if year < 1950]
                if olderEndYears:
                    pageJsonOutput["maxYearExcludingRecent"] = max(olderEndYears)

            save_json(pageJsonOutput, outputFolder+'/page'+str(pageNumber)+'.json')

        return True
    except Exception as error:
        # not necessarily a PDF problem: NER, date conversion and JSON writing
        # errors end up here as well
        print('error while processing '+str(fileLocation))
        print(error)
        return False
        
def pdf2html(file_location, html_root = '/data1/brandsena/document-sources/oe_vlaanderen/html/'):
    """convert single file from pdf to html

    Runs the external ``pdftohtml`` tool and writes its output to
    ``<html_root>/<pdf name without '.pdf'>/index.html``. If that folder already
    exists the file is skipped. On failure the error is printed and the folder
    is removed again so a later run can retry.

    Args:
        file_location: Path of the PDF to convert.
        html_root: Folder that holds one sub-folder per converted PDF. Defaults
            to the location that used to be hard-coded here.
    """
    # local import: shutil is not among the imports at the top of the file
    import shutil

    # TODO maybe use md5 of file for folder name to stop clashes?
    htmlDir = os.path.join(html_root, file_location.split('/')[-1].replace('.pdf', ''))
    if os.path.isdir(htmlDir):
        print (file_location + " html folder already exists, skipping")
        return

    # first create folder for html to go in
    os.makedirs(htmlDir, exist_ok=True)

    # argument list without a shell: file names containing spaces, quotes,
    # parentheses, '&' etc. are passed through literally
    cmnd = ['pdftohtml', '-c', '-nodrm', file_location, htmlDir + '/index.html']
    try:
        subprocess.check_output(
            cmnd, stderr=subprocess.STDOUT,
            universal_newlines=True)
    except (subprocess.CalledProcessError, OSError) as exc:
        # OSError covers a missing pdftohtml binary, which no longer surfaces
        # as a shell exit status now that no shell is involved
        if isinstance(exc, subprocess.CalledProcessError):
            details = exc.output or ''
        else:
            details = str(exc)
        errormsg = "pdftohtml error for file " + file_location + " " + details
        print (errormsg)
        #logger.error(errormsg)
        # rmtree instead of rmdir: pdftohtml may have left partial output behind
        shutil.rmtree(htmlDir, ignore_errors=True)
    else:
        print ("Converted " + file_location + " to html!")
        
        
def rd2wgs (x,y):
    """Calculate WGS84 coordinates

    Both inputs are converted with ``int()``, shifted by the reference point
    (155000, 463000) and scaled by 10^-5; two polynomials in those offsets give
    the latitude and longitude offsets in units of 1/3600 degree.

    Returns:
        Tuple ``(lat, lon)`` in decimal degrees.
    """
    scale = 10 ** -5
    dX = (int(x) - 155000) * scale
    dY = (int(y) - 463000) * scale

    # terms of the latitude polynomial, in summation order
    north_terms = (
        3235.65389 * dY,
        -32.58297 * dX ** 2,
        -0.2475 * dY ** 2,
        -0.84978 * dX ** 2 * dY,
        -0.0655 * dY ** 3,
        -0.01709 * dX ** 2 * dY ** 2,
        -0.00738 * dX,
        0.0053 * dX ** 4,
        -0.00039 * dX ** 2 * dY ** 3,
        0.00033 * dX ** 4 * dY,
        -0.00012 * dX * dY,
    )

    # terms of the longitude polynomial, in summation order
    east_terms = (
        5260.52916 * dX,
        105.94684 * dX * dY,
        2.45656 * dX * dY ** 2,
        -0.81885 * dX ** 3,
        0.05594 * dX * dY ** 3,
        -0.05607 * dX ** 3 * dY,
        0.01199 * dY,
        -0.00256 * dX ** 3 * dY ** 2,
        0.00128 * dX * dY ** 4,
        0.00022 * dY ** 2,
        -0.00022 * dX ** 2,
        0.00026 * dX ** 5,
    )

    # plain left-to-right accumulation keeps the floating point result
    # identical to one long chained sum
    SomN = 0.0
    for term in north_terms:
        SomN += term

    SomE = 0.0
    for term in east_terms:
        SomE += term

    lat = 52.15517 + (SomN / 3600)
    lon = 5.387206 + (SomE / 3600)

    return lat,lon







ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Build a document.json with Archis metadata for every file found under pdf_folder.
pdf_folder = 'pdf/'
json_folder = 'json/'
archis_zaakdocumenten_location = 'zaakdoc-vondstloc-join.csv'

archis_zaakdocumenten = pd.read_csv(archis_zaakdocumenten_location)
#print(archis_zaakdocumenten)

# os.mkdir below only creates the last path component, so make sure the root exists
os.makedirs(json_folder, exist_ok=True)

for directory, subdirectories, files in os.walk(pdf_folder):
    for file in files:

        file_location = os.path.join(directory, file)

        print(f"indexing: {file}")

        output_folder = json_folder+file.replace('.pdf','')

        # if not done yet
        if os.path.exists(output_folder):
            print('Output folder for '+file+' already exists, skipping')
            continue #skip this file if it already exists
        else:
            # create folder, do try just in case other process has already made the folder
            try:
                os.mkdir(output_folder)
                print('created folder '+output_folder)
            except OSError:
                print('cannot create folder '+output_folder)
                continue

        # the zaakidentificatie is built from the file name: the part before the
        # first '_' without its first character, followed by '100'
        archis_zaakidentificatie = int(file.split('_')[0][1:]+'100')
        #print(archis_zaakidentificatie)

        record = archis_zaakdocumenten.loc[archis_zaakdocumenten['zaakidentificatie'] == archis_zaakidentificatie]

        if len(record) == 0: # no result in db, log and skip
            print(f"no entry in db for {archis_zaakidentificatie}, skipping")
            continue

        if len(record) > 1: # multiple rows with same zaakidentificatie, get one with document_id
            record = record.loc[record['document_id'].notnull()]
            if len(record) == 0: # none of the duplicates has a document_id
                print(f"no entry with document_id in db for {archis_zaakidentificatie}, skipping")
                continue

        file_name = file.replace(' ','_')

        output_document = {}
        output_document['source'] = 'archis'
        output_document['file_name'] = file_name
        output_document['file_type'] = 'report'
        output_document['title'] = record['titel'].values[0]
        # split on multiple characters, because messy data; a missing author
        # (NaN in the csv) gives an empty list instead of a TypeError
        auteur = record['auteur'].values[0]
        creators = re.split(',|&| en |/|;', auteur) if isinstance(auteur, str) else []
        output_document['creators'] = creators
        output_document['description'] = '' # no descriptions in this data source
        output_document['publisher'] = '' # no publisher in this data source
        output_document['createdAt'] = int(record['jaar'].values[0])
        output_document['identifiers'] = {
            'uri': record['link'].values[0],
            'archis_zaakidentificatie': int(record['zaakidentificatie'].values[0]),
            'archis_zaak_id': int(record['zaak_id'].values[0]),
            'archis_identificatie': str(record['identificatie'].values[0])
        }
        output_document['language'] = 'Dutch'
        output_document['html_folder_name'] = f"{archis_zaakidentificatie}_{file_name.replace('.pdf','')}"

        #coordinates: only when BOTH values are present (the y value used to go unchecked)
        x_coordinaat = record['x_coordinaat'].values[0]
        y_coordinaat = record['y_coordinaat'].values[0]
        if pd.notna(x_coordinaat) and pd.notna(y_coordinaat):
            coordX = int(x_coordinaat)
            coordY = int(y_coordinaat)
            lat, lon = rd2wgs(coordX,coordY)
            output_document['coordX'] = coordX
            output_document['coordY'] = coordY
            output_document['location'] = {'lat':lat,'lon':lon}

        # save document.json 2DO get json location from settings
        # NOTE(review): this folder uses file_name (spaces replaced by '_') while
        # output_folder above uses the raw file name, so for PDFs with spaces in
        # their name the two differ -- confirm which one is intended.
        json_output_folder = f"{json_folder}{file_name.replace('.pdf','')}"
        save_json(output_document, f"{json_output_folder}/document.json")


"""
                    # process pdf, store page.json files with entities 
                    run_ner_on_pdf(
                        pdf_location, 
                        json_output_folder, 
                        'dutch'
                    )

                    # make and save html version of pdf
                    pdf2html(pdf_location)

"""



print('done!')


In [None]:
# Scratch lookup: select the rows of archis_zaakdocumenten (loaded in an earlier
# cell) whose zaakidentificatie equals one hard-coded number, and display them.
archis_zaaknummer = 4000543100
print(archis_zaaknummer)
record = archis_zaakdocumenten.loc[archis_zaakdocumenten['zaakidentificatie'] == archis_zaaknummer]

record

In [None]:
# When more than one row matched, keep only the rows that have a document_id.
# Note: this rebinds `record`, so re-running the cell filters the already filtered frame.
if len(record) > 1:
    record = record.loc[record['document_id'].notnull()]
    

In [None]:
# Display the 'link' value of the first row in `record` (raises IndexError if `record` is empty).
record['link'].values[0]

In [None]:
a =