# KG index pdf-to-text pipeline

In [None]:
import os
import sys
import json
import pathlib
import logging
from datetime import datetime

import fitz

# Route log records of level INFO and above to stdout instead of the default stderr.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [None]:
# Name of the corpus to process; it selects the properties file
# corpora/<CORPUS>.properties that is loaded further down.
CORPUS = 'ArxivHealthcareNLP'
# Alternative corpus, currently disabled:
#CORPUS = 'arxiv_cl'

In [None]:
def load_properties(filepath, sep='=', comment_char='#'):
    '''
    Parse a simple ``key<sep>value`` properties file into a dict.

    Blank lines and lines that start with ``comment_char`` (after stripping
    surrounding whitespace) are ignored. Every other line is split at the
    first occurrence of ``sep``: the left part, stripped, becomes the key;
    the right part, stripped of whitespace and then of surrounding double
    quotes, becomes the value. A line without ``sep`` maps the whole line
    to an empty string. When a key occurs more than once, the last
    occurrence wins.

    Parameters:
        filepath: path of the properties file to read (opened in text mode).
        sep: separator between key and value.
        comment_char: prefix that marks a line as a comment.

    Returns:
        dict mapping each key (str) to its value (str).
    '''
    properties = {}
    with open(filepath, "rt") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # Skip empty lines and comments.
            if not entry or entry.startswith(comment_char):
                continue
            # Only the first separator splits; later ones belong to the value.
            name, _, remainder = entry.partition(sep)
            properties[name.strip()] = remainder.strip().strip('"')
    return properties

# Load the settings of the selected corpus from corpora/<CORPUS>.properties
# (path is relative to the current working directory).
corpus_properties = load_properties(f"corpora/{CORPUS}.properties")
# Bare expression with no effect in a script; presumably kept so a notebook
# cell displays the loaded properties.
corpus_properties

In [None]:
# Derive the corpus directories from the loaded properties. Both derived
# paths deliberately end with '/', because file names are appended to them
# by plain string concatenation elsewhere in this file.
CORPUS_BASE = corpus_properties['corpus_base']
PDF_BASE = CORPUS_BASE + '/pdf/'
JSON_BASE = CORPUS_BASE + '/json_raw/'

# Create the JSON output directory (including parents) on first use.
if not os.path.exists(JSON_BASE):
    print(f'{JSON_BASE} does not exist. Creating.')
    os.makedirs(JSON_BASE)

In [None]:
from os import listdir
from os.path import isfile, join
# Collect the names of all regular files directly inside PDF_BASE.
# Sub-directories are skipped; there is no file-extension check, so every
# file found here is treated as a PDF.
pdf_files = [
    entry
    for entry in listdir(PDF_BASE)
    if isfile(join(PDF_BASE, entry))
]
print(f'Converting {len(pdf_files)} PDF files to text/json.')

In [None]:
def fix_document_name(document, pdf_name):
    '''
    Set the 'title' entry of ``document.metadata`` to ``pdf_name``.

    The title is later used by save_content() to build the output file
    name, so normalizing it to the PDF file name makes that name predictable.

    Parameters:
        document: object exposing a dict-like ``metadata`` attribute.
        pdf_name: value to store as the title.

    Returns:
        None; ``document.metadata`` is modified in place.
    '''
    metadata = document.metadata
    metadata['title'] = pdf_name


In [None]:
def get_pdf_content(document):
    '''
    Return the text of all pages of ``document`` concatenated in page order.

    Parameters:
        document: iterable of page objects, each providing ``get_text()``
            that returns a string.

    Returns:
        A single string with the page texts joined without any separator;
        the empty string when the document has no pages.
    '''
    # join() builds the result in one pass instead of growing a string
    # with += once per page.
    return "".join(page.get_text() for page in document)

In [None]:
def is_pymupdf_extractable(document, content, min_chars=200):
    '''
    Decide whether the extracted text is long enough to be worth saving.

    The only criterion is the length of ``content`` in characters. (An
    earlier variant also required the PDF 'creator' metadata to be in an
    allow-list; that check is no longer applied.)

    Parameters:
        document: object exposing a dict-like ``metadata`` attribute; only
            read to report the title and creator of rejected documents.
        content: extracted text of the document.
        min_chars: length that ``content`` must exceed (strictly) to be
            accepted. Defaults to 200.

    Returns:
        True when ``len(content) > min_chars``; otherwise False, after
        printing a message that identifies the rejected document.
    '''
    if len(content) > min_chars:
        return True

    # Use .get() (and tolerate missing metadata) so that a document without
    # a 'creator' entry is reported as too short instead of raising while
    # building the message.
    metadata = document.metadata or {}
    print(f'Document too short: {len(content)} characters {metadata.get("title")} {metadata.get("creator")}')
    return False
    
def write_json_file(filename, content):
    '''
    Write ``content`` as UTF-8 bytes to the file ``JSON_BASE + filename``.

    The path is built by plain string concatenation, so JSON_BASE is
    expected to end with a path separator. Leading and trailing ASCII
    whitespace is stripped from the encoded bytes before writing. An
    existing file is overwritten.

    Parameters:
        filename: file name relative to JSON_BASE.
        content: text to store.
    '''
    target = pathlib.Path(JSON_BASE + filename)
    payload = content.encode('utf-8').strip()
    target.write_bytes(payload)

def save_content(document,content):
    '''
    Serialize the extracted text plus basic metadata to a JSON file.

    The output file is named ``<title>.json``, where the title is read from
    ``document.metadata['title']``, and is written through
    write_json_file(). The JSON object has the keys 'title', 'text',
    'extraction_date' (current UTC time as a naive timestamp string, e.g.
    '2024-01-31 12:34:56.789012') and 'num_pages'.

    Parameters:
        document: object exposing ``metadata['title']`` and ``page_count``.
        content: extracted text of the document.
    '''
    # Function-scope import: the file's top-level import only provides
    # `datetime`, not `timezone`.
    from datetime import timezone

    title = document.metadata['title']
    # datetime.utcnow() is deprecated since Python 3.12. Taking the aware
    # UTC time and dropping tzinfo yields the same naive value, so the
    # stored string keeps its previous format.
    extracted_at = datetime.now(timezone.utc).replace(tzinfo=None)
    document_dict = {
        'title': title,
        'text': content,
        'extraction_date': str(extracted_at),
        'num_pages': document.page_count,
    }
    write_json_file(title + '.json', json.dumps(document_dict))

In [None]:

# Convert every listed PDF that has no JSON output yet. Documents whose
# extracted text is too short are not saved, so they are retried on each run.
for pdf_name in pdf_files:
    # Build the output path with join() so it matches the location
    # write_json_file() writes to (JSON_BASE already ends with a separator;
    # formatting it as '{JSON_BASE}/...' would double it).
    file_name = join(JSON_BASE, f'{pdf_name}.json')
    if os.path.exists(file_name):
        # Already converted on a previous run.
        continue
    try:
        with fitz.open(join(PDF_BASE, pdf_name)) as document:
            content = get_pdf_content(document)
            # Normalize the title first: it is used in the "too short"
            # report and as the output file name.
            fix_document_name(document, pdf_name)
            if is_pymupdf_extractable(document, content):
                save_content(document, content)
    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next
        # file rather than aborting the whole conversion.
        print(f'Cannot process {file_name} because {e}')

