# Knowledge graph index: PDF pipeline

In [1]:
import os
import time

import openai
# Register the API key and base URL once in the environment, then mirror the
# same values onto the openai module attributes.
# NOTE(review): presumably an OpenAI-compatible server on the local network — confirm.
os.environ.update({
    'OPENAI_API_KEY': "EMPTY",
    'OPENAI_API_BASE': "http://10.0.0.222:30307/v1",
})
openai.api_key = os.environ['OPENAI_API_KEY']
openai.api_base = os.environ['OPENAI_API_BASE']

# Model identifier selected for this run; alternatives are kept commented out.
model = "Writer/camel-5b-hf"
#model = "mosaicml/mpt-7b-instruct"
#model = "mosaicml/mpt-30b-instruct"

In [2]:
import logging
import sys

# kron extensions to llama_index that add support for OpenAI-compatible APIs
#sys.path.append('../llama_index')

# Send log records of level INFO and above to stdout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)

In [None]:
# Name of the corpus to process; the alternative corpus is kept commented out.
#CORPUS = 'ArxivHealthcareNLP'
CORPUS = "arxiv_cl"

In [None]:
def load_properties(filepath, sep='=', comment_char='#'):
    '''
    Parse a properties-style text file into a dictionary.

    Each line is stripped of surrounding whitespace; blank lines and lines
    that begin with ``comment_char`` are skipped. Every remaining line is cut
    at the first occurrence of ``sep``: the text before it becomes the key and
    everything after it (including any further ``sep`` occurrences) becomes
    the value. Keys and values are whitespace-stripped, and double quotes are
    removed from both ends of the value. A line without ``sep`` maps its whole
    text to an empty string, and a repeated key keeps its last value.

    Args:
        filepath: path of the file to read (opened in text mode).
        sep: separator between key and value.
        comment_char: prefix that marks a line as a comment.

    Returns:
        dict mapping each key to its string value.
    '''
    properties = {}
    with open(filepath, "rt") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            # Guard clause: nothing to record for blank or comment lines.
            if not entry or entry.startswith(comment_char):
                continue
            # partition() cuts at the first separator only, so the value keeps
            # any later separators verbatim.
            name, _, remainder = entry.partition(sep)
            properties[name.strip()] = remainder.strip().strip('"')
    return properties

# Read the settings file that belongs to the selected corpus; the bare name on
# the last line displays the resulting dict as the cell output.
properties_path = f"corpora/{CORPUS}.properties"
corpus_properties = load_properties(properties_path)
corpus_properties

In [3]:
from datetime import datetime
import json
import pathlib
import fitz

# Root directory of the corpus on disk. PDFs are listed from the pdf/ subfolder
# and the extracted JSON files are written under json_raw/.
CORPUS_BASE = '/home/arylwen/datasets/documents/ArxivHealthcareNLP'
PDF_BASE = CORPUS_BASE + '/pdf/'
JSON_BASE = CORPUS_BASE + '/json_raw/'

JSON_BASE

'/home/arylwen/datasets/documents/ArxivHealthcareNLP/json_raw/'

In [4]:
from os import listdir
from os.path import isfile, join
# os.listdir returns entries in arbitrary order, so sort the names to make the
# processing order (and the displayed list) reproducible across runs and machines.
# Only regular files directly inside PDF_BASE are kept; sub-directories are skipped.
pdf_files = sorted(f for f in listdir(PDF_BASE) if isfile(join(PDF_BASE, f)))
pdf_files

['2211.01705v1.A_speech_corpus_for_chronic_kidney_disease.pdf',
 '2211.01761v1.PromptEHR__Conditional_Electronic_Healthcare_Records_Generation_with_Prompt_Learning.pdf',
 '2211.03536v1.Knowledge_Graph_Embedding__A_Survey_from_the_Perspective_of_Representation_Spaces.pdf',
 '2211.03818v1.CELLS__A_Parallel_Corpus_for_Biomedical_Lay_Language_Generation.pdf',
 '2211.04013v1.COV19IR___COVID_19_Domain_Literature_Information_Retrieval.pdf',
 '2211.04569v1.Toward_a_Neural_Semantic_Parsing_System_for_EHR_Question_Answering.pdf',
 '2211.04759v1.Nested_Named_Entity_Recognition_from_Medical_Texts__An_Adaptive_Shared_Network_Architecture_with_Attentive_CRF.pdf',
 '2211.06778v1.Textual_Data_Augmentation_for_Patient_Outcomes_Prediction.pdf',
 '2211.07047v2.Language_Model_Classifier_Aligns_Better_with_Physician_Word_Sensitivity_than_XGBoost_on_Readmission_Prediction.pdf',
 '2211.07126v3.Discharge_Summary_Hospital_Course_Summarisation_of_In_Patient_Electronic_Health_Record_Text_with_Clinical_Concept_Gu

In [5]:
def fix_document_name(document, pdf_name):
    '''
    Overwrite the title stored in ``document.metadata`` with ``pdf_name``.

    The replacement is unconditional: the empty-title check below is commented
    out, so whatever title the metadata already holds is discarded. Nothing is
    returned; only the metadata mapping is changed.

    Args:
        document: object exposing a ``metadata`` mapping.
        pdf_name: value stored under the ``'title'`` key.
    '''
    # some documents have really long names
    #if document.metadata['title'].strip() == '':
    metadata = document.metadata
    metadata['title'] = pdf_name


In [6]:
def get_pdf_content(document):
    '''
    Return the text of every page of ``document`` joined into one string.

    ``document`` is iterated once and ``get_text()`` is called on each item.
    The page texts are concatenated in iteration order with no separator
    added; a document without pages yields ``""``.
    '''
    return "".join(page.get_text() for page in document)

#print(get_pdf_content(document))

In [7]:
def is_pymupdf_extractable(document,content):
    '''
    Decide whether the text extracted from ``document`` is worth keeping.

    The creator recorded in the document metadata is always printed. The
    decision itself depends only on the amount of extracted text: ``content``
    must be longer than 20 characters. When it is not, the document's title
    and creator are printed and ``False`` is returned.

    An allow-list of creators (Springer, Pages, LaTeX with hyperref / acmart)
    used to be built here but was never consulted, so it has been removed.

    Args:
        document: object with a ``metadata`` mapping holding ``'title'`` and
            ``'creator'`` entries; either value may be ``None``.
        content: text extracted from the document.

    Returns:
        ``True`` if ``len(content)`` exceeds the threshold, else ``False``.
    '''
    metadata = document.metadata
    print(metadata['creator'])
    # len() of a str counts characters, not words, so the limit is named accordingly.
    min_characters = 20
    if len(content) > min_characters:
        return True
    # Metadata values may be None when no title/creator is recorded; format
    # them instead of concatenating so the rejection message cannot raise
    # TypeError. Output is unchanged when both values are strings.
    print(f"{metadata['title']} {metadata['creator']}")
    return False
    
def write_json_file(filename, content):
    '''
    Write ``content`` as UTF-8 bytes to the file ``JSON_BASE + filename``.

    The text is encoded first and the resulting bytes are then stripped of
    leading and trailing whitespace before being written. The target path is
    formed by plain string concatenation with the module-level ``JSON_BASE``.
    '''
    target = pathlib.Path(JSON_BASE + filename)
    payload = content.encode('utf-8')
    target.write_bytes(payload.strip())

def save_content(document,content):
    '''
    Serialize the extracted text and basic document facts to a JSON file.

    The record holds the metadata title, the extracted text, the current UTC
    timestamp rendered with ``str()``, and the document's page count. It is
    dumped with ``json.dumps`` and handed to ``write_json_file`` under the
    name ``<title>.json``.

    Args:
        document: object providing ``metadata['title']`` and ``page_count``.
        content: extracted text stored under the ``'text'`` key.
    '''
    title = document.metadata['title']
    filename = title + '.json'
    record = {
        'title': title,
        'text': content,
        'extraction_date': str(datetime.utcnow()),
        'num_pages': document.page_count,
    }
    write_json_file(filename, json.dumps(record))

In [8]:

# Extract every listed PDF: pull the text of all pages, skip documents whose
# text is too short to be useful, and store the rest as JSON named after the
# PDF file.
for pdf_name in pdf_files:
    pdf_path = join(PDF_BASE, pdf_name)
    with fitz.open(pdf_path) as document:
        content = get_pdf_content(document)
        if not is_pymupdf_extractable(document, content):
            continue
        # The title is replaced with the file name before saving because
        # save_content derives the output file name from it.
        fix_document_name(document, pdf_name)
        save_content(document, content)


Certified by IEEE PDFeXpress at September 29, 2022 07:16:23 
LaTeX with hyperref
LaTeX with acmart 2021/08/29 v1.79 Typesetting articles for the Association for Computing Machinery and hyperref 2020-05-15 v7.00e Hypertext links for LaTeX
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
Hwp 2018 10.0.0.11808
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
LaTeX with hyperref
TeX
Microsoft® Word LTSC
LaTeX with hyperref
LaTeX with hyperref
Word
Microsoft® Word for Microsoft 365
Word
LaTeX with hyperref
LaTeX with acmart 2020/09/13 v1.73 Typesetting articles for the Association for Computing Machinery and hyperre