## Running ATRIUM information extraction pipeline on full text extracted from OASIS PDF reports

In [1]:
%%capture
import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets

%sx python -m spacy download en_core_web_sm

In [None]:
import spacy # for text processing
from spacy.language import Language
from datetime import datetime as DT # for timestamps
import json, os
from slugify import slugify # for valid filenames from identifiers
from weasyprint import HTML
#from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary, TextNormalizer
from rematch2 import DocSummary

from ATRIUM_T4_1_2_IE_pipeline import get_pipeline

# using predefined spaCy pipeline (English)
nlp = get_pipeline("en")

# input directory containing text files to process
#input_directory = "./data/oasis/journals_july_2024/text extraction - new" # Mugdha's output
input_directory = "./data/oasis/journals_july_2024/text_extraction-20251117" # Mark's script re-extracted text 2025-11-17

# subset of names of files to process
file_names = [
    "text_extraction_120_031_097.pdf.json",
    "text_extraction_2022_96_001_012_cooper_garton.pdf.json",
    "text_extraction_2022_96_013-068_huxley.pdf.json",
    "text_extraction_archael547-005-040-breeze.pdf.json",
    "text_extraction_archael547-079-116-ceolwulf.pdf.json",
    "text_extraction_daj_v023_1901_040-047.pdf.json",
    "text_extraction_daj_v086_1966_031-053.pdf.json",
    "text_extraction_nas_20_1985_67-86_jackson.pdf.json",
    "text_extraction_nas_20_1985_87-112_taylor.pdf.json",
    "text_extraction_surreyac103_063-090_lambert.pdf.json"
]

# timestamp for use in directory names
timestamp = DT.now().strftime('%Y%m%d')   

# create output file path if it does not already exist
output_directory = os.path.join(input_directory, f"ie-output-{timestamp}")
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

counter = 0
print("scanning input file directory")
for entry in os.scandir(input_directory):
    
    #if entry.is_file() and entry.name.endswith(".txt"): 
    if entry.is_file() and entry.name.lower() in file_names:  
        # temp break for testing
        #counter += 1        
        #if counter > 3: 
            #break

        ts_started = DT.now()
        
        # read text contents of input file
        input_file_name = entry.name        
        
        print(f"reading {input_file_name}")
        input_file_text = ""        
        with open(entry.path) as input_file:
            if(entry.name.endswith(".json")):
                data = json.load(input_file)
                input_file_text = data.get("text","")
            else:
                input_file_text = input_file.read()
        print(f"read '{input_file_name}' in {DT.now() - ts_started}ms")
        
        # set up metadata to include in output
        metadata = {
            "identifier": input_file_name,
            "title": "vocabulary-based information extraction results",
            "description": "vocabulary-based information extraction annotation on ADS OASIS journal report full-text",
            "creator": "T4-1-2-IE-OASIS-reports-full-text.ipynb",
            #"periodo_authority_id": periodo_authority_id,
            "pipeline": nlp.pipe_names,
            "input_file_name": input_file_name,
            "input_record_count": 1
        }

        # perform annotation on input text
        ts_nlp = DT.now() 
        print("running nlp pipeline") 
        doc = nlp(input_file_text)
        print(f"finished nlp pipeline in {DT.now() - ts_nlp}")

        ts_sum = DT.now() 
        print("summarizing results") 
        summary = DocSummary(doc, metadata=metadata)
        print(f"finished summarizing results in {DT.now() - ts_sum}")
        
        ts_finished = DT.now()
        metadata["starting"] = ts_started.strftime('%Y-%m-%dT%H:%M:%SZ')
        metadata["finished"] = ts_finished.strftime('%Y-%m-%dT%H:%M:%SZ')
        metadata["duration"] =  ts_finished - ts_started

        # write results to text files
        #html_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.html") 
        #text_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.txt")
        csv_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.csv")
        json_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.json")
        pdf_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.pdf")
        
        ts_started = DT.now()   
        print("creating CSV file")
        with open(csv_file_name, "w") as file:
            file.write(summary.spans_to_csv())
        print(f"finished creating CSV file in {DT.now() - ts_started}")

        # note last run took 21 mins for 2 files
        # 10/07/25 - now 1:10:31 for 10 files (JSON, TEXT and HTML output)
        # 17/11/25 - now 0:25:50 for 10 files (JSON, CSV and PDF output) (now omitting negation pairs)
        ts_started = DT.now()   
        print("creating PDF report")
        report = summary.report(format="html")
        HTML(None, string=report, encoding="utf-8").write_pdf(target=pdf_file_name)                
        print(f"finished creating PDF report in {DT.now() - ts_started}")

        #ts_started = DT.now()
        #print("creating TEXT report")
        #report = summary.report(format="text")
        #with open(text_file_name, "w") as file:
            #file.write(report)
        #print(f"finished creating TEXT report in {DT.now() - ts_started}")
             
        ts_started = DT.now()
        print("creating JSON report")
        report = summary.report(format="json")
        with open(json_file_name, "w") as file:
            file.write(report) 
        print(f"finished creating JSON report in {DT.now() - ts_started}")

scanning input file directory
reading text_extraction_archael547-079-116-ceolwulf.pdf.json
read 'text_extraction_archael547-079-116-ceolwulf.pdf.json' in 0:00:00.002980ms
running nlp pipeline
finished nlp pipeline in 0:02:27.772107
summarizing results
finished summarizing results in 0:00:00.006653
creating CSV file
finished creating CSV file in 0:00:00.024600
creating PDF report
"_get_dependency_pairs" ran in 41.978 seconds
"_get_noun_chunk_pairs" ran in 5.034 seconds
"_get_all_pairs" ran in 47.013 seconds
"get_span_pairs" ran in 47.013 seconds
"report_to_html" ran in 48.709 seconds
"report" ran in 48.709 seconds
finished creating PDF report in 0:00:55.553863
creating JSON report
"report_to_json" ran in 0.203 seconds
"report" ran in 0.203 seconds
finished creating JSON report in 0:00:00.206163
reading text_extraction_NAS_20_1985_87-112_Taylor.pdf.json
read 'text_extraction_NAS_20_1985_87-112_Taylor.pdf.json' in 0:00:00.005836ms
running nlp pipeline
finished nlp pipeline in 0:01:11.6602