## Running ATRIUM information extraction pipeline on full text extracted from OASIS PDF reports

In [4]:
%%capture
import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets

%sx python -m spacy download en_core_web_sm

In [1]:
import spacy # for text processing
from spacy.language import Language
from datetime import datetime as DT # for timestamps
import json, os
from slugify import slugify # for valid filenames from identifiers
from weasyprint import HTML
#from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary, TextNormalizer
from rematch2 import DocSummary
from rematch2.SpanScorer import SpanScorer

from ATRIUM_T4_1_2_IE_pipeline import get_pipeline

# Batch-run the information extraction pipeline over a selected subset of
# text-extraction files and write CSV, PDF and JSON outputs for each one.

# using predefined spaCy pipeline (English)
ts_started = DT.now()
print("setting up nlp pipeline")
nlp = get_pipeline("en")
print(f"finished setting up nlp pipeline in {DT.now() - ts_started}")

# input directory containing text files to process
#input_directory = "./data/oasis/journals_july_2024/text extraction - new" # Mugdha's output
input_directory = "./data/oasis/journals_july_2024/text_extraction-20251117" # Mark's script re-extracted text 2025-11-17

# subset of files to process for 27/11/2025 workshop at ADS
file_names = [
    #"text_extraction_120_031_097.pdf.json",#
    "text_extraction_2022_96_013-068_Huxley.pdf.json",#
    #"text_extraction_archael547-005-040-breeze.pdf.json",#
    #"text_extraction_archael547-079-116-ceolwulf.pdf.json",#
    "text_extraction_DAJ_v023_1901_040-047.pdf.json",#
    #"text_extraction_DAJ_v106_1986_018-100.pdf.json",#
    "text_extraction_SAC118_Garton.pdf.json",
    #"text_extraction_surreyac103_091-172_haslam.pdf.json",#
    #"text_extraction_surreyac103_185-266_saxby.pdf.json",#
    "text_extraction_2022_96_001_012_Cooper_Garton.pdf.json"#
]

# lower-cased names, built once, for case-insensitive matching of directory entries
wanted_names = {name.lower() for name in file_names}

# timestamp for use in directory names
timestamp = ts_started.strftime('%Y%m%d')

# create output file path if it does not already exist
output_directory = os.path.join(input_directory, f"ie-output-{timestamp}")
os.makedirs(output_directory, exist_ok=True)

# number of input files fully processed (reported at the end of the run)
counter = 0
print("scanning input file directory")
# context manager ensures the directory iterator is closed even if a file fails
with os.scandir(input_directory) as entries:
    for entry in entries:
        # only process regular files that are in the selected subset
        if not entry.is_file() or entry.name.lower() not in wanted_names:
            continue

        # read contents of input (JSON) file; non-JSON files are treated as plain text
        print(f"reading {entry.name}")
        input_file_content = {}
        # explicit encoding so reading does not depend on the platform locale
        with open(entry.path, encoding="utf-8") as input_file:
            if entry.name.endswith(".json"):
                input_file_content = json.load(input_file)
            else:
                input_file_content = { "text": input_file.read() }

        # set up metadata to include in output
        metadata = {
            "identifier": entry.name,
            "title": "vocabulary-based information extraction results",
            "description": "vocabulary-based information extraction annotation on ADS OASIS journal report full-text",
            "creator": "T4-1-2-IE-OASIS-reports-full-text.ipynb",
            #"periodo_authority_id": periodo_authority_id,
            "pipeline": nlp.pipe_names,
            "input_file_name": entry.name,
            "input_record_count": 1
        }

        # perform annotation on input text
        ts_nlp = DT.now()
        print("running nlp pipeline")
        doc = nlp(input_file_content.get("text",""))
        print(f"finished nlp pipeline in {DT.now() - ts_nlp}")

        # add calculated scores for spans
        sections = list(input_file_content.get("sections", []))
        scorer = SpanScorer(nlp, sections=sections)
        doc = scorer(doc)

        ts_sum = DT.now()
        print("summarizing results")
        summary = DocSummary(doc, metadata=metadata)
        print(f"finished summarizing results in {DT.now() - ts_sum}")

        # NOTE(review): metadata is updated after being handed to DocSummary;
        # presumably DocSummary keeps a reference to this dict - confirm.
        # NOTE(review): DT.now() is naive local time but the format appends 'Z'
        # (UTC designator) - confirm whether UTC is intended.
        ts_finished = DT.now()
        metadata["starting"] = ts_nlp.strftime('%Y-%m-%dT%H:%M:%SZ')
        metadata["finished"] = ts_finished.strftime('%Y-%m-%dT%H:%M:%SZ')
        metadata["duration"] =  ts_finished - ts_nlp

        # output file paths share one slug derived from the input file name
        # (TEXT and HTML report outputs are currently disabled)
        slug = slugify(entry.name)
        csv_file_name = os.path.join(output_directory, f"ner-output-{slug}.csv")
        json_file_name = os.path.join(output_directory, f"ner-output-{slug}.json")
        pdf_file_name = os.path.join(output_directory, f"ner-output-{slug}.pdf")

        ts_csv = DT.now()
        print("creating CSV file")
        # newline="" writes the CSV string's own line endings unchanged
        with open(csv_file_name, "w", encoding="utf-8", newline="") as file:
            file.write(summary.spans_to_csv())
        print(f"finished creating CSV report in {DT.now() - ts_csv}")

        # note last run took 21 mins for 2 files
        # 10/07/25 - now 1:10:31 for 10 files (JSON, TEXT and HTML output)
        # 19/11/25 - now 0:49:45 for 10 files (JSON, CSV and PDF output) (now omitting negation pairs)
        ts_pdf = DT.now()
        print("creating PDF report")
        report = summary.report(format="html")
        HTML(None, string=report, encoding="utf-8").write_pdf(target=pdf_file_name)
        print(f"finished creating PDF report in {DT.now() - ts_pdf}")

        ts_json = DT.now()
        print("creating JSON report")
        report = summary.report(format="json")
        with open(json_file_name, "w", encoding="utf-8") as file:
            file.write(report)
        print(f"finished creating JSON report in {DT.now() - ts_json}")

        # count this file only once all of its outputs have been written
        counter += 1

print(f"finished processing {counter} files in {DT.now() - ts_started}")

setting up nlp pipeline
finished setting up nlp pipeline in 0:01:50.979470
scanning input file directory
reading text_extraction_DAJ_v023_1901_040-047.pdf.json
running nlp pipeline
finished nlp pipeline in 0:00:14.143251
summarizing results
finished summarizing results in 0:00:00.000060
creating CSV file
finished creating CSV report in 0:00:00.014007
creating PDF report
"_get_dependency_pairs" ran in 0.798 seconds
"_get_noun_chunk_pairs" ran in 0.060 seconds
"_get_all_pairs" ran in 0.859 seconds
"get_span_pairs" ran in 0.859 seconds
"report_to_html" ran in 0.930 seconds
"report" ran in 0.930 seconds
finished creating PDF report in 0:00:02.743579
creating JSON report
"report_to_json" ran in 0.022 seconds
"report" ran in 0.022 seconds
finished creating JSON report in 0:00:00.022761
reading text_extraction_2022_96_013-068_Huxley.pdf.json
running nlp pipeline
finished nlp pipeline in 0:01:38.525130
summarizing results
finished summarizing results in 0:00:00.000274
creating CSV file
finishe