## Running ATRIUM information extraction pipeline on full text extracted from OASIS PDF reports

In [1]:
%%capture
import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets

%sx python -m spacy download en_core_web_sm

In [2]:
# standard library
from datetime import datetime as DT # for timestamps
from datetime import timezone # for converting timestamps to UTC
import json, os

# third-party
from slugify import slugify # for valid filenames from identifiers
from weasyprint import HTML
#from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary, TextNormalizer
from rematch2 import DocSummary
from rematch2.SpanScorer import SpanScorer
#from tides_dataclasses import Section

# project-local
from ATRIUM_T4_1_2_IE_pipeline import get_pipeline, read_csv_file

# ---------------------------------------------------------------------------
# One-off setup: build the NLP pipeline, locate the input files and their
# metadata, choose which files to process and prepare the output directory.
# ---------------------------------------------------------------------------

# using predefined spaCy pipeline (English); ts_started also marks the start
# of the whole run and is used for the output directory name below
ts_started = DT.now()
print("setting up nlp pipeline")
nlp = get_pipeline("en")
print(f"finished setting up nlp pipeline in {DT.now() - ts_started}")

# input directory containing text files to process
#input_directory = "./data/oasis/journals_july_2024/text extraction - new" # Mugdha's output
input_directory = "./data/oasis/journals_july_2024/text_extraction_20251117" # Mark's script re-extracted text 2025-11-17

# read separate CSV file containing titles and abstracts for (some of) the files we will process
print("extracting metadata records from CSV file")
metadata_file_path = os.path.join(input_directory, "journal_metadata.csv")
metadata_records = read_csv_file(file_path=metadata_file_path)
print(f"Total metadata records extracted: {len(metadata_records)}")
#print(metadata_records[0])  # print first record for inspection

# subset of files to process (compared case-insensitively against the
# names found in input_directory)
file_names_subset = [
    "text_extraction_120_031_097.pdf.json",
    "text_extraction_2022_96_013-068_Huxley.pdf.json",
    "text_extraction_078_233_250.pdf.json",
    "text_extraction_120_215_235.pdf.json",
    "text_extraction_2022_96_001_012_Cooper_Garton.pdf.json",
    "text_extraction_2022_96_079-094_Browning_et_al.pdf.json",
    "text_extraction_archael522-067-077-whitworth.pdf.json",
    "text_extraction_archael547-005-040-breeze.pdf.json",
    "text_extraction_archael547-079-116-ceolwulf.pdf.json",
    "text_extraction_DAJ_v023_1901_040-047.pdf.json",
    "text_extraction_DAJ_v086_1966_093-098.pdf.json",
    "text_extraction_DAJ_v106_1986_005-017.pdf.json",
    "text_extraction_DAJ_v106_1986_018-100.pdf.json",
    "text_extraction_NAS_20_1985_113-138_Shaw.pdf.json",
    "text_extraction_SAC118_Bedwin.pdf.json",
    "text_extraction_SAC118_Garton.pdf.json",
    "text_extraction_SAC118_Stevens.pdf.json",
    "text_extraction_surreyac103_091-172_haslam.pdf.json",
    "text_extraction_surreyac103_185-266_saxby.pdf.json",
    "text_extraction_surreyac103_297-305_english.pdf.json",
]

# timestamp (date only) for use in directory names
timestamp = ts_started.strftime('%Y%m%d')

# create the output directory; exist_ok makes this a no-op on re-runs and
# avoids the check-then-create race of a separate os.path.exists() test
output_directory = os.path.join(input_directory, f"ie-output-{timestamp}")
os.makedirs(output_directory, exist_ok=True)

def _find_metadata_record(records, input_file_name):
    """Return the first metadata record whose "file" value occurs in input_file_name.

    The metadata "file" value is the original PDF file name, while the input
    file is prefixed "text_extraction_" and suffixed ".json", so a
    case-insensitive substring match is used.

    Records with a missing or blank "file" value are skipped: an empty string
    is a substring of every name and would otherwise match every input file.
    Surrounding whitespace in the "file" value is ignored.

    Returns an empty dict when nothing matches.
    """
    target = input_file_name.lower()
    for record in records:
        pdf_name = (record.get("file") or "").strip().lower()
        if pdf_name and pdf_name in target:
            return record
    return {}


def _prepend_title_and_abstract(content, title_text, abstract_text):
    """Prepend title and abstract to content["text"] and rebuild content["sections"].

    The new text is "<title>\\n<abstract>\\n<original text>". Any existing
    "title"/"abstract" sections are dropped, the remaining sections are
    shifted by the length of the prepended text, and fresh title and abstract
    sections (created even when empty) are put first. The start/end values
    written for title and abstract are character offsets into the new text,
    with "end" pointing at the last character of a non-empty title/abstract.

    Non-empty title/abstract are also stored as top-level "title"/"abstract"
    fields. The dict (and its retained section dicts) is modified in place
    and returned.
    """
    # remove any existing title or abstract sections from input file content
    body_sections = [
        sec for sec in content.get("sections", [])
        if sec.get("type", "") not in ("title", "abstract")
    ]

    # create new title section (even if empty)
    title_start = 0
    title_end = title_start + (len(title_text) - 1 if title_text != "" else 0)
    sec_title = {
        "type": "title",
        "start": title_start,
        "end": title_end
    }

    # create new abstract section (even if empty). The newline separating
    # title and abstract is always written (see prepend_text below), so the
    # abstract starts one character after the title even if the title is empty.
    abstract_start = len(title_text) + 1
    abstract_end = abstract_start + (len(abstract_text) - 1 if abstract_text != "" else 0)
    sec_abstract = {
        "type": "abstract",
        "start": abstract_start,
        "end": abstract_end
    }

    # prepend title and abstract to the text
    prepend_text = f"{title_text}\n{abstract_text}"
    content["text"] = f"{prepend_text}\n{content.get('text', '')}"

    # adjust existing sections' start and end positions accordingly
    # (+1 accounts for the newline written after the abstract)
    offset = len(prepend_text) + 1
    for sec in body_sections:
        sec["start"] += offset
        sec["end"] += offset

    # add new title and abstract sections in front of the shifted ones
    content["sections"] = [sec_title, sec_abstract] + body_sections

    # also add or replace title and abstract as top-level fields
    if title_text != "":
        content["title"] = title_text
    if abstract_text != "":
        content["abstract"] = abstract_text
    return content


# lower-case the subset once (as a set) instead of rebuilding a list per file
wanted_file_names = {name.lower() for name in file_names_subset}

counter = 0
print("scanning input file directory")
for entry in os.scandir(input_directory):

    if not entry.is_file(): continue
    if entry.name.lower() not in wanted_file_names: continue
    # temp break for testing
    #if counter >= 2:
        #print(f"reached test limit of {counter} files - stopping")
        #break
    counter += 1

    # read contents of input file: JSON is parsed, anything else is treated
    # as plain text. UTF-8 is requested explicitly so the result does not
    # depend on the platform's default encoding.
    print(f"--------------------------------")
    print(f"reading {entry.name}")
    input_file_content = {}
    with open(entry.path, encoding="utf-8") as input_file:
        if entry.name.endswith(".json"):
            input_file_content = json.load(input_file)
        else:
            input_file_content = { "text": input_file.read() }

    # find matching metadata record (if it exists) for this file
    metadata_record = _find_metadata_record(metadata_records, entry.name)

    # get title and abstract from metadata record (if present);
    # "or ''" guards against a value that is present but None
    title_text = (metadata_record.get("title") or "").strip()
    abstract_text = (metadata_record.get("abstract") or "").strip()

    # Disabled alternative: when there is no abstract, use the first page
    # text to represent the abstract
        #pages = list(filter(lambda sec: sec.get("type", "") == "page", input_file_content.get("sections", [])))
        #if pages:
            #first_page = min(pages, key=lambda p: p['start'])
            #first_page_start = first_page.get("start", 0)
            #first_page_end = first_page.get("end", 0)
            #first_page_text = input_file_content.get("text", "")[first_page_start:first_page_end]
            #input_file_content["abstract"] = first_page_text
            #sec_abstract = {
                #"type": "abstract",
                #"text": first_page_text,
                #"start": first_page_start,
                #"end": first_page_end
            #}

    # prepend title and abstract to the text and rebuild the section list
    input_file_content = _prepend_title_and_abstract(input_file_content, title_text, abstract_text)
    # TODO - write modified sections etc back to original input file?

    print(f"processing file {counter}: {entry.name}")

    # set up metadata to include in output
    metadata = {
        "identifier": entry.name,
        "title": "vocabulary-based information extraction results",
        "description": "vocabulary-based information extraction annotation on ADS OASIS journal report full-text",
        "creator": "T4_1_2_IE_OASIS_journal_reports.ipynb",
        #"periodo_authority_id": periodo_authority_id,
        "pipeline": nlp.pipe_names,
        "input_file_name": entry.name,
        "input_record_count": 1
    }

    # perform annotation on input text
    ts_nlp = DT.now()
    print("running nlp pipeline")
    doc = nlp(input_file_content.get("text",""))
    print(f"finished nlp pipeline in {DT.now() - ts_nlp}")

    # add calculated scores to spans
    sections = list(input_file_content.get("sections", []))
    scorer = SpanScorer(nlp, sections=sections)
    doc = scorer(doc)

    ts_sum = DT.now()
    print("summarizing results")
    summary = DocSummary(doc, metadata=metadata)
    print(f"finished summarizing results in {DT.now() - ts_sum}")

    # The trailing "Z" denotes UTC, so convert the local timestamps to UTC
    # before formatting (astimezone treats a naive datetime as local time).
    # NOTE(review): these keys are added after DocSummary was constructed, so
    # they only reach the reports if DocSummary keeps a reference to this
    # dict rather than a copy - confirm.
    ts_finished = DT.now()
    metadata["starting"] = ts_nlp.astimezone(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    metadata["finished"] = ts_finished.astimezone(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
    metadata["duration"] =  ts_finished - ts_nlp

    # output file names share one slugified stem per input file
    output_stem = os.path.join(output_directory, f"ie-output-{slugify(entry.name)}")
    # html_file_name = f"{output_stem}.html"
    text_file_name = f"{output_stem}.txt"
    csv_file_name = f"{output_stem}.csv"
    json_file_name = f"{output_stem}.json"
    pdf_file_name = f"{output_stem}.pdf"

    ts_csv = DT.now()
    print("creating CSV file")
    with open(csv_file_name, "w", encoding="utf-8") as file:
        file.write(summary.spans_to_csv())
    print(f"finished creating CSV report in {DT.now() - ts_csv}")

    # note early run took 21 mins for 2 files
    # 10/07/2025 - 1:10:31 for 10 files (JSON, TXT and HTML output)
    # 19/11/2025 - 0:49:45 for 10 files (JSON, CSV and PDF output) (omitting negation pairs)
    # 02/02/2026 - 0:41:15 for 18 files (JSON, CSV and PDF output); remaining 2 in 0:15:44
    # 04/02/2026 - 0:00:00 for 20 files (JSON, CSV and PDF output)
    ts_pdf = DT.now()
    print("creating PDF report")
    report = summary.report(format="html")
    HTML(None, string=report, encoding="utf-8").write_pdf(target=pdf_file_name)
    print(f"finished creating PDF report in {DT.now() - ts_pdf}")

    # TEXT report (currently disabled)
    #ts_started = DT.now()
    #print("creating TEXT report")
    #report = summary.report(format="text")
    #with open(text_file_name, "w") as file:
        #file.write(report)
    #print(f"finished creating TEXT report in {DT.now() - ts_started}")

    ts_json = DT.now()
    print("creating JSON report")
    report = summary.report(format="json")
    report["sections"] = input_file_content.get("sections", [])  # include sections in output JSON for score diagnostics

    with open(json_file_name, "w", encoding="utf-8") as file:
        # convert to JSON string first for pretty printing; default=str
        # stringifies values json cannot serialise itself
        json_string = json.dumps(report, indent=4, default=str)
        file.write(json_string)
    print(f"finished creating JSON report in {DT.now() - ts_json}")

print(f"finished processing {counter} files in {DT.now() - ts_started}")

setting up nlp pipeline
finished setting up nlp pipeline in 0:01:45.686281
extracting metadata records from CSV file
Total metadata records extracted: 75
scanning input file directory
--------------------------------
reading text_extraction_surreyac103_091-172_haslam.pdf.json
processing file 1: text_extraction_surreyac103_091-172_haslam.pdf.json
running nlp pipeline
finished nlp pipeline in 0:04:38.276556
summarizing results
finished summarizing results in 0:00:00.000121
creating CSV file
finished creating CSV report in 0:00:00.197467
creating PDF report
"_get_dependency_pairs" ran in 367.597 seconds
"_get_noun_chunk_pairs" ran in 41.805 seconds
"_get_all_pairs" ran in 409.403 seconds
"get_span_pairs" ran in 409.403 seconds
"report_to_html" ran in 427.226 seconds
"report" ran in 427.226 seconds
finished creating PDF report in 0:07:30.628543
creating JSON report
"report_to_json" ran in 0.446 seconds
"report" ran in 0.446 seconds
finished creating JSON report in 0:00:00.774209
----------