## Running the ATRIUM information extraction pipeline on full text extracted from OASIS PDF reports

In [1]:
%%capture
import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)
# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets

%sx python -m spacy download en_core_web_sm

In [None]:
from datetime import datetime as DT # for timestamps
import json, os
from slugify import slugify # for valid filenames from identifiers
from weasyprint import HTML
#from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary, TextNormalizer
from rematch2 import DocSummary
from rematch2.SpanScorer import SpanScorer
#from tides_dataclasses import Section

from ATRIUM_T4_1_2_IE_pipeline import get_pipeline, get_records_from_csv_file


# Set up the spaCy pipeline (English), the input/output locations, the
# title/abstract metadata records and the subset of input files to process.
ts_started = DT.now()
print("setting up nlp pipeline")
nlp = get_pipeline("en")
print(f"finished setting up nlp pipeline in {DT.now() - ts_started}")

# input directory containing text files to process
#input_directory = "./data/oasis/journals_july_2024/text extraction - new" # Mugdha's output
input_directory = "./data/oasis/journals_july_2024/text_extraction_20251117" # Mark's script re-extracted text 2025-11-17

# read separate CSV file containing titles and abstracts for (some of) the files we will process
print("extracting metadata records from CSV file")
metadata_file_path = os.path.join(input_directory, "journal_metadata.csv")
metadata_records = get_records_from_csv_file(file_path=metadata_file_path)
print(f"Total metadata records extracted: {len(metadata_records)}")
# only show a sample record when there is one (avoids IndexError on an empty CSV)
if len(metadata_records) > 0:
    print(metadata_records[0])  # print first record for inspection

# subset of files to process (un-comment an entry to include that file)
file_names_subset = [
    #"text_extraction_120_031_097.pdf.json",
    #"text_extraction_2022_96_013-068_Huxley.pdf.json",
    #"text_extraction_078_233_250.pdf.json",
    #"text_extraction_120_215_235.pdf.json",
    #"text_extraction_2022_96_001_012_Cooper_Garton.pdf.json",
    #"text_extraction_2022_96_079-094_Browning_et_al.pdf.json",
    #"text_extraction_archael522-067-077-whitworth.pdf.json",
    #"text_extraction_archael547-005-040-breeze.pdf.json",
    #"text_extraction_archael547-079-116-ceolwulf.pdf.json",
    #"text_extraction_DAJ_v023_1901_040-047.pdf.json",
    #"text_extraction_DAJ_v086_1966_093-098.pdf.json",
    #"text_extraction_DAJ_v106_1986_005-017.pdf.json",
    #"text_extraction_DAJ_v106_1986_018-100.pdf.json",
    #"text_extraction_NAS_20_1985_113-138_Shaw.pdf.json",
    #"text_extraction_SAC118_Bedwin.pdf.json",
    #"text_extraction_SAC118_Garton.pdf.json",
    "text_extraction_SAC118_Stevens.pdf.json",
    "text_extraction_surreyac103_091-172_haslam.pdf.json",
    #"text_extraction_surreyac103_185-266_saxby.pdf.json",
    #"text_extraction_surreyac103_297-305_english.pdf.json",
]

# timestamp (date the run started) for use in directory names
timestamp = ts_started.strftime('%Y%m%d')

# create the output directory; exist_ok avoids the check-then-create race
# and makes re-running the cell on the same day safe
output_directory = os.path.join(input_directory, f"ie-output-{timestamp}")
os.makedirs(output_directory, exist_ok=True)

# Process each selected input file: merge in title/abstract metadata, run the
# NLP pipeline, score the spans and write CSV, PDF and JSON reports.

# lower-cased names of the files to process, built once so the per-entry
# membership test is a cheap, case-insensitive set lookup
file_names_wanted = {name.lower() for name in file_names_subset}

# separator placed between title and abstract in the prepended text; the
# abstract section offsets below are derived from it so they stay in sync
title_abstract_separator = "\n"

counter = 0
print("scanning input file directory")
for entry in os.scandir(input_directory):

    if not entry.is_file(): continue
    entry_name_lower = entry.name.lower()
    if entry_name_lower not in file_names_wanted: continue
    # temp break for testing
    #if counter >= 2: break
    counter += 1

    # read contents of input (JSON) file
    print("--------------------------------")
    print(f"reading {entry.name}")
    input_file_content = {}
    # explicit UTF-8 so non-ASCII characters are read the same on every platform
    with open(entry.path, encoding="utf-8") as input_file:
        if entry.name.endswith(".json"):
            input_file_content = json.load(input_file)
        else:
            input_file_content = { "text": input_file.read() }

    # find matching metadata record (if it exists) for this file
    # the metadata filename is the original PDF file;
    # the input file is prefixed "text_extraction_" and suffixed ".json"
    # so just find an entry that contains the metadata filename.
    # Records with an empty/missing "file" are skipped: an empty string is
    # contained in every name and would otherwise match any input file.
    metadata_record = next(
        (
            record for record in metadata_records
            if record.get("file") and record["file"].lower() in entry_name_lower
        ),
        None
    )

    # replace sections in input file content with title and abstract from metadata record (if available)
    if metadata_record is not None:
        title_text = metadata_record.get("title", "").strip()
        abstract_text = metadata_record.get("abstract", "").strip()

        # remove any existing title or abstract sections from input file content
        sections = input_file_content.get("sections", [])
        sections = [sec for sec in sections if sec.get("type", "") not in ["title", "abstract"]]

        # create new title and abstract sections; the abstract starts directly
        # after the title plus the separator actually used in prepend_text
        sec_title = {
            "type": "title",
            "text": title_text,
            "start": 0,
            "end": len(title_text)
        }
        abstract_start = sec_title["end"] + len(title_abstract_separator)
        sec_abstract = {
            "type": "abstract",
            "text": abstract_text,
            "start": abstract_start,
            "end": abstract_start + len(abstract_text)
        }

        # prepend title and abstract to existing text
        prepend_text = f"{title_text}{title_abstract_separator}{abstract_text}\n\n"
        prepend_size = len(prepend_text)
        input_file_content["text"] = f"{prepend_text}{input_file_content.get('text', '')}"

        # adjust existing sections' start and end positions accordingly
        for sec in sections:
            sec["start"] += prepend_size
            sec["end"] += prepend_size

        # add new title and abstract sections
        sections = [sec_title, sec_abstract] + sections
        input_file_content["sections"] = sections

        # also add or replace title and abstract as top-level fields
        if title_text != "":
            input_file_content["title"] = title_text
        if abstract_text != "":
            input_file_content["abstract"] = abstract_text
    else:
        # no corresponding metadata record - so just use first page text to represent the abstract
        pages = [sec for sec in input_file_content.get("sections", []) if sec.get("type", "") == "page"]
        if pages:
            first_page = min(pages, key=lambda p: p['start'])
            first_page_start = first_page.get("start", 0)
            first_page_end = first_page.get("end", 0)
            first_page_text = input_file_content.get("text", "")[first_page_start:first_page_end]
            input_file_content["abstract"] = first_page_text
            input_file_content["sections"].append({
                "type": "abstract",
                "text": first_page_text,
                "start": first_page_start,
                "end": first_page_end
            })
        # TODO - write modified sections etc back to input file?
    print(f"processing file {counter}: {entry.name}")

    # set up metadata to include in output
    metadata = {
        "identifier": entry.name,
        "title": "vocabulary-based information extraction results",
        "description": "vocabulary-based information extraction annotation on ADS OASIS journal report full-text",
        "creator": "T4_1_2_IE_OASIS_journal_reports.ipynb",
        #"periodo_authority_id": periodo_authority_id,
        "pipeline": nlp.pipe_names,
        "input_file_name": entry.name,
        "input_record_count": 1
    }

    # perform annotation on input text
    ts_nlp = DT.now()
    print("running nlp pipeline")
    doc = nlp(input_file_content.get("text",""))
    print(f"finished nlp pipeline in {DT.now() - ts_nlp}")

    # add calculated scores to spans
    sections = list(input_file_content.get("sections", []))
    scorer = SpanScorer(nlp, sections=sections)
    doc = scorer(doc)

    ts_sum = DT.now()
    print("summarizing results")
    summary = DocSummary(doc, metadata=metadata)
    print(f"finished summarizing results in {DT.now() - ts_sum}")

    # timing fields are added after DocSummary was created; they only reach the
    # reports if DocSummary keeps a reference to this dict - TODO confirm
    # NOTE(review): DT.now() is local time but the format appends "Z" (UTC) - confirm intent
    # NOTE(review): "duration" is a timedelta, which json.dumps cannot serialize
    # if it ends up inside the JSON report - confirm how DocSummary handles it
    ts_finished = DT.now()
    metadata["starting"] = ts_nlp.strftime('%Y-%m-%dT%H:%M:%SZ')
    metadata["finished"] = ts_finished.strftime('%Y-%m-%dT%H:%M:%SZ')
    metadata["duration"] =  ts_finished - ts_nlp

    # output file paths, named after the (slugified) input file name
    # html_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.html")
    output_slug = slugify(entry.name)
    text_file_name = os.path.join(output_directory, f"ner-output-{output_slug}.txt")
    csv_file_name = os.path.join(output_directory, f"ner-output-{output_slug}.csv")
    json_file_name = os.path.join(output_directory, f"ner-output-{output_slug}.json")
    pdf_file_name = os.path.join(output_directory, f"ner-output-{output_slug}.pdf")

    ts_csv = DT.now()
    print("creating CSV file")
    # explicit UTF-8 so spans containing non-ASCII characters can always be written
    with open(csv_file_name, "w", encoding="utf-8") as file:
        file.write(summary.spans_to_csv())
    print(f"finished creating CSV report in {DT.now() - ts_csv}")

    # note early run took 21 mins for 2 files
    # 10/07/2025 - 1:10:31 for 10 files (JSON, TEXT and HTML output)
    # 19/11/2025 - 0:49:45 for 10 files (JSON, CSV and PDF output) (now omitting negation pairs)
    # 02/02/2026 - 0:41:15 for 18 files (JSON, CSV and PDF output), remaining 2 in 0:15:44
    ts_pdf = DT.now()
    print("creating PDF report")
    report = summary.report(format="html")
    HTML(None, string=report, encoding="utf-8").write_pdf(target=pdf_file_name)
    print(f"finished creating PDF report in {DT.now() - ts_pdf}")

    #ts_started = DT.now()
    #print("creating TEXT report")
    #report = summary.report(format="text")
    #with open(text_file_name, "w") as file:
        #file.write(report)
    #print(f"finished creating TEXT report in {DT.now() - ts_started}")

    ts_json = DT.now()
    print("creating JSON report")
    report = summary.report(format="json")

    with open(json_file_name, "w", encoding="utf-8") as file:
        # convert to JSON string
        # NOTE(review): if report(format="json") already returns a string this
        # double-encodes it - confirm the return type
        json_string = json.dumps(report, indent=4)
        file.write(json_string)
    print(f"finished creating JSON report in {DT.now() - ts_json}")

print(f"finished processing {counter} files in {DT.now() - ts_started}")

setting up nlp pipeline
finished setting up nlp pipeline in 0:01:45.593784
extracting metadata records from CSV file
Total metadata records extracted: 75
{'title': 'King Ceolwulf’s land grants to St Cuthbert and their loss in the ninth century', 'file': 'archael547-079-116-ceolwulf.pdf', 'object_id': 3226527, 'journal': 'Archaeologia Aeliana', 'volume': 'Volume 47', 'pagees': 'Pages 79:116', 'abstract': 'The Historia de Sancto Cuthberto makes retrospective claims to lands granted to St Cuthbert and, in some cases, later taken from the church. Sections 8 and 11 refer to lands granted by King Ceolwulf (729–737) and lost around 860. Retrogressive technique is used to reconstruct the geography of these lands, within the widely accepted models of shire and minster organisation, drawing on post-Conquest feudal records. The circumstances under which these and other estates were taken from the church in the 9th century are discussed. It is argued that Lindisfarne’s territorial reach in Northum