## Running ATRIUM NER pipeline on full text extracted from OASIS PDF reports

In [1]:
%%capture

import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets
%sx python -m spacy download en_core_web_sm

In [2]:
import spacy # for NER processing
#from spacy.tokens import Doc # for NER results
from datetime import datetime as DT # for timestamps
import json, os
from slugify import slugify # for valid filenames from identifiers
from weasyprint import HTML
#from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary, TextNormalizer
from rematch2 import DocSummary
from rematch2.spacypatterns import patterns_en_ATTRIBUTE_RULES # rules to override POS tags in some cases

# reading supplementary lists from JSON files
def read_json(file_name):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_name: path of the JSON file to read.

    Returns:
        The deserialized JSON content, or an empty list if the file could
        not be opened or parsed (the problem is printed, not raised).
    """
    data = []
    try:
        # JSON text is UTF-8 by specification, so do not depend on the
        # platform's default text encoding when opening the file
        with open(file_name, "r", encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        # best-effort: report the problem and fall back to the empty list
        print(f"Problem reading \"{file_name}\": {e}")
    return data

# --- base pipeline ------------------------------------------------------------
# start from the predefined English spaCy model, loaded with its 'ner'
# component disabled
print("loading spacy language model")
nlp = spacy.load("en_core_web_sm", disable=["ner"])
print("..done")

# Custom rules that override the default POS tagging for specific cases.
# NOTE: adding the rules to the existing attribute_ruler component, i.e.
#   nlp.get_pipe("attribute_ruler").add_patterns(patterns_en_ATTRIBUTE_RULES)
# didn't seem to work, so a second attribute_ruler is inserted directly after
# the existing one and the rules are added to that one instead.
print("adding attribute ruler")
ar = nlp.add_pipe(
    "attribute_ruler",
    name="custom_attribute_ruler",
    after="attribute_ruler",
)
ar.add_patterns(patterns_en_ATTRIBUTE_RULES)
print("..done")

# --- vocabulary configuration ---------------------------------------------------
# Perio.do authority in use: "Historic England Archaeological and Cultural Periods"
periodo_authority_id = "p0kh9ds"

print("reading supplementary lists and stopword lists")
# supplementary lists: concepts we want to appear in the results
# (or alternate terms for existing concepts)
supp_list_obj, supp_list_mon, supp_list_per = map(read_json, (
    "./supp_list_en_FISH_ARCHOBJECTS.json",
    "./supp_list_en_FISH_MONUMENTS.json",
    "./supp_list_en_FISH_PERIODS.json",
))
# stop lists: existing vocabulary concepts we don't want to appear in the
# results (even if they are legitimate matches)
stop_list_obj, stop_list_mon = map(read_json, (
    "./stop_list_en_FISH_ARCHOBJECTS.json",
    "./stop_list_en_FISH_MONUMENTS.json",
))
print("..done")

# --- rematch2 components --------------------------------------------------------
# text normalization is placed ahead of the tagger ...
nlp.add_pipe("normalize_text", before="tagger")

# ... the remaining components are appended to the end of the pipeline in the
# order listed; the second tuple item holds any extra add_pipe() arguments
ruler_settings = (
    ("yearspan_ruler", {}),
    ("periodo_ruler", {"config": {"periodo_authority_id": periodo_authority_id, "supp_list": supp_list_per}}),
    ("fish_archobjects_ruler", {"config": {"supp_list": supp_list_obj, "stop_list": stop_list_obj}}),
    ("fish_monument_types_ruler", {"config": {"supp_list": supp_list_mon, "stop_list": stop_list_mon}}),
    # ("negation_ruler", {}),  # currently not added to the pipeline
    ("child_span_remover", {}),
)
for component_name, extra_kwargs in ruler_settings:
    nlp.add_pipe(component_name, last=True, **extra_kwargs)

# directory holding the extracted full-text input files; the NER output is
# written to a dated sub-directory of it (output_directory, set up below)
input_directory = "./data/oasis/journals_july_2024/text extraction - new"

# subset of files to process
# (the processing loop compares these against the lower-cased directory entry
# name, so the names listed here must be lower case to match)
file_names = [
    "120_031_097_new.txt",
    "2022_96_001_012_cooper_garton_new.txt",
    "2022_96_013-068_huxley_new.txt",
    "archael547-005-040-breeze_new.txt",
    "archael547-079-116-ceolwulf_new.txt",
    "daj_v023_1901_040-047_new.txt",
    "daj_v086_1966_031-053_new.txt",
    "nas_20_1985_67-86_jackson_new.txt",
    "nas_20_1985_87-112_taylor_new.txt",
    "surreyac103_063-090_lambert_new.txt"
]

# date stamp (YYYYMMDD) for use in directory names
timestamp = DT.now().strftime('%Y%m%d')

# create the output directory if it does not already exist; exist_ok=True
# replaces the separate existence check (and the gap between check and create)
output_directory = os.path.join(input_directory, f"ner-output-{timestamp}")
os.makedirs(output_directory, exist_ok=True)

# Main loop: run the NER pipeline over each selected input file and write a
# PDF report and a JSON report per file into output_directory.
from datetime import timezone  # local import; only needed for the UTC metadata timestamps

counter = 0
print("scanning input file directory")
for entry in os.scandir(input_directory):

    # only regular files from the selected subset are processed
    #if entry.is_file() and entry.name.endswith(".txt"):
    if entry.is_file() and entry.name.lower() in file_names:
        counter += 1
        # temp break for testing
        #if counter > 3:
            #break

        ts_started = DT.now()

        # print progress indicator
        input_file_name = entry.name
        print(f"processing entry {counter} - '{input_file_name}'")

        # read text contents of input file
        # NOTE(review): assumes the extracted text files are UTF-8 encoded -
        # confirm; the explicit encoding makes the read independent of the
        # platform's default encoding
        print(f"reading {input_file_name}")
        with open(entry.path, encoding="utf-8") as input_file:
            input_file_text = input_file.read()
        # the elapsed value is a timedelta (H:MM:SS.ffffff), not milliseconds
        print(f"read '{input_file_name}' in {DT.now() - ts_started}")

        # set up metadata to include in output
        metadata = {
            "identifier": input_file_name,
            "title": "vocabulary-based NER results",
            "description": "vocabulary-based NER annotation on report full-text",
            "creator": "T4-1-2-NER-OASIS-reports-full-text.ipynb",
            "periodo_authority_id": periodo_authority_id,
            "ner_pipeline": nlp.pipe_names,
            "input_file_name": input_file_name,
            "input_record_count": 1
        }

        # perform annotation on input text
        ts_nlp = DT.now()
        print("running nlp pipeline")
        doc = nlp(input_file_text)
        print(f"finished nlp pipeline in {DT.now() - ts_nlp}")

        ts_sum = DT.now()
        print("summarizing results")
        summary = DocSummary(doc, metadata=metadata)
        print(f"finished summarizing results in {DT.now() - ts_sum}")

        ts_finished = DT.now()
        # DT.now() yields naive local time, so convert to UTC before formatting
        # with the literal 'Z' (UTC) suffix
        # NOTE(review): these keys are added after DocSummary was constructed;
        # they only reach the reports if DocSummary keeps a reference to this
        # dict rather than a copy - confirm
        metadata["starting"] = ts_started.astimezone(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        metadata["finished"] = ts_finished.astimezone(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        # NOTE(review): a timedelta is not serializable by the standard json
        # module without a custom default - confirm how the JSON report handles it
        metadata["duration"] = ts_finished - ts_started

        # output file paths (file names derived from the input file name)
        #html_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.html")
        #text_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.txt")
        json_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.json")
        pdf_file_name = os.path.join(output_directory, f"ner-output-{slugify(input_file_name)}.pdf")

        # note last run took 21 mins for 2 files
        # 10/07/25 - now 1:10:31 for 10 files (JSON, TEXT and HTML output)
        # each report step gets its own timer so ts_started keeps meaning
        # "start of processing for this file"
        ts_pdf = DT.now()
        print("creating PDF report")
        report = summary.report(format="html")
        HTML(None, string=report, encoding="utf-8").write_pdf(target=pdf_file_name)
        print(f"finished creating PDF report in {DT.now() - ts_pdf}")

        #ts_text = DT.now()
        #print("creating TEXT report")
        #report = summary.report(format="text")
        #with open(text_file_name, "w", encoding="utf-8") as file:
            #file.write(report)
        #print(f"finished creating TEXT report in {DT.now() - ts_text}")

        ts_json = DT.now()
        print("creating JSON report")
        report = summary.report(format="json")
        # explicit encoding so the written file does not depend on the
        # platform's default encoding
        with open(json_file_name, "w", encoding="utf-8") as file:
            file.write(report)
        print(f"finished creating JSON report in {DT.now() - ts_json}")

# make an empty run visible instead of finishing silently
if counter == 0:
    print(f"no matching input files found in '{input_directory}'")

loading spacy language model
..done
adding attribute ruler
..done
reading supplementary lists and stopword lists
..done
Create VocabularyRuler - stop list:
[]
Stop IDs: []
Create VocabularyRuler - stop list:
[{'id': 'http://purl.org/heritagedata/schemes/mda_obj/concepts/139085', 'term': 'Coin (Contemporary Imitation)'}, {'id': 'http://purl.org/heritagedata/schemes/mda_obj/concepts/139087', 'term': 'Coin (Modern Forgery)'}, {'id': 'http://purl.org/heritagedata/schemes/mda_obj/concepts/139086', 'term': 'Coin (Modern Imitation)'}, {'id': 'http://purl.org/heritagedata/schemes/mda_obj/concepts/95353', 'term': 'level'}, {'id': 'http://purl.org/heritagedata/schemes/mda_obj/concepts/95306', 'term': 'scale'}, {'id': 'http://purl.org/heritagedata/schemes/mda_obj/concepts/96615', 'term': 'shift'}, {'id': 'http://purl.org/heritagedata/schemes/mda_obj/concepts/96379', 'term': 'point'}, {'id': 'http://purl.org/heritagedata/schemes/mda_obj/concepts/143243', 'term': 'setting'}, {'id': 'http://purl.org