## Running ATRIUM NER pipeline on full text extracted from OASIS PDF reports

In [1]:
%%capture

import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets
%sx python -m spacy download en_core_web_sm

#from IPython.display import display, HTML
import spacy # for NER processing
from spacy.tokens import Doc # for NER results
from datetime import datetime as DT # for timestamps
import pandas as pd  # for DataFrame
import os
from IPython.display import display, HTML # for displaying HTML output in Python notebook
from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary

In [4]:
# read full report text 
def read_text_file(filename: str = "") -> str:
    """Return the full contents of a text file, or "" if it cannot be read.

    This is a best-effort read: a missing, unreadable or undecodable file does
    not raise; a message is printed and an empty string is returned instead.

    Args:
        filename: path of the text file to read.

    Returns:
        The file contents as a single string, or "" on failure.
    """
    try:
        # decode explicitly as UTF-8 so the result does not depend on the
        # platform's locale encoding
        # NOTE(review): assumes the extracted text files are UTF-8 - confirm
        with open(filename, "r", encoding="utf-8") as text_file:
            return text_file.read()
    except (OSError, ValueError) as err:
        # OSError: missing file, permissions, path is a directory, ...
        # ValueError: undecodable bytes (UnicodeDecodeError) or an invalid path
        print(f"couldn't read '{filename}': {err}")
        return ""


# normalize string whitespace
def normalize_whitespace(s: str = "") -> str:
    """Collapse each run of whitespace in *s* to one space and trim both ends."""
    # split() with no arguments discards leading/trailing whitespace and
    # treats any run of whitespace characters as a single separator
    words = s.split()
    return " ".join(words)


# identifier of the PeriodO authority to use (HE Periods list)
periodo_authority_id = "p0kh9ds"

# start from the predefined English spaCy pipeline, with its 'ner' component disabled
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# rematch2 custom NER components, appended to the end of the pipeline in the
# order listed; the dict holds any extra keyword arguments for add_pipe()
rematch2_components = [
    ("yearspan_ruler", {}),
    ("periodo_ruler", {"config": {"periodo_authority_id": periodo_authority_id}}),
    ("fish_archobjects_ruler", {}),
    ("fish_monument_types_ruler", {}),
    ("fish_supplementary_ruler", {}),
    ("negation_ruler", {}),
]
for component_name, extra_kwargs in rematch2_components:
    nlp.add_pipe(component_name, last=True, **extra_kwargs)

# read the full text of the report to be annotated, using the read_text_file()
# helper defined earlier in this notebook
input_file_name = "./data/text extraction - pdftotext/acarchae2-517986_214251.txt"
input_file_text = read_text_file(input_file_name)

# smaller sample text (about the Hartnoll Farm evaluation) for quick testing.
# NOTE: this is bound to input_file_text2, so as written it does NOT override
# input_file_text - assign it to input_file_text instead to annotate the sample
input_file_text2 = """
An archaeological trench evaluation was undertaken by AC archaeology during July
          2023 on land at Hartnoll Farm, Tiverton, Devon (centred on NGR SS 9898 1288). The
          evaluation comprised the machine excavation of 33 trenches totaling 1640m in length
          with each trench 1.8m wide. Trenches were positioned to target anomalies identified
          by a previous geophysical survey, as well as in what were thought to be blank areas.
          The site is located where previous investigations nearby had identified evidence for
          late prehistoric settlement, funerary and agricultural occupation. The main
          archaeological features identified during the present work were comparable to
          previous results and comprised two probable cremation pits representing potential
          evidence for an Early Bronze Age flat cemetery in the southwest part of the site, as
          well as part of a ring ditch of a probable ploughed-down former barrow to the southeast.
          Adjacent to this was a linear ditch likely to be part of a wider pattern of early field
          division. Elsewhere across the site mainly former ditches were present, with the
          majority of these of post medieval/modern date and related to agricultural field division
          and drainage.
"""

# imported here so the 'created' timestamp below can be generated in UTC
from datetime import timezone

# descriptive metadata handed to DocSummary together with the annotated document
metadata = {
    "identifier": "acarchae2-517986_214251.txt",
    "title": "LAND AT HARTNOLL FARM, TIVERTON, DEVON",
    "description": "vocabulary-based NER annotation",
    "creator": "T4-1-2-NER-OASIS-reports-full-text.ipynb",
    # the trailing 'Z' designates UTC, so the time must be taken in UTC
    # (a naive local DT.now() would be mislabelled as UTC)
    "created": DT.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
    "periodo_authority_id": periodo_authority_id,
    "input_record_source": "acarchae2-517986_214251.txt",
    "input_record_count": 1
}


# normalise white space prior to annotation
# (extra spaces frustrate pattern matching)
clean_file_text = normalize_whitespace(input_file_text)

# perform annotation on cleaned text
doc = nlp(clean_file_text)

# summarise the annotated document together with its metadata
summary = DocSummary(doc, metadata=metadata)

# optional in-notebook views of the results (uncomment as needed)
#display(HTML(summary.doctext(format="html")))
#print(summary.spancounts(format="text"))
#print(summary.report(format="json"))
#display(HTML(summary.report(format="html")))

# note - not completed in 64 mins, possibly running something multiple times
# as display above and JSON finished earlier - maybe cache pairs not calc for each report?
#with open(input_file_name + "-ner-output.json", "w", encoding="utf-8") as output:
    #output.write(summary.report(format="json"))

# write the HTML report alongside the input file; the encoding is explicit so
# that non-ASCII characters in the report are written identically on every
# platform instead of depending on the locale's default encoding
with open(input_file_name + "-ner-output.html", "w", encoding="utf-8") as output:
    output.write(summary.report(format="html"))

#with open(input_file_name + "-ner-output.txt", "w", encoding="utf-8") as output:
    #output.write(summary.report(format="text"))


