## Running ATRIUM NER pipeline on full text extracted from OASIS PDF reports

In [4]:
%%capture

import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets
%sx python -m spacy download en_core_web_sm

import spacy # for NER processing
from spacy.tokens import Doc # for NER results
from datetime import datetime as DT # for timestamps
import os
from slugify import slugify # for valid filenames from identifiers
from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary, StringCleaning

In [7]:
# Load the predefined English spaCy pipeline with its built-in "ner"
# component disabled; the rematch2 components added below do the NER.
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# identifier of the PeriodO authority to use (HE Periods list)
periodo_authority_id = "p0kh9ds"

# rematch2 NER components in the order they are appended to the pipeline,
# each paired with any extra keyword arguments its add_pipe() call needs
ner_components = (
    ("yearspan_ruler", {}),
    ("periodo_ruler", {"config": {"periodo_authority_id": periodo_authority_id}}),
    ("fish_archobjects_ruler", {}),
    ("fish_monument_types_ruler", {}),
    ("fish_supplementary_ruler", {}),
    ("negation_ruler", {}),
)
for component_name, extra_kwargs in ner_components:
    nlp.add_pipe(component_name, last=True, **extra_kwargs)

# read input file text from specified file
# NOTE: takes 39 mins to process one file
input_file_path = "./data/text extraction - pdftotext"
input_file_name = "acarchae2-517986_214251.txt"
input_file_full = os.path.join(input_file_path, input_file_name)
# reset first so a failed read cannot leave text from an earlier run behind
input_file_text = ""
# The encoding is explicit so decoding does not depend on the platform
# locale, and the handle is not called `input` so the builtin of that name
# stays usable in the rest of the notebook session.
# NOTE(review): assumes the extracted text file is UTF-8 - confirm
with open(input_file_full, "r", encoding="utf-8") as input_file:
    input_file_text = input_file.read()

# Short sample abstract for quick test runs. It is bound to a separate name
# (input_file_text2), so as written it does NOT replace the text read from
# the input file; assign it to input_file_text to annotate this sample
# instead of the full report.
input_file_text2 = """
An archaeological trench evaluation was undertaken by AC archaeology during July
          2023 on land at Hartnoll Farm, Tiverton, Devon (centred on NGR SS 9898 1288). The
          evaluation comprised the machine excavation of 33 trenches totaling 1640m in length
          with each trench 1.8m wide. Trenches were positioned to target anomalies identified
          by a previous geophysical survey, as well as in what were thought to be blank areas.
          The site is located where previous investigations nearby had identified evidence for
          late prehistoric settlement, funerary and agricultural occupation. The main
          archaeological features identified during the present work were comparable to
          previous results and comprised two probable cremation pits representing potential
          evidence for an Early Bronze Age flat cemetery in the southwest part of the site, as
          well as part of a ring ditch of a probable ploughed-down former barrow to the southeast.
          Adjacent to this was a linear ditch likely to be part of a wider pattern of early field
          division. Elsewhere across the site mainly former ditches were present, with the
          majority of these of post medieval/modern date and related to agricultural field division
          and drainage.
"""

# The "created" timestamp ends in a literal "Z" (the UTC designator), so the
# time must be taken in UTC rather than as naive local time.
from datetime import timezone

# descriptive metadata handed to DocSummary together with the annotated doc
metadata = {
    "identifier": input_file_name,
    "title": "vocabulary-based NER results",
    "description": "vocabulary-based NER annotation on report full-text",
    "creator": "T4-1-2-NER-OASIS-reports-full-text.ipynb",
    "created": DT.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
    "periodo_authority_id": periodo_authority_id,
    # names of the pipeline components that produced the annotations
    "ner_pipeline": nlp.pipe_names,
    "input_file_name": input_file_name,
    "input_record_count": 1
}

# normalise input text prior to annotation
clean_file_text = StringCleaning.normalize(input_file_text)

# perform annotation on cleaned text
doc = nlp(clean_file_text)

# wrap the annotated document and its metadata for reporting
summary = DocSummary(doc, metadata=metadata)

# create the output directory next to the input data; exist_ok makes this a
# no-op when it is already there and avoids a separate existence check
output_file_path = os.path.join(input_file_path, "output")
os.makedirs(output_file_path, exist_ok=True)

# slugify turns the input file name into a valid output file name
html_file_name = os.path.join(output_file_path, f"{slugify(input_file_name)}-ner-output.html")

# Write the HTML report. The encoding is explicit so that non-ASCII characters
# in the report cannot fail or be mis-encoded under a platform-specific
# default encoding.
with open(html_file_name, "w", encoding="utf-8") as html_file:
    html_file.write(summary.report(format="html"))