## Running ATRIUM NER pipeline on full text extracted from OASIS PDF reports

In [1]:
%%capture

import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets
%sx python -m spacy download en_core_web_sm

import spacy # for NER processing
from spacy.tokens import Doc # for NER results
from datetime import datetime as DT # for timestamps
import os
from slugify import slugify # for valid filenames from identifiers
from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary, StringCleaning

In [2]:
# identifier of the PeriodO authority to use (HE Periods list)
periodo_authority_id = "p0kh9ds"

# start from the predefined English spaCy pipeline with its own 'ner'
# component disabled
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# rematch2 NER component(s) in the order they are appended to the pipeline,
# each paired with any extra keyword arguments its add_pipe call requires
rematch2_components = (
    ("yearspan_ruler", {}),
    ("periodo_ruler", {"config": {"periodo_authority_id": periodo_authority_id}}),
    ("fish_archobjects_ruler", {}),
    ("fish_monument_types_ruler", {}),
    ("fish_supplementary_ruler", {}),
    ("negation_ruler", {}),
    ("child_span_remover", {}),
)

# add each component to the end of the pipeline
for component_name, extra_kwargs in rematch2_components:
    nlp.add_pipe(component_name, last=True, **extra_kwargs)

# directory scanned for the input text files
input_directory = "./data/journals_july_2024/text extraction - new"

# subset of files to process; names are given in lowercase because directory
# entries are lowercased before being compared against this list
file_names = [
    "archael547-079-116-ceolwulf_new.txt",
    "archael547-005-040-breeze_new.txt",
    "2022_96_013-068_huxley_new.txt",
    "2022_96_001_012_cooper_garton_new.txt",
    "surreyac103_063-090_lambert_new.txt",
    "nas_20_1985_67-86_jackson_new.txt",
    "nas_20_1985_87-112_taylor_new.txt",
    "daj_v023_1901_040-047_new.txt",
    "daj_v086_1966_031-053_new.txt",
    "120_031_097_new.txt",
]

# output goes to a date-stamped sub-directory of the input directory
yyyymmdd = DT.now().strftime('%Y%m%d')
output_directory = os.path.join(input_directory, f"ner-output-{yyyymmdd}")

# create the output directory if it does not already exist; exist_ok avoids
# the check-then-create race and makes re-running the cell a no-op
os.makedirs(output_directory, exist_ok=True)

# Annotate each selected input file and write HTML, text and JSON reports.
#
# For every file in input_directory whose name (case-insensitively) appears in
# file_names: read the text, normalise it, run the NER pipeline over it and
# write one report per output format into output_directory.

# needed so the "created" timestamp really is UTC, as its 'Z' suffix states
from datetime import timezone

# lowercase both sides of the comparison and build the lookup set only once
wanted_names = {name.lower() for name in file_names}

# report formats to produce, paired with the file extension used for each
report_formats = (
    ("html", "html"),
    ("text", "txt"),
    ("json", "json"),
)

# the context manager closes the directory iterator even if processing fails
with os.scandir(input_directory) as entries:
    for entry in entries:
        # skip anything that is not one of the requested files
        if not (entry.is_file() and entry.name.lower() in wanted_names):
            continue

        # print progress indicator
        input_file_name = entry.name
        print(f"processing '{input_file_name}'")

        # read text contents of input file; explicit encoding so the result
        # does not depend on the platform's default locale
        with open(entry.path, encoding="utf-8") as input_file:
            input_file_text = input_file.read()

        # set up metadata to include in output
        metadata = {
            "identifier": input_file_name,
            "title": "vocabulary-based NER results",
            "description": "vocabulary-based NER annotation on report full-text",
            "creator": "T4-1-2-NER-OASIS-reports-full-text.ipynb",
            "created": DT.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
            "periodo_authority_id": periodo_authority_id,
            "ner_pipeline": nlp.pipe_names,
            "input_file_name": input_file_name,
            "input_record_count": 1
        }

        # normalise input text prior to annotation
        clean_file_text = StringCleaning.normalize_text(input_file_text)

        # perform annotation on cleaned text
        doc = nlp(clean_file_text)
        summary = DocSummary(doc, metadata=metadata)

        # all reports for one input share the same slugified file name stem
        output_stem = f"ner-output-{slugify(input_file_name)}"

        # write one report file per format
        # note last run took 21 mins for 2 files
        for report_format, extension in report_formats:
            output_file_name = os.path.join(output_directory, f"{output_stem}.{extension}")
            report = summary.report(format=report_format)
            with open(output_file_name, "w", encoding="utf-8") as output_file:
                output_file.write(report)

processing 'NAS_20_1985_87-112_Taylor_new.txt'
processing 'archael547-005-040-breeze_new.txt'
processing 'archael547-079-116-ceolwulf_new.txt'
processing 'surreyac103_063-090_lambert_new.txt'
