# Process OASIS records
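Vocabulary-based Named Entity Recognition (NER) applied to a set of OASIS abstracts obtained from the Archaeology Data Service (ADS), read from either an XML or a CSV file. The pipeline detects temporal phrases (year spans and named periods) and object/monument types.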

In [None]:
%%capture

import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets
%sx python -m spacy download en_core_web_sm

#from IPython.display import display, HTML
from slugify import slugify # for valid filenames from identifiers
import spacy # for NER processing
from spacy.tokens import Doc # for NER results
from lxml import etree as ET # for parsing input records from XML file
from datetime import datetime as DT # for timestamps
from html import escape # for writing escaped HTML
import pandas as pd  # for DataFrame
import os
from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary, StringCleaning


In [None]:
# parse and extract a list of OASIS abstract records from source XML file 
# returns [{"id", "text"}, {"id", "text"}, ...] for subsequent processing
def get_records_from_xml_file(file_path: str="") -> list:
    """Extract OASIS abstract records from an XML file.

    Every ``/table/rows/row`` element yields one record. Within a row the
    identifier is taken from the ``value`` element whose ``columnNumber``
    attribute is ``0`` and the abstract from the one whose ``columnNumber``
    is ``1``. If a row holds several matching text nodes only the first one
    is used.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts, one per row, with both
        values stripped of surrounding whitespace (empty string when the row
        has no matching value). When the file cannot be read or parsed a
        message is printed and an empty list is returned.
    """
    # identifiers may be given as full ADS record URLs; only the trailing
    # title ID is kept
    id_url_prefix = "https://archaeologydataservice.ac.uk/archsearch/record?titleId="

    try:
        tree = ET.parse(file_path)
    except (OSError, ET.LxmlError):
        # only I/O and XML parsing failures are treated as "unreadable";
        # anything else (KeyboardInterrupt, programming errors) propagates
        print(f"Could not read from {file_path}")
        return []

    records = []
    for row in tree.xpath("/table/rows/row"):
        abstracts = row.xpath("value[@columnNumber='1']/text()")
        identifiers = row.xpath("value[@columnNumber='0']/text()")

        # if multiple values are present, use the first one
        abstract = abstracts[0] if abstracts else ""
        identifier = identifiers[0].replace(id_url_prefix, "") if identifiers else ""

        records.append({
            "id": str(identifier).strip(),
            "text": str(abstract).strip(),
        })

    return records


# parse and extract a list of OASIS abstract records from source CSV file 
# returns [{"id", "text"}, {"id", "text"}, ...] for subsequent processing
def get_records_from_csv_file(file_path: str="") -> list:
    """Extract OASIS abstract records from a CSV file.

    The CSV must have a ``file`` column (used as the record identifier) and
    an ``abstract`` column (used as the record text).

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list of ``{"id": ..., "text": ...}`` dicts, one per CSV row, with
        both values stripped of surrounding whitespace. Blank cells become
        empty strings.

    Raises:
        KeyError: If the ``file`` or ``abstract`` column is missing.
        Any exception raised by ``pandas.read_csv`` (e.g. when the file does
        not exist) is propagated unchanged.
    """
    # read every column as text so identifiers are kept verbatim; with type
    # inference a numeric-looking ID loses leading zeros ("001" -> 1) and
    # gains a ".0" suffix when the column also contains blank cells
    df = pd.read_csv(file_path, skip_blank_lines=True, dtype=str)
    # blank cells are read as NaN; replace them with empty strings
    df = df.fillna("")

    return [
        {
            "id": str(item["file"]).strip(),
            "text": str(item["abstract"]).strip(),
        }
        for item in df.to_dict(orient="records")
    ]
     

In [None]:
# Build the annotation pipeline, load the input records and prepare the
# shared report metadata and output directory used by the processing loop.
from datetime import timezone  # for a genuinely UTC "created" timestamp

# use predefined spaCy pipeline (English), no NER
nlp = spacy.load("en_core_web_sm", disable=["ner"])

# using HE Periods list
periodo_authority_id = "p0kh9ds"

# add rematch2 NER components to the end of the pipeline
nlp.add_pipe("yearspan_ruler", last=True)
nlp.add_pipe("periodo_ruler", last=True, config={"periodo_authority_id": periodo_authority_id})
nlp.add_pipe("fish_archobjects_ruler", last=True)
nlp.add_pipe("fish_monument_types_ruler", last=True)
nlp.add_pipe("fish_supplementary_ruler", last=True)
nlp.add_pipe("negation_ruler", last=True)

# process ADS CSV report examples
input_file_path = "./data/report_metadata"
input_file_name = "report_metadata.csv"

# process ADS CSV journal examples
#input_file_path = "./data/journal_metadata"
#input_file_name = "journal_metadata.csv"

# process ADS XML metadata examples
# input_file_path = "./data/oasis_descr_examples"
# input_file_name = "oasis_descr_examples.xml"

# pick the reader from the file extension (case-insensitive)
input_records = []
input_file = os.path.join(input_file_path, input_file_name)
if input_file.lower().endswith("xml"):
    input_records = get_records_from_xml_file(input_file)
elif input_file.lower().endswith("csv"):
    input_records = get_records_from_csv_file(input_file)
else:
    # make it obvious why nothing will be processed
    print(f"Unsupported input file type: {input_file}")

record_count = len(input_records)

# metadata passed to every DocSummary; "identifier" is filled in per record
metadata = {
    "identifier": "",
    "title": "vocabulary-based NER results",
    "description": "vocabulary-based NER annotation on text abstracts",
    "creator": "T4-1-2-NER-OASIS-metadata-records.ipynb",
    # the trailing "Z" denotes UTC, so the timestamp must be taken in UTC
    # rather than in local time
    "created": DT.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
    "periodo_authority_id": periodo_authority_id,
    "ner_pipeline": nlp.pipe_names,
    "input_file_name": input_file_name,
    "input_record_count": record_count
}

# create output file path if it does not already exist
output_file_path = os.path.join(input_file_path, "output")
os.makedirs(output_file_path, exist_ok=True)

# Annotate each input record and write its results as HTML, text and JSON
# reports into output_file_path.
current_record = 0  # stays 0 when there are no records to process
for current_record, record in enumerate(input_records or [], start=1):
    # get ID and text from the record
    identifier = record.get("id", "").strip()
    input_text = record.get("text", "")

    # the same metadata dict is reused for every record; only the
    # identifier changes between iterations
    metadata["identifier"] = identifier

    # print progress indicator
    print(f"processing record {current_record} of {record_count} [ID: {identifier}]")

    # normalise text prior to NER processing
    # (whitespace, punctuation & spelling)
    cleaned = StringCleaning.normalize(input_text)

    # perform annotation on cleaned text
    doc = nlp(cleaned)
    summary = DocSummary(doc, metadata=metadata)

    # build output file names incorporating record identifier
    # slugify identifier in case of bad characters for file names; fall back
    # to the record number when the slug is empty, otherwise every such
    # record would overwrite the same ".html" / ".txt" / ".json" files
    file_stem = slugify(identifier) or f"record-{current_record}"
    html_file = os.path.join(output_file_path, f"{file_stem}.html")
    text_file = os.path.join(output_file_path, f"{file_stem}.txt")
    json_file = os.path.join(output_file_path, f"{file_stem}.json")

    # write results to HTML, TEXT and JSON files
    # (explicit UTF-8 so non-ASCII characters are written the same way on
    # every platform instead of depending on the locale's default encoding)
    with open(html_file, "w", encoding="utf-8") as file:
        file.write(summary.report(format="html"))
    with open(text_file, "w", encoding="utf-8") as file:
        file.write(summary.report(format="text"))
    with open(json_file, "w", encoding="utf-8") as file:
        file.write(summary.report(format="json"))

    # temp interrupt after a few records (while testing)
    #if current_record == 5:
        #break

In [None]:
# DISABLED CELL: the whole body below is a single triple-quoted string
# literal, so none of the code inside it is executed.
# The enclosed code builds one "<li><a href=...>" entry per input record,
# linking (via html-preview.github.io) to that record's slugified .html
# report in the cbinding/rematch2 GitHub repository, sorts the entries and
# writes them wrapped in "<ul>...</ul>" to ./data/output/results.md.
# NOTE(review): both the link URL and the results path point at data/output,
# whereas the processing cell writes reports to <input_file_path>/output;
# confirm the intended location before re-enabling this cell.
"""
from IPython.display import display, HTML

# build list of results
def result_link(record):
    identifier = record["id"] 
    file_path=f"https://html-preview.github.io/?url=https://github.com/cbinding/rematch2/blob/main/data/output/{slugify(identifier)}.html"
    return f"<li><a href='{file_path}'>{identifier}</a></li>" 
results = list(map(result_link, input_records or []))
results.sort()
#display(HTML("<ul>" + "".join(results) + "</ul>"))
with open("./data/output/results.md", "w") as file:
    file.write("<ul>" + "".join(results) + "</ul>")
"""