# Process OASIS records
This notebook applies vocabulary-based Named Entity Recognition (NER) to a set of OASIS abstracts (in XML format) obtained from the Archaeology Data Service (ADS), detecting temporal phrases and object/monument types.

In [1]:
import warnings
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

# load required dependencies
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets
%sx python -m spacy download en_core_web_sm

#from IPython.display import display, HTML
from datetime import datetime as DT # for timestamps
from datetime import timezone # for UTC timestamps
from html import escape # for writing escaped HTML
from pathlib import Path # for creating the output directory

import spacy # for NER processing
from lxml import etree as ET # for parsing input records from XML file
from slugify import slugify # for valid filenames from identifiers
from spacy.tokens import Doc # for NER results

from rematch2 import PeriodoRuler, VocabularyRuler, NegationRuler, DocSummary


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# write ISO timestamp in consistent format
def timestamp():
    """Return the current UTC time as an ISO 8601 string, e.g. '2024-01-31T13:45:00Z'.

    The trailing 'Z' is the UTC designator, so the time is taken in UTC;
    formatting local time with a 'Z' suffix would mislabel the value on any
    machine whose clock is not set to UTC.
    """
    return DT.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')

# normalize string whitespace
def normalize_whitespace(s: str = "") -> str:
    """Collapse each run of whitespace in `s` to one space and trim both ends."""
    # split() without a separator already ignores leading/trailing whitespace
    words = s.split()
    return " ".join(words)
    
# parse and extract a list of OASIS abstract records from source XML file 
# returns [{"id", "text"}, {"id", "text"}, ...] for subsequent processing
def get_records_from_xml_file(file_path: str="") -> list:
    """Extract OASIS abstract records from a source XML file.

    Each /table/rows/row element yields one record: the first text value of
    column 0 is the identifier (with the ADS record URL prefix removed) and
    the first text value of column 1 is the abstract. A missing value
    becomes an empty string.

    Args:
        file_path: path of the XML file to read.

    Returns:
        A list of {"id": str, "text": str} dicts, one per row, in document
        order. An empty list is returned (and a message printed) if the file
        cannot be read or parsed.
    """
    url_prefix = "https://archaeologydataservice.ac.uk/archsearch/record?titleId="

    def first_text(row, column_number: int) -> str:
        # if a row holds multiple values for the column, only the first is used
        values = row.xpath(f"value[@columnNumber='{column_number}']/text()")
        return values[0] if values else ""

    try:
        tree = ET.parse(file_path)
    except Exception as err:
        # 'Exception' (not a bare except) so Ctrl-C / SystemExit still propagate;
        # the reason is reported so a bad path and bad XML can be told apart
        print(f"Could not read from {file_path}: {err}")
        return []

    records = []
    for row in tree.xpath("/table/rows/row"):
        identifier = first_text(row, 0).replace(url_prefix, "")
        abstract = first_text(row, 1)
        records.append({"id": identifier.strip(), "text": abstract.strip()})

    return records

# write a single result as an HTML file for presentation of output
def write_result_to_html_file(file_name: str="", doc: Doc = None, metadata: dict = {}):
    """Write the annotation results for one document to a standalone HTML file.

    The page contains, in order: the record identifier as a heading (as a
    link when it starts with "http"), a collapsible metadata list, the
    document text, the tokens, the span counts, the PERIOD/YEARSPAN to
    OBJECT/MONUMENT span pairs and the NEGATION pairs. The stylesheet
    'find_pairs.css' is read from the current working directory and inlined,
    so the output has no file dependencies.

    Args:
        file_name: path of the HTML file to write (overwritten if present).
        doc: the processed spaCy Doc to summarise.
        metadata: values shown in the metadata section; the keys read are
            "identifier", "title", "description", "creator", "timestamp",
            "periodo_authority_id" and "ner_pipeline" (a list of names).
            The dict is only read, never modified.
    """
    summary = DocSummary(doc)
    html = []

    def metadata_value(key: str) -> str:
        # str() tolerates non-string metadata values (e.g. counts)
        return str(metadata.get(key.strip(), "")).strip()

    # write header tags
    html.append("<!DOCTYPE html>")
    html.append("<html>")

    # write CSS from file to style tag (so no file dependency)
    html.append("<head>")
    # declare the encoding the file is written in (see the open() call below)
    html.append("<meta charset='utf-8'>")
    with open('find_pairs.css', 'r', encoding='utf8') as css_file:
        css_text = css_file.read()
        html.append(f'<style>{css_text}</style>')

    html.append("</head>")
    html.append("<body>")

    # write identifier as heading; escaped for both the text and the
    # single-quoted href attribute (html.escape also escapes quotes)
    identifier = metadata_value("identifier")
    safe_identifier = escape(identifier)
    html.append("<h3>")
    if identifier.startswith("http"):
        html.append(f"<a target='_blank' rel='noopener noreferrer' href='{safe_identifier}'>{safe_identifier}</a>")
    else:
        html.append(safe_identifier)
    html.append("</h3>")

    # write metadata as (displayed label, metadata key) pairs
    metadata_fields = [
        ("identifier", "identifier"),
        ("title", "title"),
        ("description", "description"),
        ("creator", "creator"),
        ("created", "timestamp"),
        ("periodo authority ID", "periodo_authority_id"),
    ]
    html.append("<details>")
    html.append("<summary>Metadata</summary>")
    html.append("<ul>")
    for label, key in metadata_fields:
        html.append(f"<li><strong>{label}:</strong> {escape(metadata_value(key))}</li>")

    pipeline = metadata.get('ner_pipeline', [])
    pipelist = "<ul>" + "".join(f"<li>{escape(name)}</li>" for name in pipeline) + "</ul>"
    html.append(f"<li><strong>NER pipeline:</strong>{pipelist}</li>")
    html.append("</ul>")
    html.append("</details>")

    # write HTML rendering of doc text as paragraph
    html.append("<details open>")
    html.append(f"<summary>Text ({len(summary.doctext('text'))} characters)</summary>")
    doctext = summary.doctext(format="html")
    html.append(f"<p>{doctext}</p>")
    html.append("</details>")

    # write list of tokens
    # NOTE(review): format "htmll" is kept exactly as found; presumably an
    # HTML-list format alongside "htmlt" - confirm against DocSummary
    html.append("<details>")
    html.append(f"<summary>Tokens ({len(summary.tokens('list'))})</summary>")
    html.append(summary.tokens("htmll"))
    html.append("</details>")

    # write label counts
    #html.append("<details>")
    #html.append(f"<summary>Label Counts ({len(DocSummary(doc).labelcounts('list'))})</summary>")
    #html.append(DocSummary(doc).labelcounts(format="htmlt"))
    #html.append("</details>")

    # write span counts (reusing the summary built above)
    html.append("<details>")
    html.append(f"<summary>Span Counts ({len(summary.spancounts('list'))})</summary>")
    html.append(summary.spancounts(format="htmlt"))
    html.append("</details>")

    # relation operators shared by both pair listings below
    rel_ops = [ "<", ">", "<<", ">>", ".", ";" ]

    # get and write span pairs
    html.append("<details>")
    html.append("<summary>Span Pairs</summary>")
    pairs = summary.spanpairs(
        format="htmlt",
        rel_ops=rel_ops,
        left_labels=["PERIOD", "YEARSPAN"],
        right_labels=["OBJECT", "MONUMENT"]
        )
    html.append(pairs)
    html.append("</details>")

    # get and write negated pairs
    html.append("<details>")
    html.append("<summary>Negated Pairs</summary>")
    pairs = summary.spanpairs(
        format="htmlt",
        rel_ops=rel_ops,
        left_labels=["NEGATION"],
        right_labels=["YEARSPAN", "PERIOD", "OBJECT", "MONUMENT"]
        )
    html.append(pairs)
    html.append("</details>")

    # write footer tags
    html.append("</body>")
    html.append("</html>")

    # finally, write HTML as string to file; explicit UTF-8 so non-ASCII text
    # does not depend on (or fail under) the platform default encoding
    with open(file_name, "w", encoding="utf-8") as html_file:
        html_file.write("".join(html))

In [5]:
periodo_authority_id = "p0kh9ds" # HE Periods list
output_dir = Path("./data/output") # one HTML result file per record is written here

# use predefined spaCy pipeline (English) with its own 'ner' component disabled
nlp = spacy.load("en_core_web_sm", disable = ['ner'])
# add rematch2 NER component(s) to the end of the pipeline
nlp.add_pipe("yearspan_ruler", last=True)
nlp.add_pipe("periodo_ruler", last=True, config={"periodo_authority_id": periodo_authority_id})
nlp.add_pipe("fish_archobjects_ruler", last=True)
nlp.add_pipe("fish_monument_types_ruler", last=True)
nlp.add_pipe("fish_supplementary_ruler", last=True)
nlp.add_pipe("negation_ruler", last=True)

input_records = get_records_from_xml_file("./data/input/oasis_descr_examples.xml")
record_count = len(input_records)

# metadata shared by every output file; "identifier" is set per record below
metadata = {
    "title": "process_oasis_records results",
    "description": "vocabulary-based NER annotation of archaeology abstracts",
    "creator": "process_oasis_records.ipynb",
    "timestamp": timestamp(),
    "periodo_authority_id": periodo_authority_id,
    "ner_pipeline": nlp.pipe_names,
    "input_record_count": record_count
}

# ensure the output directory exists, otherwise the first write fails
# on a fresh checkout
output_dir.mkdir(parents=True, exist_ok=True)

for current_record, record in enumerate(input_records or [], start=1):
    # get ID and text from the record
    identifier = record.get("id", "")
    input_text = record.get("text", "")

    # print progress indicator
    print(f"processing record {current_record} of {record_count} [ID: {identifier}]")

    # slugify identifier in case of bad characters for file names
    output_file_path = str(output_dir / f"{slugify(identifier)}.html")

    # normalise white space prior to annotation
    # (extra spaces frustrate pattern matching)
    cleaned = normalize_whitespace(input_text)

    # perform annotation on cleaned text
    doc = nlp(cleaned)

    # write results to html file
    metadata["identifier"] = identifier
    write_result_to_html_file(file_name=output_file_path, doc=doc, metadata=metadata)
    # temp interrupt after a few records
    #if current_record == 20:
        #break

processing record 1 of 1692 [ID: cambridg1-30423]
processing record 2 of 1692 [ID: universi1-91218]
processing record 3 of 1692 [ID: preconst1-3588]
processing record 4 of 1692 [ID: preconst1-3588]
processing record 5 of 1692 [ID: cambridg3-76125]
processing record 6 of 1692 [ID: preconst1-4915]
processing record 7 of 1692 [ID: aocarcha1-129642]
processing record 8 of 1692 [ID: thamesva1-93666]
processing record 9 of 1692 [ID: fieldsec1-100417]
processing record 10 of 1692 [ID: trentpea1-324597]
processing record 11 of 1692 [ID: archaeol7-404823]
processing record 12 of 1692 [ID: cotswold2-93883]
processing record 13 of 1692 [ID: colchest3-92994]
processing record 14 of 1692 [ID: tyneandw3-6189]
processing record 15 of 1692 [ID: essexcou1-41978]
processing record 16 of 1692 [ID: molas1-8593]
processing record 17 of 1692 [ID: eastsuss3-7591]
processing record 18 of 1692 [ID: archaeol6-186501]
processing record 19 of 1692 [ID: surreyco1-105591]
processing record 20 of 1692 [ID: archaeol7

In [6]:
from IPython.display import display, HTML

# build list of results
def result_link(record):
    """Return an HTML list item linking to the rendered result page of one record.

    Args:
        record: a dict with an "id" key holding the record identifier.

    Returns:
        An "<li><a href=...>identifier</a></li>" string. The link target is
        built from the slugified identifier; the visible identifier is
        HTML-escaped so characters such as '<' or '&' cannot break the markup.
    """
    identifier = record["id"]
    file_path = f"https://html-preview.github.io/?url=https://github.com/cbinding/rematch2/blob/main/data/output/{slugify(identifier)}.html"
    return f"<li><a href='{file_path}'>{escape(identifier)}</a></li>"
# one <li> link per input record, sorted as strings
results = sorted(map(result_link, input_records or []))
#display(HTML("<ul>" + "".join(results) + "</ul>"))
# explicit encoding so the written file does not depend on the platform default
with open("./data/output/results.md", "w", encoding="utf-8") as file:
    file.write("<ul>" + "".join(results) + "</ul>")