# Process OASIS records
Vocabulary-based Information Extraction (IE) and vocabulary alignment, applied to a set of OASIS abstract records (XML or CSV) obtained from the Archaeology Data Service (ADS). The pipeline detects temporal phrases and object/monument types.

In [None]:
%%%capture
# NOTE(review): the IPython cell magic is normally written `%%capture`; the extra
# leading `%` may be an artifact of how this notebook was exported - confirm.
import warnings
# suppress user and future warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)

# install required dependencies (upgrade pip first, then install spaCy)
%pip install --upgrade pip
%pip install spacy
#%pip install ipywidgets

# download the small English spaCy model (en_core_web_sm) via a shell call
%sx python -m spacy download en_core_web_sm

In [None]:
# reference required modules
#from IPython.display import display, HTML
from slugify import slugify # for creating valid filenames from identifiers
import spacy # for text processing
from spacy.tokens import Doc # for text processing results
from spacy.pipeline import Pipe # for custom pipeline components
from lxml import etree as ET # for parsing input records from XML file
from datetime import datetime as DT # for timestamps
from weasyprint import HTML
from html import escape # for writing escaped text within HTML 
import pandas as pd  # for DataFrame
import json, os
from rematch2 import DocSummary
import warnings
from rematch2.SpanScorer import SpanScorer
from ATRIUM_T4_1_2_IE_pipeline import get_pipeline, read_csv_file

#import pdfkit # to produce PDF output from html output
# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)


In [None]:
def get_records_from_xml_file(file_path: str="") -> list:
    """Parse and extract a list of OASIS abstract records from an XML file.

    Every ``/table/rows/row`` element produces one record. The identifier is
    the first text node of ``value[@columnNumber='0']`` (with the ADS record
    URL prefix removed) and the abstract is the first text node of
    ``value[@columnNumber='1']``. A missing value becomes an empty string.

    Args:
        file_path: path of the XML file to read.

    Returns:
        A list ``[{"id", "text"}, {"id", "text"}, ...]`` for subsequent
        processing, with surrounding whitespace stripped from both values.
        If the file cannot be read or parsed, a message is printed and an
        empty list is returned.
    """
    url_prefix = "https://archaeologydataservice.ac.uk/archsearch/record?titleId="

    try:
        tree = ET.parse(file_path)
    except Exception:
        # Deliberately best-effort: any read/parse failure yields no records.
        # `except Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        print(f"Could not read from {file_path}")
        return []

    def first_text(row, column_number: int) -> str:
        # if a row holds multiple values for the column, only the first is used
        values = row.xpath(f"value[@columnNumber='{column_number}']/text()")
        return str(values[0]) if values else ""

    return [
        {
            "id": first_text(row, 0).replace(url_prefix, "").strip(),
            "text": first_text(row, 1).strip(),
        }
        for row in tree.xpath("/table/rows/row")
    ]


def get_records_from_csv(file_path: str="") -> list:
    """Parse and extract a list of OASIS abstract records from a CSV file.

    The rows are obtained from ``read_csv_file`` (presumably an iterable of
    dict-like rows - verify against that helper). From each row the "file",
    "title" and "abstract" values are read.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        A list ``[{"id", "title", "text"}, ...]`` for subsequent processing.
        Values are converted to stripped strings; a missing or ``None`` value
        becomes an empty string (not the literal text "None").
    """
    def clean(value) -> str:
        # str(None) would yield the text "None", which would then be
        # processed as if it were a real abstract
        return "" if value is None else str(value).strip()

    # tolerate a reader that returns None instead of an empty collection
    items = read_csv_file(file_path) or []

    return [
        {
            "id": clean(item.get("file")),
            "title": clean(item.get("title")),
            "text": clean(item.get("abstract")),
        }
        for item in items
    ]
     

In [None]:
# Main processing cell: read the input records, annotate each abstract with
# the spaCy pipeline, and write a PDF, CSV and JSON report per record.
from datetime import timezone  # for a true UTC "created" timestamp

# using predefined spaCy pipeline (English)
nlp = get_pipeline("en")

# timestamp (local date) for use in directory names
timestamp = DT.now().strftime('%Y%m%d')

# process ADS CSV report examples
input_directory = "./data/ie-input/oasis-report-metadata"
input_file_name = "report_metadata.csv"
output_directory = "./data/ie-output/ie-output-oasis-report-metadata"

# process ADS CSV journal examples
#input_directory = "./data/ie-input/ads-journal-metadata"
#input_file_name = "journal_metadata.csv"
#output_directory = "./data/ie-output/ie-output-ads-journal-metadata"

# process ADS XML metadata examples
#input_directory = "./data/ie-input/oasis-descr-examples"
#input_file_name = "oasis_descr_examples.xml"
#output_directory = f"./data/ie-output/ie-output-oasis-descr-examples"

# create (dated) output directory; exist_ok avoids the check-then-create race
output_directory = f"{output_directory}-{timestamp}"
os.makedirs(output_directory, exist_ok=True)

# read the input records using the reader matching the file extension
input_file = os.path.join(input_directory, input_file_name)
if input_file.lower().endswith(".xml"):
    input_records = get_records_from_xml_file(input_file)
elif input_file.lower().endswith(".csv"):
    input_records = read_csv_file(input_file)
else:
    print(f"Unsupported input file type: {input_file}")
    input_records = []

# normalise a None result before counting, so len() cannot fail
input_records = input_records or []
record_count = len(input_records)

metadata = {
    "identifier": "",
    "title": "vocabulary-based information extraction results",
    "description": "vocabulary-based information extraction annotation on text abstracts",
    "creator": "T4-1-2-IE-OASIS-metadata-records.ipynb",
    # the trailing "Z" denotes UTC, so the time itself must be taken in UTC
    "created": DT.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
    #"periodo_authority_id": periodo_authority_id,
    "pipeline": nlp.pipe_names,
    "input_file_name": input_file_name,
    "input_record_count": record_count
}

for current_record, record in enumerate(input_records, start=1):
    # get ID from the record (tolerating a missing or None value)
    identifier = str(record.get("id") or "").strip()
    metadata["identifier"] = identifier

    # print progress indicator
    print(f"processing record {current_record} of {record_count} [ID: {identifier}]")

    # 04/10/24 just process main text, don't include title
    input_text = record.get("text") or ""
    if not input_text:
        # nothing to annotate for this record
        continue

    # perform annotation on (normalized) text
    doc = nlp(input_text)

    # (optionally) add any identified place entities to the custom spans
    #for ent in filter(lambda e: e.label_ == "GPE", doc.ents):
        #doc.spans["rematch"].append(ent)

    # NOTE(review): the summary is created before the span scores are added;
    # this presumably relies on SpanScorer updating the same Doc in place -
    # confirm, otherwise the reports will not include the scores.
    summary = DocSummary(doc, metadata=metadata)

    # add calculated scores for spans
    sections = list(record.get("sections") or [])
    scorer = SpanScorer(nlp, sections=sections)
    doc = scorer(doc)

    # build output file names incorporating record identifiers;
    # slugify identifiers in case of bad characters for file names, and fall
    # back to the record number so records without a usable identifier do
    # not overwrite each other's output files
    slug = slugify(identifier) or f"record-{current_record}"
    output_stem = os.path.join(output_directory, f"ie-output-{slug}")
    csv_file_name = f"{output_stem}.csv"
    json_file_name = f"{output_stem}.json"
    pdf_file_name = f"{output_stem}.pdf"

    # write results report to PDF file
    report = summary.report(format="html")
    HTML(string=report, encoding="utf-8").write_pdf(target=pdf_file_name)

    # write results to CSV file (explicit UTF-8 so non-ASCII text in the
    # abstracts does not depend on the platform default encoding)
    with open(csv_file_name, "w", encoding="utf-8") as file:
        file.write(summary.spans_to_csv())

    # write results to JSON file
    report = summary.report(format="json")
    with open(json_file_name, "w", encoding="utf-8") as file:
        file.write(report)

processing record 1 of 20 [ID: acarchae2-517986_214251.pdf]
"_get_dependency_pairs" ran in 0.031 seconds
"_get_noun_chunk_pairs" ran in 0.003 seconds
"_get_all_pairs" ran in 0.036 seconds
"_get_dependency_pairs" ran in 0.029 seconds
"_get_noun_chunk_pairs" ran in 0.007 seconds
"_get_all_pairs" ran in 0.036 seconds
"_get_dependency_pairs" ran in 0.014 seconds
"_get_noun_chunk_pairs" ran in 0.001 seconds
"_get_all_pairs" ran in 0.016 seconds
"get_span_pairs" ran in 0.016 seconds
"_get_dependency_pairs" ran in 0.011 seconds
"_get_noun_chunk_pairs" ran in 0.001 seconds
"_get_all_pairs" ran in 0.013 seconds
"get_negated_span_pairs" ran in 0.013 seconds
"report_to_html" ran in 0.075 seconds
"report" ran in 0.075 seconds
"report_to_json" ran in 0.006 seconds
"report" ran in 0.006 seconds
processing record 2 of 20 [ID: allenarc1-513712_214540.pdf]
"_get_dependency_pairs" ran in 0.034 seconds
"_get_noun_chunk_pairs" ran in 0.005 seconds
"_get_all_pairs" ran in 0.039 seconds
"_get_dependency_pai