## Introduction
rematch2 performs specialised NER focussed on temporal entities. It implements a series of spaCy pipeline components that identify various entity types in free text. This notebook describes each component with working examples of usage. The following example Python script uses the ``century_ruler`` component to identify ordinal centuries in a given text.

In [None]:
# Using a rematch2 component
import spacy
from spacy import displacy
from rematch2 import components

# the sentence to analyse; it mentions centuries both in words and as ordinals
example_text = "The excavation located a number of late second century BC or possibly early 1st century AD artefacts."

# load a stock spaCy pipeline with its default NER component switched off
nlp = spacy.load("en_core_web_sm", disable=["ner"])
# append the rematch2 century component as the final pipeline stage
nlp.add_pipe("century_ruler", last=True)
# run the example sentence through the extended pipeline
doc = nlp(example_text)
# render the text with the identified entities highlighted
displacy.render(doc, style="ent")

The pipeline components contain language-specific patterns allowing you to perform similar processing in other languages. The next example uses multiple components run against some pre-defined texts in different languages. A small user interface is included so you can change the selected language and re-run to observe the entities identified. 

In [1]:
# Testing temporal components using any of the supported languages
import spacy
from spacy import displacy
from rematch2 import components
from test_examples import test_examples
import ipywidgets as widgets
from IPython.display import display, HTML

def run(btn):
    """Run the temporal components on the example text for the chosen language.

    Click handler for the "Go" button. Reads the language code from
    ``dropdown_language``, looks up the first entry of ``test_examples`` with
    that language, builds a spaCy pipeline from that entry and shows the
    example text with the identified entities highlighted in ``output``.
    If no entry exists for the language, a message is shown instead.

    Args:
        btn: the widget that was clicked (supplied by ``on_click``; unused).
    """
    language = dropdown_language.value
    # get the (first) test present for the chosen language
    test = next((item for item in test_examples if item["language"] == language), None)
    if not test:
        # replace any earlier results rather than appending the message below them
        output.clear_output(wait=True)
        with output:
            display(f"No tests for language '{language}'")
        return

    # use predefined pipeline, disabling the default NER component
    nlp = spacy.load(test["pipe"], disable=['ner'])
    # add the required pipeline component(s) to the end of the pipeline
    nlp.add_pipe("namedperiod_ruler", last=True, config={"periodo_authority_id": test["periodo_authority_id"]})
    nlp.add_pipe("century_ruler", last=True)
    nlp.add_pipe("yearspan_ruler", last=True)
    # process some example text using the modified pipeline
    doc = nlp(test["text"])
    # jupyter=False makes displacy return the markup as a string; in notebook
    # mode it would display the markup itself and return None
    html = displacy.render(doc, style="ent", jupyter=False)
    # show highlighted entities in the example text
    output.clear_output(wait=True)
    with output:
        display(HTML(html))

# define UI components
dropdown_language = widgets.Dropdown(
    # [label, value] pairs: the label is shown in the list, the value is the
    # language code that run() compares with each test's "language" entry
    options = [
        ["German", "de"], 
        ["English", "en"], 
        ["Spanish", "es"], 
        ["French", "fr"],
        ["Italian", "it"], 
        ["Dutch", "nl"],
        ["Norwegian", "no"],
        ["Swedish", "sv"] 
    ],
    value="en", 
    description='Language:',
    disabled=False
)
button_go = widgets.Button(description="Go")
# named input_controls (not input) so the builtin input() is not shadowed
input_controls = widgets.HBox([dropdown_language, button_go])
# bordered, scrollable area of fixed height that receives the results
output = widgets.Output(layout=widgets.Layout(overflow='scroll', border='1px solid black', height='250px'))
# what to do when the button is clicked..
button_go.on_click(run)
# display the UI components
display(input_controls, output)

HBox(children=(Dropdown(description='Language:', index=1, options=(['German', 'de'], ['English', 'en'], ['Span…

Output(layout=Layout(border='1px solid black', height='250px', overflow='scroll'))

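The next cell is disabled: its code is wrapped in a string literal, so nothing in it runs. It is kept for reference only.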

In [None]:
# NOTE: everything in this cell is wrapped in one triple-quoted string, so none
# of it executes; the cell is just a string expression.
# NOTE(review): the wrapped code calls os.path.abspath / os.path.join but only
# imports json, ipywidgets and sys - it would need `import os` before it could
# be re-enabled.
"""
%load_ext autoreload
%autoreload 2

import json
import ipywidgets as widgets

# tell Python where the local modules are located
import sys
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import local modules
from TemporalRecognizer import TemporalRecognizer
from PeriodoData import PeriodoData

# modules for UI
from IPython.display import display, HTML

testdata_file_name = "../src/test-examples.json"
testData = None 
with open(testdata_file_name, "r") as f:  # what if file doesn't exist?            
    testData = json.load(f)   

# get list of all Perio.do authorities
pd = PeriodoData(False) # create new instance, don't refresh cache
authorities = pd.get_authority_list() # returns [{id: "x", label: "label x"}, {id: "y", label: "label y"}]
authorities_for_dropdown = list(map(lambda x: [x["label"], x["id"]], authorities)) # dropdown requires array of tuples: [[label, id], [label, id]]

# define dropdown control for list of Perio.do authorities
dropdown_authorities = widgets.Dropdown(
    options = authorities_for_dropdown, 
    value=None, 
    description='Authority:',
    disabled=False
)

# define dropdown control for list of Perio.do periods
dropdown_periods = widgets.Dropdown(
    options = [], 
    value=None, 
    description='Periods:',
    disabled=False
)

# what to do when selected perio.do authority is changed
def on_dropdown_change(*args):
    periods = pd.get_period_list(dropdown_authorities.value)
    periods_for_dropdown = list(map(lambda item: [item["label"], item["id"]], periods))
    dropdown_periods.options = periods_for_dropdown

dropdown_authorities.observe(on_dropdown_change, 'value')

# define input textarea
#example_input_text_en = "the artefact was medieval or circa 1250-1275 or maybe post medieval? Underlying the modern made ground on the site was a layer covering the entire shaft area. This has been dated to c.1480-1800/1900 and interpreted as a post-medieval cultivation soil. Historic mapping illustrates that the site remained undeveloped through the post medieval period until the mid 19th century, when urban development around the site accelerated and construction of railways in this part of London began. On Gascoigne's 1703 map the site was open ground, the later maps of Rocque in 1746 and Horwood in 1799 show the area was in use as fields and Stanford's map of 1862 depicts the area surrounding Eleanor Street comprising of market gardens. These are all consistent with the archaeological evidence. Underlying the layer were natural terrace gravels. The archaeological fieldwork has demonstrated that remains relating to the Prehistoric, Roman or medieval period have not survived to the modern era, if they were once present on site."

textarea_input = widgets.Textarea(
    value="",
    placeholder='Type something',
    description='String:',
    disabled=False,
    layout=widgets.Layout(width='100%', height='250px')    
)

# define language dropdown
dropdown_language = widgets.Dropdown(
    options = [
        ["German", "de"], 
        ["English", "en"], 
        ["Spanish", "es"], 
        ["French", "fr"],
        ["Italian", "it"], 
        ["Dutch", "nl"],
        ["Norwegian", "no"],
        ["Swedish", "sv"] 
    ],
    value="fr", 
    description='Language:',
    disabled=False
)

# define checkboxes for entity types
data = ["TEMPORAL", "MONUMENT", "MATERIAL", "EVENTTYPE", "ARCHSCIENCE"]
checkboxes = [widgets.Checkbox(value=False, description=label) for label in data]
entity_type_choices = widgets.VBox(children=checkboxes)

#input_text = ""
for testItem in testData:
        if testItem["language"] == dropdown_language.value:
            textarea_input.value = testItem["value"]  
            break       

# define output format dropdown
dropdown_output_format = widgets.Dropdown(
    options = ["html", "json", "csv", "tsv"], 
    value="html", 
    description='Format:',
    disabled=False
)

output = widgets.Output(
    layout=widgets.Layout(overflow='scroll', border='1px solid black', height='250px')
)

def on_button_go_clicked(b):    
    input_text = textarea_input.value
    language = dropdown_language.value
    output_format = dropdown_output_format.value
    authority_id = dropdown_authorities.value
    tr = TemporalRecognizer(language, authority_id)
    entities = tr.get_entities(input_text, output_format)
    #formatted = TemporalRecognizer.format_entities(input_text, entities, output_format)
    #print(formatted)
    output.clear_output(wait=True)
    with output:
        #pprint(formatted)        
        display(HTML(entities))
        #display(HTML('<a href="http://example.com">link</a>'))
    #display(formatted)
        
# define 'Go' button and output area
button_go = widgets.Button(description="Go")
button_go.on_click(on_button_go_clicked)
    
# display all controls on the screen

display(entity_type_choices)
display(dropdown_authorities)
#display(dropdown_periods)
display(dropdown_language)
display(dropdown_output_format)
display(textarea_input)
display(button_go)
display(output) 
#display(HTML('<a href="http://example.com">link</a>'))
"""