In [None]:
# Notebook setup: install the third-party packages used by the cells below
# and silence UserWarning messages for the rest of the session.
import warnings
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): later cells also import rematch2 and load the spaCy models
# "en_core_web_sm" / "fr_core_news_sm", none of which are installed here --
# confirm they are provided by the environment.
%pip install --upgrade pip
%pip install spacy
%pip install ipywidgets
# (disabled) upgrade of Jupyter itself
#%pip install -U jupyter

# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)


## Introduction
rematch2 performs specialised named entity recognition (NER) focussed on temporal entities. It implements a series of spaCy pipeline components that identify various entity types in free text. These components are used by the TemporalAnnotator class, but they may also be used directly for more fine-grained control if required. This notebook describes each component with working examples of usage. The following example Python script uses the ``namedperiod_ruler`` component to identify named periods (terms from a specified Perio.do authority) in a given text.

In [8]:
# Example: tag named periods in an English report summary
import spacy
from spacy import displacy
# `components` is never referenced by name below; presumably importing it is
# what makes the rematch2 component factories known to spaCy -- TODO confirm.
from rematch2 import components

# Sample text to annotate (from https://doi.org/10.5284/1100106).
# The trailing backslashes join the physical lines into one long paragraph.
txt = """
Eight trenches were excavated across the 0.93ha Site. Previous archaeological works immediately to the north of the \
Site in the early 1990s, including field walking and targeted excavation, recorded worked flint and a pit possibly dated to \
the Late Iron Age, while other investigations in the wider area have identified remains of Late Bronze Age, Early Iron Age and \
Saxon date. Evidence for truncation of the deposit sequence was noted across the development area, with the Site area potentially \
having been levelled and drained prior to the establishment of turf pitches, with topsoil present only as a very thin turf horizon overlying \
sterile subsoil and brickearth deposits. The evaluation identified a single pit of Late Bronze Age to Early Iron Age date. No other \
archaeological features were located during the works.
"""

# Start from the stock English pipeline with its built-in NER switched off,
# then append the rematch2 named-period component, configured with the
# Perio.do authority whose terms should be matched.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
nlp.add_pipe(
    "namedperiod_ruler",
    config={"periodo_authority_id": "p0kh9ds"},
    last=True,
)

# Run the modified pipeline over the sample text ...
doc = nlp(txt)
# ... and show the text with the recognised entities highlighted.
displacy.render(doc, style="ent")

The pipeline components contain language-specific patterns, allowing you to perform similar processing in other languages. The following example uses the ``namedperiod_ruler`` component with a French NER pipeline and a specific Perio.do authority file identifier (see http://n2t.net/ark:/99152/p02chr4 - "PACTOLS chronology periods used in DOLIA data", 2021).

In [5]:
# Using a rematch2 component with a French pipeline
import spacy
from spacy import displacy
# `components` is never referenced by name below; presumably importing it is
# what makes the rematch2 component factories known to spaCy -- TODO confirm.
from rematch2 import components

# use a predefined spaCy pipeline, disabling the default NER component
nlp = spacy.load("fr_core_news_sm", disable=['ner'])
# add rematch2 component(s) to the end of the pipeline
nlp.add_pipe("namedperiod_ruler", last=True, config={"periodo_authority_id": "p02chr4"})

# Example text: two paragraphs separated by a newline. A single- or
# double-quoted literal cannot contain a raw line break, so the paragraphs are
# written as adjacent literals (joined by Python at compile time) with the
# line break spelled out as "\n".
txt = (
    "Quelques éléments lithiques Würm IV dispersés sont des indicateurs de la période néolithique d'environ fin 11000 - début 10000 av JC. Un ensemble de fosses, circonscrit sur une surface de 35 m2, a livré du mobilier céramique du premier âge du Fer (Hallstatt C) et quelques éléments lithiques. Un bâtiment sur poteaux se situe à une distance de 100 m vers l’ouest. Pour la période de La Tène finale et gallo-romaine, l’ensemble des vestiges fossoyés et bâtis sont concentrés sur une parcelle à la croisée de deux chemins actuels présents sur le cadastre de 1807. L’élément structurant majeur est le fossé F6 qui traverse perpendiculairement la parcelle, large de 3 m pour une profondeur de 1,80 m sous la terre arable. Seul un angle de fossé, de nature différente et de plus petite taille, semble participer à cette même organisation du paysage. Un bâtiment de plan rectangulaire, 13 x 9 m, sur fondations de schiste dont deux angles ont été découverts, est orienté de façon identique au fossé F6. Il est très bien fondé sur une profondeur de 0,70 m avec de gros blocs de schiste. Si quelques rares tessons du Haut-Empire ont été trouvés dans la fondation du bâtiment, le mobilier céramique du colmatage de la zone humide située à 10 m au sud-ouest, est compris entre le milieu du Ier siècle et le début du IIe siècle de notre ère. Situé à 45 m plus au sud et parallèle au bâtiment, un fossé rectiligne de 3 m de large pour 1,80 m de profondeur sous la terre arable, scinde l’espace en deux. Son creusement en V à été comblé en deux temps. La première phase est une sédimentation naturelle qui a piégé quelques tessons protohistoriques et des scories ferreuses dont un culot de forge. Le colmatage supérieur est composé de matériaux issus de la démolition avec de très nombreuses tuiles, des blocs de schiste brut ainsi que quelques tessons de céramique gallo-romaine. De nombreux trous de poteaux et fosses se situent entre ces deux structures majeures. \n"
    "Un angle de fossé dessinant l’amorce d’un enclos se développe au sud. Du mobilier La Tène finale a également été trouvé dans une fosse située dans cette espace. Un réseau fossoyé se développe à l’est, très érodé du côté nord. Des fragments de céramique possiblement haut Moyen Âge ont été trouvés dans son comblement"
)

# process example text using the modified pipeline
doc = nlp(txt)
# highlight identified entities in the text
displacy.render(doc, style="ent")

The next example uses multiple components (``century_ruler``, ``yearspan_ruler`` and ``namedperiod_ruler``), run against some pre-defined test text in different languages. A small user interface is included so you can change the selected language and click "Go" to re-run and observe the entities identified.

In [6]:
# Testing temporal components using any of the supported languages
import spacy
from spacy import displacy
from rematch2 import components
from test_examples import test_examples
import ipywidgets as widgets
from IPython.display import display, HTML

def run(btn):
    """Show the entities found in the example text for the selected language.

    Click handler for the "Go" button. Reads the language code from
    ``dropdown_language``, takes the first entry of ``test_examples`` whose
    "language" value matches it, runs that entry's text through a spaCy
    pipeline extended with the century, year-span and named-period
    components, and shows the highlighted result in the ``output`` widget.
    If no entry matches, a short message is shown instead.

    Args:
        btn: the argument supplied by ``on_click``; not used.
    """
    language = dropdown_language.value
    # get the (first) test present for the chosen language
    test = next(
        (example for example in test_examples if example["language"] == language),
        None,
    )

    if test:
        # use predefined pipeline, disabling the default NER component
        nlp = spacy.load(test["pipe"], disable=['ner'])
        # add the required pipeline component(s) to the end of the pipeline
        nlp.add_pipe("century_ruler", last=True)
        nlp.add_pipe("yearspan_ruler", last=True)
        nlp.add_pipe("namedperiod_ruler", last=True, config={
                     "periodo_authority_id": test["periodo_authority_id"]})
        # process some example text using the modified pipeline
        doc = nlp(test["text"])
        # jupyter=False makes render() return the markup instead of displaying
        # it itself (and returning None), so exactly one HTML object is shown.
        result = HTML(displacy.render(doc, style="ent", jupyter=False))
    else:
        result = f"No tests for language '{language}'"

    # replace whatever an earlier click left in the output area, in both cases
    output.clear_output(wait=True)
    with output:
        display(result)

# define UI components
# language selector: each option is a (label shown, language code) pair
dropdown_language = widgets.Dropdown(
    options=[
        ("German", "de"),
        ("English", "en"),
        ("Spanish", "es"),
        ("French", "fr"),
        ("Italian", "it"),
        ("Dutch", "nl"),
        ("Norwegian", "no"),
        ("Swedish", "sv"),
    ],
    value="en",
    description='Language:',
    disabled=False
)
button_go = widgets.Button(description="Go")
# Row holding the selector and the button. Deliberately not called `input`:
# that would replace the built-in input() for every later cell in the kernel.
controls = widgets.HBox([dropdown_language, button_go])
# fixed-height, scrollable area that run() writes its results into
output = widgets.Output(layout=widgets.Layout(overflow='scroll', border='1px solid black', height='250px'))
# what to do when the button is clicked..
button_go.on_click(run)
# display the UI components
display(controls, output)

HBox(children=(Dropdown(description='Language:', index=1, options=(['German', 'de'], ['English', 'en'], ['Span…

Output(layout=Layout(border_bottom='1px solid black', border_left='1px solid black', border_right='1px solid b…

In [None]:
"""
%load_ext autoreload
%autoreload 2

import json
import ipywidgets as widgets

# tell Python where the local modules are located
import sys
module_path = os.path.abspath(os.path.join('..', 'src'))
if module_path not in sys.path:
    sys.path.append(module_path)

# import local modules
from TemporalRecognizer import TemporalRecognizer
from PeriodoData import PeriodoData

# modules for UI
from IPython.display import display, HTML

testdata_file_name = "../src/test-examples.json"
testData = None 
with open(testdata_file_name, "r") as f:  # what if file doesn't exist?            
    testData = json.load(f)   

# get list of all Perio.do authorities
pd = PeriodoData(False) # create new instance, don't refresh cache
authorities = pd.get_authority_list() # returns [{id: "x", label: "label x"}, {id: "y", label: "label y"}]
authorities_for_dropdown = list(map(lambda x: [x["label"], x["id"]], authorities)) # dropdown requires array of tuples: [[label, id], [label, id]]

# define dropdown control for list of Perio.do authorities
dropdown_authorities = widgets.Dropdown(
    options = authorities_for_dropdown, 
    value=None, 
    description='Authority:',
    disabled=False
)

# define dropdown control for list of Perio.do periods
dropdown_periods = widgets.Dropdown(
    options = [], 
    value=None, 
    description='Periods:',
    disabled=False
)

# what to do when selected perio.do authority is changed
def on_dropdown_change(*args):
    periods = pd.get_period_list(dropdown_authorities.value)
    periods_for_dropdown = list(map(lambda item: [item["label"], item["id"]], periods))
    dropdown_periods.options = periods_for_dropdown

dropdown_authorities.observe(on_dropdown_change, 'value')

# define input textarea
#example_input_text_en = "the artefact was medieval or circa 1250-1275 or maybe post medieval? Underlying the modern made ground on the site was a layer covering the entire shaft area. This has been dated to c.1480-1800/1900 and interpreted as a post-medieval cultivation soil. Historic mapping illustrates that the site remained undeveloped through the post medieval period until the mid 19th century, when urban development around the site accelerated and construction of railways in this part of London began. On Gascoigne's 1703 map the site was open ground, the later maps of Rocque in 1746 and Horwood in 1799 show the area was in use as fields and Stanford's map of 1862 depicts the area surrounding Eleanor Street comprising of market gardens. These are all consistent with the archaeological evidence. Underlying the layer were natural terrace gravels. The archaeological fieldwork has demonstrated that remains relating to the Prehistoric, Roman or medieval period have not survived to the modern era, if they were once present on site."

textarea_input = widgets.Textarea(
    value="",
    placeholder='Type something',
    description='String:',
    disabled=False,
    layout=widgets.Layout(width='100%', height='250px')    
)

# define language dropdown
dropdown_language = widgets.Dropdown(
    options = [
        ["German", "de"], 
        ["English", "en"], 
        ["Spanish", "es"], 
        ["French", "fr"],
        ["Italian", "it"], 
        ["Dutch", "nl"],
        ["Norwegian", "no"],
        ["Swedish", "sv"] 
    ],
    value="fr", 
    description='Language:',
    disabled=False
)

# define checkboxes for entity types
data = ["TEMPORAL", "MONUMENT", "MATERIAL", "EVENTTYPE", "ARCHSCIENCE"]
checkboxes = [widgets.Checkbox(value=False, description=label) for label in data]
entity_type_choices = widgets.VBox(children=checkboxes)

#input_text = ""
for testItem in testData:
        if testItem["language"] == dropdown_language.value:
            textarea_input.value = testItem["value"]  
            break       

# define output format dropdown
dropdown_output_format = widgets.Dropdown(
    options = ["html", "json", "csv", "tsv"], 
    value="html", 
    description='Format:',
    disabled=False
)

output = widgets.Output(
    layout=widgets.Layout(overflow='scroll', border='1px solid black', height='250px')
)

def on_button_go_clicked(b):    
    input_text = textarea_input.value
    language = dropdown_language.value
    output_format = dropdown_output_format.value
    authority_id = dropdown_authorities.value
    tr = TemporalRecognizer(language, authority_id)
    entities = tr.get_entities(input_text, output_format)
    #formatted = TemporalRecognizer.format_entities(input_text, entities, output_format)
    #print(formatted)
    output.clear_output(wait=True)
    with output:
        #pprint(formatted)        
        display(HTML(entities))
        #display(HTML('<a href="http://example.com">link</a>'))
    #display(formatted)
        
# define 'Go' button and output area
button_go = widgets.Button(description="Go")
button_go.on_click(on_button_go_clicked)
    
# display all controls on the screen

display(entity_type_choices)
display(dropdown_authorities)
#display(dropdown_periods)
display(dropdown_language)
display(dropdown_output_format)
display(textarea_input)
display(button_go)
display(output) 
#display(HTML('<a href="http://example.com">link</a>'))
"""