In [1]:
# pre-install required libraries
import warnings
#%pip install -upgrade pip
%pip install spacy
%pip install ipywidgets
#%pip install -U jupyter

%sx python - m spacy download en_core_web_sm

# suppress user warnings during execution
warnings.filterwarnings(action='ignore', category=UserWarning)

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Using the Vocabulary Annotator
## Introduction
The VocabularyAnnotator class performs *vocabulary-based* lookup matching on terms. The following example vocabularies are included for use with the VocabularyAnnotator class. Note that these term lists are dated snapshots (extracted via SPARQL queries) provided for experimentation. They do not necessarily represent comprehensive coverage; in all cases the originating vocabularies should be consulted for fuller information.


| Vocabulary                            | Entity Type   | Description      | Examples  |
|---------------------------------------|---------------|------------------| ----------|
| AAT_ACTIVITIES        | ACTIVITY      | Terms from the [AAT 'Activities' facet](http://vocab.getty.edu/aat/300264090)   | *religious holidays, subtractive processes* |
| AAT_AGENTS            | AGENT         | Terms from the [AAT 'Agents' facet](http://vocab.getty.edu/aat/300264089)  | *scientific photographers, software developers* |
| AAT_ASSOCIATED_CONCEPTS | ASSOCIATED_CONCEPT | Terms from the [AAT 'Associated Concepts' facet](http://vocab.getty.edu/aat/300264086) | *literature, maiden names* |
| AAT_MATERIALS         | MATERIAL      | Terms from the [AAT 'Materials' facet](http://vocab.getty.edu/aat/300264091) | *polymer, frosted glass* |
| AAT_OBJECTS           | OBJECT        | Terms from the [AAT 'Objects' facet](http://vocab.getty.edu/aat/300264092) | *shields, horseshoe arches* |
| AAT_PHYSICAL_ATTRIBUTES | PHYSICAL_ATTRIBUTE | Terms from the [AAT 'Physical Attributes' facet](http://vocab.getty.edu/aat/300264087) | *surface mounted, kidney shaped* |
| AAT_STYLEPERIODS      | STYLEPERIOD   | Terms from the [AAT 'Styles &amp; Periods' facet](http://vocab.getty.edu/aat/300264088) | *transitional, Dutch colonial revival* |
| FISH_ARCHOBJECTS      | OBJECT        | Terms from the [FISH Archaeological Objects Thesaurus](http://purl.org/heritagedata/schemes/mda_obj) | *axe, sherds, ring* |
| FISH_ARCHSCIENCES     | ARCHSCIENCE   | Terms from the [FISH Archaeological Sciences Thesaurus](http://purl.org/heritagedata/schemes/560) | *lead isotope dating, palynology* |
| FISH_BUILDING_MATERIALS |  MATERIAL   | Terms from the [FISH Building Materials Thesaurus](http://purl.org/heritagedata/schemes/eh_tbm) | *brass, quartz, pine, bone, leather* |
| FISH_COMPONENTS       | COMPONENT     | Terms from the [HE Components Thesaurus](http://purl.org/heritagedata/schemes/eh_com) | *rafter, truss, flue* |
| FISH_EVENT_TYPES      | EVENTTYPE     | Terms from the [FISH Event Types Thesaurus](http://purl.org/heritagedata/schemes/agl_et) | *core sampling, geophysical survey, evaluation* |
| FISH_EVIDENCE         | EVIDENCE      | Terms from the [HE Evidence Thesaurus](http://purl.org/heritagedata/schemes/eh_evd) | *cropmark, artefact scatter* |
| FISH_MARITIME_CRAFT   | MARITIME      | Terms from the [FISH Maritime Craft Types Thesaurus](http://purl.org/heritagedata/schemes/eh_tmc) | *galley, salvage tug, dredger* |
| FISH_MONUMENT_TYPES   | MONUMENT      | Terms from the [FISH Thesaurus of Monument Types](http://purl.org/heritagedata/schemes/eh_tmt2) | *midden, weighbridge, kiln* |
| FISH_PERIODS          | NAMEDPERIOD   | Terms from Perio.do [Historic England Periods Authority File](http://n2t.net/ark:/99152/p0kh9ds) | *Medieval, Bronze Age* |





In [2]:
# example using VocabularyAnnotator class on a passage of text
from IPython.display import display  # , HTML

from rematch2.VocabularyAnnotator import VocabularyAnnotator
from rematch2.VocabularyEnum import VocabularyEnum
#from pathlib import Path
#import os
#import json

# sample passage to annotate (taken from https://doi.org/10.5284/1100093)
test_text = """
This collection comprises site data (images, a report, a project database and GIS data) from an archaeological excavation undertaken by Cotswold Archaeology between January and February 2020 at Lydney B Phase III, Archers Walk, Lydney, Gloucestershire. An area of 0.6ha was excavated within this phase (Phase III) of a wider development area.
Aside from three residual flints, none closely datable, the earliest remains comprised an aesica brooch and an assemblage of Roman pottery and ceramic building material, also residual and most likely derived from a Roman farmstead found immediately to the north within the Phase II excavation area. A single sherd of Anglo-Saxon grass-tempered pottery was also residual.
The earliest features, which accounted for the majority of the remains on site, relate to medieval agricultural activity focused within a large enclosure. There was little to suggest domestic occupation within the site: the pottery assemblage was modest and well abraded, whilst charred plant remains were sparse, and, as with some metallurgical residues, point to waste disposal rather than the locations of processing or consumption. A focus of occupation within the Rodley Manor site, on higher ground 160m to the north-west, seems likely, with the currently site having lain beyond this and providing agricultural facilities, most likely corrals and pens for livestock. Animal bone was absent, but the damp, low-lying ground would have been best suited to cattle. An assemblage of medieval coins recovered from the subsoil during a metal detector survey may represent a dispersed hoard.
"""

# output format to request from the annotator, one of:
#   "html"      - inline markup for visualising annotations in context
#   "dataframe" - tabular data, handy for viewing in a python notebook
#   "csv"       - textual interchange format
#   "json"      - textual interchange format
#   "doc"       - the spaCy document object, for further processing
output_format = "html"

# build an annotator for the chosen vocabular(y|ies)
annotator = VocabularyAnnotator(vocabs=[VocabularyEnum.FISH_ARCHOBJECTS])

# annotate the sample passage and show the result in the requested format
results = annotator.annotateText(
    input_text=test_text,
    output_format=output_format,
)
display(results)

None

In [None]:
# UI to test VocabularyAnnotator on a range of example texts
import ipywidgets as widgets
from IPython.display import display, HTML
from rematch2.VocabularyAnnotator import VocabularyAnnotator
from rematch2.VocabularyEnum import VocabularyEnum
from test_examples_english import test_examples_english

# TODO - choose these as checkboxes in UI?
#vocab_dir = os.path.join(os.path.abspath(""), "rematch2/vocabularies")
#file_path = os.path.join(vocab_dir, "vocab_en_FISH_MONUMENT_TYPES_20210921.json")
#with open(file_path, "r") as f:  # what if file doesn't exist?
    #vocabulary = json.load(f)
    
# annotator shared by the UI below; matches FISH monument type terms only
annotator = VocabularyAnnotator(vocabs=[VocabularyEnum.FISH_MONUMENT_TYPES])



def run(btn):
    """Click handler for the "Go" button: annotate the selected example text.

    Finds the first entry of test_examples_english whose "id" equals the
    value of the test dropdown, annotates its "text" using the format chosen
    in the format dropdown, and displays the result inside the output widget.
    Nothing is displayed when no entry matches.

    btn: the widget that triggered the click (not used).
    """
    # ask the output widget to drop whatever a previous run displayed
    output.clear_output(wait=True)

    # first example whose id matches the dropdown selection (None if absent)
    wanted_id = dropdown_tests.value
    selected_test = next(
        (example for example in test_examples_english
         if example.get("id", "") == wanted_id),
        None,
    )
    if not selected_test:
        return

    # annotate the example text in the chosen output format
    chosen_format = dropdown_format.value
    results = annotator.annotateText(
        input_text=selected_test.get("text", ""),
        output_format=chosen_format,
    )

    # "html" results are wrapped in HTML() before display; others are shown as-is
    with output:
        display(HTML(results) if chosen_format == "html" else results)
        
# define test selector dropdown UI component
# each option is a (label, value) pair: the example's source is the label
# shown to the user, the example's id is the value read back in run()
dropdown_tests = widgets.Dropdown(
    options=[(test.get("source", ""), test.get("id", ""))
             for test in test_examples_english],
    description='Test:',
    disabled=False
)

# define output format dropdown UI component
dropdown_format = widgets.Dropdown(
    options=[
        ("HTML", "html"),
        ("Tabular", "dataframe")
    ],
    value="html",
    description="Format:",
    disabled=False
)

# define and display other UI components
button_go = widgets.Button(description="Go")
# named "controls" rather than "input" so the builtin input() is not
# shadowed for the rest of the kernel session
controls = widgets.HBox([dropdown_tests, dropdown_format, button_go])
output = widgets.Output(layout=widgets.Layout(
    overflow='scroll', border='1px solid black', height='500px'))
display(controls, output)

# what to do when the button is clicked
button_go.on_click(run)

HBox(children=(Dropdown(description='Test:', options=(['https://doi.org/10.5284/1100095', 'test1'], ['ADS coll…

Output(layout=Layout(border_bottom='1px solid black', border_left='1px solid black', border_right='1px solid b…