Install spaCy and matplotlib:

In [None]:
# Install spaCy restricted to compatible 3.6.x releases (`~=3.6.0`) together
# with matplotlib; `--quiet` reduces pip's output.
!python -m pip install 'spacy~=3.6.0' matplotlib --quiet

Clone the project repository and install its requirements:

In [None]:
# edit the line below to choose a different working directory
# NOTE(review): a bare `%cd` is expected to switch to the home directory — confirm for your IPython version
%cd
# clone the `litbank` project from the workshop repository into ./litbank
!python -m spacy project clone litbank -r https://github.com/adrianeboyd/workshop-dh2023
%cd litbank
# install the Python dependencies listed in the cloned project's requirements.txt
!python -m pip install -r requirements.txt

In [None]:
# Run the project's `download-lg` command.
# NOTE(review): presumably this downloads the `en_core_web_lg` model that is
# loaded further down — confirm against the project's project.yml.
!python -m spacy project run download-lg

Use `spacy debug data` to get an overview of the corpus annotation:

In [None]:
# The `--components.*` and `--paths.*` arguments override values from the
# config file on the command line: the spans key is set to "entities" and the
# train/dev paths point at the entity-span corpora.
!python -m spacy debug data configs/spancat_ngram_lg.cfg --components.spancat.spans_key entities --paths.train corpus/entity_spans-train.spacy --paths.dev corpus/entity_spans-dev.spacy

In [None]:
import spacy
from spacy.displacy import render
from spacy.tokens import DocBin
# Blank English pipeline; in the cells below only its vocab is used, to
# deserialize the docs stored in the training DocBin.
nlp_blank = spacy.blank("en")

Re-run this cell to start again at the first doc:

In [None]:
# `docs` is consumed one doc at a time with next() in the following cell, so
# re-creating it here resets the walk-through to the first doc.
docs = DocBin().from_disk("corpus/entity_spans-train.spacy").get_docs(nlp_blank.vocab)

Re-run the following cell to step through the docs in the dataset one at a time:

In [None]:
# display the gold annotation with displacy

# advance the `docs` iterator by one doc each time this cell is run
doc = next(docs)
# one highlight colour per entity label
colors = dict(
    LOC="#42D1FF",
    GPE="#DB536B",
    PER="#FAB400",
    FAC="#94356F",
    ORG="#FF6A16",
    VEH="#319621",
)
render(
    doc,
    style="span",
    jupyter=True,
    options=dict(spans_key="entities", colors=colors),
)

In [None]:
# show the distribution of span lengths

from collections import Counter
import matplotlib.pyplot as plt

# materialise the whole training corpus so it can be counted in one pass
docs = list(
    DocBin()
    .from_disk("corpus/entity_spans-train.spacy")
    .get_docs(nlp_blank.vocab)
)
# tally how many gold entity spans there are for each span length (len(span))
span_lengths = Counter(
    len(entity)
    for train_doc in docs
    for entity in train_doc.spans["entities"]
)
plt.bar(span_lengths.keys(), span_lengths.values())
plt.xlabel("N-Gram Length")
plt.ylabel("Count")
plt.show()

In [None]:
from spacy.util import registry
from spacy.scorer import PRFScore

nlp_core = spacy.load("en_core_web_lg")

# gold dev docs, kept as loaded from disk
docs = list(
    DocBin()
    .from_disk("corpus/entity_spans-dev.spacy")
    .get_docs(nlp_core.vocab)
)
# a second, independently loaded copy of the dev docs that is run through the
# en_core_web_lg pipeline
parsed_docs = list(
    nlp_core.pipe(
        DocBin()
        .from_disk("corpus/entity_spans-dev.spacy")
        .get_docs(nlp_core.vocab)
    )
)

In [None]:
def evaluate_suggester(suggester, docs, processed_docs):
    """Print precision and recall of a span suggester against gold entity spans.

    Args:
        suggester: Callable applied to ``processed_docs``. Its result must
            expose ``lengths`` (number of suggested spans per doc) and
            ``dataXd`` (one row per suggested span).
        docs: Gold docs; the reference spans are read from
            ``doc.spans["entities"]``.
        processed_docs: Docs handed to the suggester. Assumed to be in the
            same order as ``docs`` so that doc indices line up.

    Returns:
        None. The span counts, precision and recall are printed.
    """
    # Gold spans identified by (doc index, start, end).
    gold_span_tuples = []
    for doc_index, gold_doc in enumerate(docs):
        for gold_span in gold_doc.spans["entities"]:
            gold_span_tuples.append((doc_index, gold_span.start, gold_span.end))

    suggestions = suggester(processed_docs)

    # The rows of `dataXd` for all docs are stored back to back; `lengths`
    # says how many consecutive rows belong to each doc.
    span_tuples = []
    first_row = 0
    for doc_index, n_rows in enumerate(suggestions.lengths):
        for row in range(first_row, first_row + n_rows):
            span_tuples.append((doc_index, *suggestions.dataXd[row]))
        first_row += n_rows

    # Score on sets, while the printed counts below use the raw lists.
    prf = PRFScore()
    prf.score_set(set(span_tuples), set(gold_span_tuples))
    print("gold:        ", len(gold_span_tuples))
    print("suggestions: ", len(span_tuples))
    print("precision:   ", prf.precision)
    print("recall:      ", prf.recall)

Evaluate suggesters:

In [None]:
# evaluate the ngram suggester

# n-gram sizes 1 through 8
suggester = registry.misc.get("spacy.ngram_suggester.v1")(list(range(1, 9)))
evaluate_suggester(suggester, docs, parsed_docs)

In [None]:
# evaluate the subtree suggester
# NOTE(review): the "spacy-experimental." prefix suggests this registered
# function comes from the spacy-experimental package — confirm it is installed
# via the project's requirements.

suggester = registry.misc.get("spacy-experimental.subtree_suggester.v1")()
evaluate_suggester(suggester, docs, parsed_docs)

In [None]:
# evaluate the noun chunk suggester
# NOTE(review): the "spacy-experimental." prefix suggests this registered
# function comes from the spacy-experimental package — confirm it is installed
# via the project's requirements.

suggester = registry.misc.get("spacy-experimental.chunk_suggester.v1")()
evaluate_suggester(suggester, docs, parsed_docs)

In [None]:
# install a spancat pipeline with a span finder
# (a prebuilt wheel from the workshop repository's v0.0.1 release; it provides
# the `en_litbank_spancat_span_finder_lg` package loaded in the next cell)

!python -m pip install https://github.com/adrianeboyd/workshop-dh2023/releases/download/v0.0.1/en_litbank_spancat_span_finder_lg-0.0.1-py3-none-any.whl

In [None]:
# evaluate the span finder suggester

# load the installed pipeline with its spancat component excluded
nlp_finder = spacy.load(
    "en_litbank_spancat_span_finder_lg",
    exclude=["spancat"],
)
# gold dev docs, deserialized with this pipeline's vocab
docs = list(
    DocBin()
    .from_disk("corpus/entity_spans-dev.spacy")
    .get_docs(nlp_finder.vocab)
)
# run the pipeline on the raw text of each gold doc
processed_docs = list(nlp_finder.pipe(gold_doc.text for gold_doc in docs))
# suggester configured to read the spans stored under the "entities" key
suggester = registry.misc.get("spacy.preset_spans_suggester.v1")(
    spans_key="entities"
)
evaluate_suggester(suggester, docs, processed_docs)