Install spaCy and matplotlib:

In [None]:
# Install the pinned spaCy pre-release together with matplotlib; `--quiet` keeps pip's output short.
!python -m pip install spacy==3.6.0.dev1 matplotlib --quiet

Clone project repository and install requirements:

In [None]:
# edit the `%cd` line below to choose a different working directory
# (with no argument, `%cd` should switch to the home directory — per the IPython docs)
%cd
# clone the `litbank` project from the workshop repository, then move into it
!python -m spacy project clone litbank -r https://github.com/adrianeboyd/workshop-dh2023
%cd litbank
# install the Python requirements listed by the cloned project
!python -m pip install -r requirements.txt

In [None]:
# Temporary workaround: the project's `download-vectors` command is disabled here and the
# en_core_web_lg 3.5.0 wheel is installed directly instead (`--no-deps` skips its dependencies).
#!python -m spacy project run download-vectors
# TODO: replace after v3.6.0 release
!python -m pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.5.0/en_core_web_lg-3.5.0-py3-none-any.whl --no-deps

Use `spacy debug data` to get an overview of the corpus annotation:

In [None]:
# Run `spacy debug data` on the spancat config; the command-line overrides set the spans key
# to `entities` and point the train/dev paths at the entity-span corpora.
!python -m spacy debug data configs/spancat.cfg --components.spancat.spans_key entities --paths.train corpus/entity_spans-train.spacy --paths.dev corpus/entity_spans-dev.spacy

In [None]:
import spacy
from spacy.displacy import render
from spacy.tokens import DocBin
# Blank English pipeline; its vocab is passed to `DocBin.get_docs` when the corpus is loaded.
nlp = spacy.blank("en")

Re-run this cell to start again at the first doc:

In [None]:
# `docs` is an iterator over the training docs: it is advanced one doc at a time with
# `next(docs)`, so re-creating it here rewinds to the first doc.
docs = DocBin().from_disk("corpus/entity_spans-train.spacy").get_docs(nlp.vocab)

Re-run the following cell to step through the docs in the dataset one at a time:

In [None]:
# Visualize the gold span annotation of the next doc with displaCy.
# Each run advances the `docs` iterator by one; `next` raises StopIteration once it is exhausted.
doc = next(docs)
# One display color per entity label.
colors = {
    "LOC": "#42D1FF",
    "GPE": "#DB536B",
    "PER": "#FAB400",
    "FAC": "#94356F",
    "ORG": "#FF6A16",
    "VEH": "#319621",
}
# Render the spans stored under the "entities" key, inline in the notebook.
render(doc, style="span", options={"spans_key": "entities", "colors": colors}, jupyter=True)

In [None]:
from collections import Counter
import matplotlib.pyplot as plt
# Materialize the training corpus as a list so it can be traversed more than once.
docs = list(
    DocBin().from_disk("corpus/entity_spans-train.spacy").get_docs(nlp.vocab)
)
# Tally how many gold "entities" spans there are of each token length.
span_lengths = Counter(
    map(len, (entity for gold_doc in docs for entity in gold_doc.spans["entities"]))
)
# Bar chart: span length on the x-axis, number of spans of that length on the y-axis.
plt.bar(list(span_lengths), list(span_lengths.values()))
plt.xlabel("N-Gram Length")
plt.ylabel("Count")
plt.show()

In [None]:
# Install spacy-experimental, whose `spacy-experimental.*` suggesters are looked up from the
# registry in later cells.
# NOTE(review): `--no-deps` is presumably there so pip leaves the pinned spaCy pre-release alone — confirm.
!python -m pip install 'spacy-experimental~=0.6.2' --no-deps

In [None]:
from spacy.util import registry
from spacy.scorer import PRFScore
# Rebind `nlp` from the blank pipeline to the full en_core_web_lg pipeline and run it over
# the already-loaded training docs.
# NOTE(review): `docs` were deserialized with the blank pipeline's vocab, not this pipeline's —
# confirm that running the loaded components over them is intended.
nlp = spacy.load("en_core_web_lg")
parsed_docs = list(nlp.pipe(docs))

In [None]:
def evaluate_suggester_recall(suggester, parsed_docs):
    """Measure how many gold spans a span suggester proposes.

    Args:
        suggester: Callable that takes the list of docs and returns an object
            with per-doc counts in ``.lengths`` and one ``(start, end)`` row per
            suggested span in ``.dataXd``, with the rows of all docs concatenated.
        parsed_docs: Docs whose gold spans are stored under
            ``doc.spans["entities"]``.

    Returns:
        A ``(n_suggested, recall)`` tuple: the total number of suggested spans
        and the recall of the gold spans among the suggestions.
    """
    gold_span_tuples = {
        (doc_idx, span.start, span.end)
        for doc_idx, doc in enumerate(parsed_docs)
        for span in doc.spans["entities"]
    }
    suggestions = suggester(parsed_docs)
    span_tuples = []
    # `dataXd` is a single flat array covering every doc, so each doc's rows
    # start where the previous doc's rows ended. Track that running offset
    # instead of restarting at row 0 for every doc.
    offset = 0
    for doc_idx, length in enumerate(suggestions.lengths):
        length = int(length)
        for start, end in suggestions.dataXd[offset:offset + length]:
            # Plain ints so the tuples compare/hash like the gold tuples.
            span_tuples.append((doc_idx, int(start), int(end)))
        offset += length
    prf = PRFScore()
    prf.score_set(set(span_tuples), gold_span_tuples)
    return len(span_tuples), prf.recall

In [None]:
# Look up the registered n-gram suggester factory and configure it with sizes 1 through 8.
suggester = registry.misc.get("spacy.ngram_suggester.v1")([1, 2, 3, 4, 5, 6, 7, 8])
# Cell output: (number of suggested spans, recall of the gold spans).
evaluate_suggester_recall(suggester, parsed_docs)

In [None]:
# Subtree suggester registered under the `spacy-experimental` prefix (needs that package installed).
# NOTE(review): presumably relies on the dependency parse added by en_core_web_lg — confirm.
suggester = registry.misc.get("spacy-experimental.subtree_suggester.v1")()
# Cell output: (number of suggested spans, recall of the gold spans).
evaluate_suggester_recall(suggester, parsed_docs)

In [None]:
# Chunk suggester registered under the `spacy-experimental` prefix (needs that package installed).
# NOTE(review): presumably relies on the noun chunks available after parsing — confirm.
suggester = registry.misc.get("spacy-experimental.chunk_suggester.v1")()
# Cell output: (number of suggested spans, recall of the gold spans).
evaluate_suggester_recall(suggester, parsed_docs)